Example #1
def main():
    global model_path, train_dataset_path, test_dataset_path

    model = {}

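    # interactive menu loop: train, save, load, or test the model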
    while True:
        print("======= MENU =======")
        print("\t1. Treinar Modelo")
        print("\t2. Salvar Modelo")
        print("\t3. Carregar Modelo")
        print("\t4. Testar Modelo")
        print("\t0. Sair")
        opt = input()
        break_lines()
        if opt == '1':
            train_dataset = utils.parse_dataset(train_dataset_path)
            model = train(train_dataset)
        elif opt == '2':
            if not model:
                print("> Você precisa ter um modelo treinado para isso!")
            else:
                save(model, model_path)
        elif opt == '3':
            model = load(model_path)
        elif opt == '4':
            if not model:
                print("> Você precisa ter um modelo treinado para isso!")
            else:
                test_dataset = utils.parse_dataset(test_dataset_path)
                test_main(model, test_dataset)
        elif opt == '0':
            break
        else:
            print("- OPÇÃO INVÁLIDA -")
Example #2
def get_data(config):
    df, sparse_cols, num_train = utils.get_dataset_df(config.data_dir,
                                                      debug=config.is_debug)
    print('cols:', ', '.join(df.columns))
    print('Sparse_cols:', sparse_cols)
    ignore_cols = ['target']
    dense_cols = [
        col for col in df.columns if col not in sparse_cols + ignore_cols
    ]
    sparse_cols.sort()
    dense_cols.sort()
    # print('Get embeddings ....')
    # # embedding = {col:utils.binary_embedding(df[col].max()+1) for col in sparse_cols}
    # embedding = utils.load_embedding(df, sparse_cols, config.data_dir, method=config.embedding_method)
    # print('Processing embeddings ....')
    # x = utils.process_embedding(df, dense_cols, embedding)

    feat_dict, feat_dim = utils.gen_feat_dict(df, sparse_cols, ignore_cols)
    Xi, Xv = utils.parse_dataset(df, feat_dict, sparse_cols, ignore_cols)

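    # index rows by target value, then slice Xi/Xv into negative, positive, and test splits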
    idx0 = df[df['target'] == 0].index.values
    idx1 = df[df['target'] == 1].index.values
    del df
    gc.collect()

    x_neg_train_i = Xi[idx0, :]
    x_pos_train_i = Xi[idx1, :]
    x_neg_train_v = Xv[idx0, :]
    x_pos_train_v = Xv[idx1, :]
    x_test_i = Xi[num_train:]
    x_test_v = Xv[num_train:]
    x_neg_train = (x_neg_train_i, x_neg_train_v)
    x_pos_train = (x_pos_train_i, x_pos_train_v)
    x_test = (x_test_i, x_test_v)
    return x_neg_train, x_pos_train, x_test, feat_dim
Example #3
    def __init__(self, audio_conf, manifest_filepath, metadata_file_path, labels, normalize=False, speed_volume_perturb=False, spec_augment=False):
        """
        Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
        a comma. Each new line is a different sample. Example below:

        /path/to/audio.wav,/path/to/audio.txt
        ...

        :param audio_conf: Dictionary containing the sample rate, window type, and window length/stride in seconds
        :param metadata_file_path: Path to manifest csv as described above
        :param labels: String containing all the possible characters to map to
        :param normalize: Apply standard mean and deviation normalization to audio tensor
        :param speed_volume_perturb (default False): Apply random tempo and gain perturbations
        :param spec_augment (default False): Apply simple spectral augmentation to mel spectrograms
        """

        ids = parse_dataset(metadata_file_path, manifest_filepath)

        self.ids = ids
        self.size = len(ids)
        self.labels_map = {label: i for i, label in enumerate(labels)}
        super(SpectrogramDataset, self).__init__(audio_conf, normalize, speed_volume_perturb, spec_augment)
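
A minimal construction sketch for the dataset class above; the audio_conf keys, file paths, and label string below are illustrative assumptions, not values taken from the original snippet:

audio_conf = {'sample_rate': 16000,   # assumed keys: sample rate, window type,
              'window_size': 0.02,    # and window length/stride in seconds
              'window_stride': 0.01,
              'window': 'hamming'}
labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
dataset = SpectrogramDataset(audio_conf,
                             manifest_filepath='manifests/train.csv',      # hypothetical path
                             metadata_file_path='manifests/metadata.csv',  # hypothetical path
                             labels=labels,
                             normalize=True)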
Example #4
import json
import logging

from sklearn.metrics import average_precision_score

from baseline import run as run_baseline
from naive_bayes import run as run_nb
from random_forest import run as run_rf
from SVM_classification import run as run_svm
from fully_connected import run as run_nnet

from utils import parse_dataset
from utils import get_training_set
from utils import run_with_cv

if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    with open('positive_set.json', 'r') as f:
        positive_set, positive_meta = parse_dataset(json.load(f))

    with open('negative_set.json', 'r') as f:
        negative_set, negative_meta = parse_dataset(json.load(f))

    seed = 1
    methods = [
        ('baseline', run_baseline, 'navy'),
        ('naive bayes', run_nb, 'turquoise'),
        ('random forest', run_rf, 'darkorange'),
        ('svm', run_svm, 'cornflowerblue'),
        ('neural net', run_nnet, 'teal'),
    ]

    X, y, meta = get_training_set(positive_set, positive_meta, negative_set,
                                  negative_meta, seed)
Example #5
def main(config=None):

    results = {}
    if config is None:
        # create instance of config
        config = Config()
    eng_path = os.path.join(
        'parsed_data_lowercased',
        'eng_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    ger_path = os.path.join(
        'parsed_data_lowercased',
        'ger_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    ned_path = os.path.join(
        'parsed_data_lowercased',
        'ned_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    spa_path = os.path.join(
        'parsed_data_lowercased',
        'esp_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    data_filepaths = [
        eng_path,
        ned_path,
        # spa_path,
        ger_path,
    ]
    for data_filepath in data_filepaths:
        # get dataset
        encoding = 'utf-8'
        data_laser, pad_len = parse_dataset_laser(data_filepath,
                                                  config.label_to_idx,
                                                  config.word_to_idx,
                                                  pos_target=config.pos_target,
                                                  encoding=encoding)
        data, pad_len = parse_dataset(data_filepath,
                                      config.label_to_idx,
                                      config.word_to_idx,
                                      pos_target=config.pos_target,
                                      encoding=encoding)

        #####################################################################
        # SETUP
        #####################################################################
        subfolder = 'POS' if config.pos_target else 'NER'
        langfolder = config.langfolder
        base_path = os.path.join('saves_lc', langfolder, subfolder,
                                 'LASEREmbedderBase.pt')
        base_gru_path = os.path.join('saves_lc', langfolder, subfolder,
                                     'LASEREmbedderBaseGRU.pt')
        i_path = os.path.join('saves_lc', langfolder, subfolder,
                              'LASEREmbedderI.pt')
        iii_path = os.path.join('saves_lc', langfolder, subfolder,
                                'LASEREmbedderIII.pt')
        elmo_path = os.path.join('saves_lc', langfolder, subfolder,
                                 'LASEREmbedderIIIELMo.pt')

        paths = [
            # base_path,
            # base_gru_path,
            i_path,
            # iii_path,
            # elmo_path
        ]

        embedders = [
            # LASEREmbedderBase, #(config.model_path, pad_len),
            # LASEREmbedderBaseGRU, #(config.model_path, pad_len),
            LASEREmbedderI,  #(config.model_path),
            # LASEREmbedderIII, #(config.model_path),
            # LASEREmbedderIIIELMo, #(config.model_path)
        ]

        use_laser = [
            # False,
            # False,
            True,
            # True,
            # True
        ]
        lang_results = {}

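        # evaluate each selected embedder on this language's test set and record its F1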
        for embedder, path, d in zip(embedders, paths, use_laser):
            emb = embedder(config.model_path,
                           bpe_pad_len=pad_len,
                           static_lstm=config.static_lstm,
                           drop_before=config.drop_before_laser,
                           drop_after=config.drop_after_laser,
                           drop_within=config.drop_within_lstm)
            print(path)
            dset = data_laser if d else data
            f1 = eval_model_dataset(config, emb, dset, pad_len, path, d)
            lang_results[emb.__class__.__name__] = f1
            del emb
            empty_cache()

        results[data_filepath] = lang_results
    print(results)
    return results, config
Example #6
import os
import pandas as pd
import utils

KAGGLE_DATA_ROOT = os.path.abspath('/Users/travisclarke/kaggle-data')
ANNOTATIONS_FILE = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_labels.csv')
TRAIN_DIR = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_images')
TRAIN_ANNOT_DIR = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_annots')

# load and parse images and annotations
raw_annotations = pd.read_csv(ANNOTATIONS_FILE)
images = utils.parse_dataset(raw_annotations)

utils.cvt_annots_to_xml(images, TRAIN_ANNOT_DIR, TRAIN_DIR)
Example #7
def main(config=None, embedders_to_train=None):
    # create instance of config
    if config is None:
        config = Config()
    if embedders_to_train is None:
        embedders_to_train = [
            # 'LASEREmbedderBase',
            # 'LASEREmbedderBaseGRU',
            'LASEREmbedderI',
            # 'LASEREmbedderIII',
            # 'LASEREmbedderIIIELMo',
        ]

    encoding = 'utf-8'
    static_lstm = False

    # parse datasets
    train_laser, tr_pad_len = parse_dataset_laser(config.filename_train,
                                                  config.label_to_idx,
                                                  config.word_to_idx,
                                                  pos_target=config.pos_target,
                                                  encoding=encoding)
    dev_laser, dev_pad_len = parse_dataset_laser(config.filename_dev,
                                                 config.label_to_idx,
                                                 config.word_to_idx,
                                                 pos_target=config.pos_target,
                                                 encoding=encoding)
    # else:
    train_base, tr_pad_len = parse_dataset(config.filename_train,
                                           config.label_to_idx,
                                           config.word_to_idx,
                                           pos_target=config.pos_target,
                                           encoding=encoding)
    dev_base, dev_pad_len = parse_dataset(config.filename_dev,
                                          config.label_to_idx,
                                          config.word_to_idx,
                                          pos_target=config.pos_target,
                                          encoding=encoding)
    # build model
    embedder_base = LASEREmbedderBase  #(config.model_path, tr_pad_len)
    embedder_base_gru = LASEREmbedderBaseGRU  #(config.model_path, tr_pad_len)
    embedderI = LASEREmbedderI  #(config.model_path, static_lstm = False)
    embedderIII = LASEREmbedderIII  #(config.model_path, static_lstm = False)
    embedderIIIElmo = LASEREmbedderIIIELMo

    embedders = {
        'LASEREmbedderBase': embedder_base,
        'LASEREmbedderBaseGRU': embedder_base_gru,
        'LASEREmbedderI': embedderI,
        'LASEREmbedderIII': embedderIII,
        'LASEREmbedderIIIELMo': embedderIIIElmo
    }
    # model_name = {
    #     embedder_base:'LASEREmbedderBase',
    #     embedder_base_gru:'LASEREmbedderBaseGRU',
    #     embedderI:'LASEREmbedderI',
    #     embedderIII:'LASEREmbedderIII',
    #     embedderIIIElmo:'LASEREmbedderIIIELMo',
    # }

    use_laser = {
        'LASEREmbedderBase': False,
        'LASEREmbedderBaseGRU': False,
        'LASEREmbedderI': True,
        'LASEREmbedderIII': True,
        'LASEREmbedderIIIELMo': True
    }

    for embedder in embedders_to_train:

        # set output filename
        laser = use_laser[embedder]
        config.set_model_name(embedder)
        config.use_laser = laser
        # config.set_params(laser)
        train = train_laser if laser else train_base
        dev = dev_laser if laser else dev_base
        model = embedders[embedder](config.model_path,
                                    bpe_pad_len=tr_pad_len,
                                    static_lstm=static_lstm,
                                    drop_before=config.drop_before_laser,
                                    drop_after=config.drop_after_laser,
                                    drop_within=config.drop_in_laser)

        # try:
        fit(config, model, tr_pad_len, dev_pad_len, train, dev)
        del model
        empty_cache()
        time.sleep(60)  # free up CUDA memory
Example #8
parser.add_argument('--graph_path',
                    help='Path to pickle file of citation graph')
parser.add_argument('--year_dict_path',
                    help='Path to pickle file of Publication year Dictionary')

args = parser.parse_args()
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S',
                    level=logging.INFO,
                    handlers=[logging.StreamHandler(sys.stdout)])

if __name__ == '__main__':
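    # build the citation graph: either parse the raw dataset or load a pickled copy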
    global_citation_graph = ''
    if args.graph_path is None:
        logging.info('Parsing Dataset')
        global_citation_graph = utils.parse_dataset(args.dataset)
        utils.dump_file(args.dumps, 'global_citation_graph_full',
                        global_citation_graph)
    else:
        logging.info('Getting Pickle Graph')
        global_citation_graph = utils.get_pickle_dump(args.graph_path,
                                                      'global_citation_graph')

    paper_year_dict = ' '
    if args.year_dict_path is None:
        logging.info('Parsing Dates from dataset')
        paper_year_dict = utils.parse_dates(args.dataset)
        logging.info('Serialising Paper-Year Dictionary')
        utils.dump_file(args.dumps, 'paper_year_dict', paper_year_dict)
    else:
        logging.info('Unpickling the Date Dictionary')