def process(dataset, file_names=None):
    """Parse every raw file of *dataset* into its cleaned form.

    Reads files from ``data/raw/<dataset>/``, writes cleaned output to
    ``data/clean/<dataset>/``, and pickles a mapping of output basename ->
    question ids to ``data/info/<dataset>/qids.pkl``.

    Args:
        dataset: dataset name; also the name of the parser function looked
            up in this module via ``getattr`` — TODO confirm every dataset
            has a same-named module-level callable.
        file_names: optional subset of files, forwarded to ``list_files``.
    """
    files_counter = {}
    input_path = f'data/raw/{dataset}/'
    output_path = f'data/clean/{dataset}/'
    # --byte flag (module-level args) switches between text and binary reads
    mode = 'rb' if args.byte else 'r'
    for inp_file, out_file in list_files(input_path, output_path, file_names):
        with open(inp_file, mode) as ifile:
            # the dataset name doubles as the parser's function name
            data = getattr(sys.modules[__name__], dataset)(ifile)
            data.to_file(out_file)
            files_counter[basename(out_file)] = data.qids
    # context manager so the pickle handle is flushed and closed
    # (the original passed a bare open() to pickle.dump and leaked it)
    with open(create_path(f'data/info/{dataset}/qids.pkl'), 'wb') as pfile:
        pickle.dump(files_counter, pfile)
def trainer(name, config, dataset):
    """Train a reranking model on *dataset* and report metrics.

    Creates (or reuses) an experiment directory, saves the config, trains
    the model described by *config*, writes dev/test predictions, and dumps
    metrics both at the fixed 0.5 threshold and at the threshold that
    maximizes F1 on the validation set.

    Args:
        name: experiment directory; when ``None`` a timestamped directory
            under ``results`` is created instead.
        config: dict with ``seed``, ``embeddings``, ``model``, ``parser``,
            ``train_as``, ``optimizer``, ``patience``, ``batch_size`` keys.
        dataset: dataset name used to locate embeddings and parsed splits.
    """
    if name is None:
        experiment_path = train_utils.timestamp_dir("results")
    else:
        experiment_path = f'{name}'
    with open(utils.create_path(f'{experiment_path}/config.json'), 'w') as conff:
        json.dump(config, conff)
    logging.info(f'saving experiment in: {experiment_path}')
    train_utils.set_seed(config['seed'])

    logging.info('Loading embeddings..')
    vocab = Embeddings(f"data/embs/{dataset}/{config['embeddings']}.txt")

    logging.info('Initializing Net..')
    # fall back to CPU so the script also runs on machines without a GPU
    # (the original hard-coded 'cuda' and crashed on CPU-only hosts)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = models.Model.by_name(config['model']['name'])(
        config['model']['params'], vocab, device).to(device)
    text_parser = models.Parser.by_name(config['parser'])(vocab)
    train_model = models.Trainer.by_name(config['train_as'])(text_parser, model)

    train_data = QAdataset(f'data/parsed/{dataset}/train.json')
    valid_data = QAdataset(f'data/parsed/{dataset}/dev.json')
    test_data = QAdataset(f'data/parsed/{dataset}/test.json')

    # optimizer class resolved by name from torch.optim (e.g. 'Adam')
    optimizer = getattr(torch.optim, config['optimizer']['name'])(
        model.trainable_parameters(), **config['optimizer']['params'])
    train_model.fit(
        train_data,
        optimizer,
        validation=valid_data,
        save_point=f"{experiment_path}/{config['model']['name']}_test.pt",
        patience=config['patience'],
        batch_size=config['batch_size'],
        intervals=100)

    valid_pred = train_model.predict(valid_data)
    valid_pred.to_file(f"{experiment_path}/dev.json")
    test_pred = train_model.predict(test_data)
    test_pred.to_file(f"{experiment_path}/test.json")

    # report at the default 0.5 threshold, then at the best validation
    # threshold (the original duplicated this whole sequence verbatim)
    _report(experiment_path, valid_pred, test_pred, 0.5, '0.5')
    best_th = _best_threshold(valid_pred)
    _report(experiment_path, valid_pred, test_pred, best_th, 'best')


def _best_threshold(valid_pred):
    """Return the threshold in {0.01 .. 0.99} with the highest validation F1.

    Ties keep the lowest threshold, matching the original strict-> search.
    """
    max_f1 = 0
    best_th = 0
    for i in range(1, 100):
        # keep `1 / 100 * i` (not i / 100) so float values match the original
        th = 1 / 100 * i
        f1 = reranking.f1(valid_pred, th)
        if f1 > max_f1:
            max_f1 = f1
            best_th = th
    return best_th


def _report(experiment_path, valid_pred, test_pred, th, suffix):
    """Evaluate both splits at threshold *th*, dump metrics as JSON and log.

    Files are written as ``{valid,test}_metrics_<suffix>.json`` inside
    *experiment_path*.
    """
    valid_metrics = reranking.evaluate(valid_pred, th)
    test_metrics = reranking.evaluate(test_pred, th)
    with open(f"{experiment_path}/valid_metrics_{suffix}.json", 'w') as f:
        json.dump(valid_metrics, f)
    with open(f"{experiment_path}/test_metrics_{suffix}.json", 'w') as f:
        json.dump(test_metrics, f)
    # 'threshold' fixes the 'treshold' typo in the original log messages
    logging.info(
        f'Results on the valid set at threshold {th}:\n{train_utils.print_metrics(valid_metrics)}'
    )
    logging.info(
        f'Results on the test set at threshold {th}: \n{train_utils.print_metrics(test_metrics)}'
    )
def to_file(self, filename):
    """Serialize the collection to *filename*, one JSON object per line."""
    with open(create_path(filename), 'w') as sink:
        for record in self.iterator(lambda item: f"{item.to_json()}\n"):
            sink.write(record)
# Help text for --only_train was a copy-paste of --lower's ("lowercased");
# it now describes what the flag actually does.
parser.add_argument("--only_train", dest='onlytrain',
                    help="build the vocabulary from the train split only",
                    action='store_true')
parser.add_argument('--lower', dest='lower', help="lowercased",
                    action='store_true')
parser.add_argument('--top_n', dest='n',
                    help="the max number of words to keep", type=int)
args = parser.parse_args()


def process(dataset, lower=True):
    """Count word frequencies over the parsed splits of *dataset*.

    Args:
        dataset: dataset name; parsed files live in ``data/parsed/<dataset>/``.
        lower: forwarded to ``giff_words`` — presumably lowercases tokens;
            TODO confirm against giff_words.

    Returns:
        collections.Counter mapping word -> frequency.
    """
    inp_path = f'data/parsed/{dataset}/'
    # `corpus` instead of the original loop name `dataset`, which shadowed
    # the function parameter
    corpora = [
        QAdataset(inp_file)
        for inp_file, _ in list_files(inp_path, inp_path)
        # --only_train restricts counting to train.json
        if not args.onlytrain or inp_file.endswith('train.json')
    ]
    return Counter(
        chain.from_iterable(giff_words(corpus, lower) for corpus in corpora))


vocabulary = process(args.dataset, args.lower)
with open(create_path(f'data/info/{args.dataset}/vocab.tsv'), 'w') as ofile:
    top_n = vocabulary.most_common(args.n)
    logging.info(f'{args.dataset}: {len(top_n)} words')
    # reuse top_n — the original recomputed most_common(args.n) a second time
    for word, freq in top_n:
        ofile.write(f'{word}\t{freq}\n')
    # NOTE(review): this chunk begins inside the first branch of an `if`
    # whose header lies outside this view — indentation reconstructed
    # accordingly; confirm against the full file.
    logging.info(f'Stats Mean {mean} Std {std} Dim {dim}')
    logging.info(f'Original Vocab: {len(vocab)}')
    # this branch's w2v exposes .vocab (gensim-style model?); the else
    # branch's is a plain dict — TODO confirm the loader types
    logging.info(f'Embedding Vocab: {len(w2v.vocab)}')
    filtered_w2v = dict(filter_embeddings(vocab, w2v))
    logging.info(f'Filtered Vocab: {len(filtered_w2v)}')
    # PAD is an all-zero vector; UNK is sampled to match the embedding
    # matrix's mean/std so it looks like a typical vector
    filtered_w2v['PAD'] = np.zeros((dim, ))
    filtered_w2v['UNK'] = np.random.normal(mean, std, (dim, ))
else:
    # Alternative loading path: restrict the embedding lookup to the words
    # actually present in the vocabulary before loading.
    vocab_set = set(word for word, _ in vocab)
    w2v = load_w2v_fast(f'embs/{args.embeddings}', vocab_set)
    mean, std, dim = emb_stats(w2v)
    logging.info(f'Stats Mean {mean} Std {std} Dim {dim}')
    logging.info(f'Original Vocab: {len(vocab)}')
    logging.info(f'Embedding Vocab: {len(w2v)}')
    filtered_w2v = dict(filter_embeddings(vocab, w2v))
    logging.info(f'Filtered Vocab: {len(filtered_w2v)}')
    # same PAD/UNK convention as the branch above
    filtered_w2v['PAD'] = np.zeros((dim, ))
    filtered_w2v['UNK'] = np.random.normal(mean, std, (dim, ))

logging.info('Saving file...')
# [:-4] strips the source file's 4-char extension (e.g. '.bin', '.txt')
# before writing the filtered embeddings in text word2vec format
with open(
        create_path(
            f'data/embs/{args.dataset}/{args.embeddings[:-4]}.txt'),
        'w') as ofile:
    # word2vec text header: vocabulary size and embedding dimensionality
    ofile.write(f'{len(filtered_w2v)} {dim}\n')
    for word, emb in filtered_w2v.items():
        ofile.write(f"{word} {' '.join(str(val) for val in emb)}\n")