def create_model(self, fname, max_news=99, n_proc=1, window=5, splits=100):
    name = clean_name(fname)
    model = word2vec.Word2Vec(window=window, workers=n_proc)
    # note: with gensim >= 1.0, each train() call below also requires
    # total_examples and epochs arguments
    if name == 'text8':
        sentences = word2vec.Text8Corpus(os.path.join('res', 'model', 'text8'))
        model.build_vocab(sentences)
        model.train(sentences)
    elif name == 'brown':
        # sentences = word2vec.BrownCorpus(fpath)
        sentences = brown.sents()
        model.build_vocab(sentences)
        model.train(sentences)
    elif name.startswith('news'):
        target_fpath = os.path.join('res', 'model', name + '.txt')
        if not os.path.exists(target_fpath):
            build_news_corpus(name, max_news, n_proc, target_fpath)
        sentences = word2vec.LineSentence(target_fpath)
        model.build_vocab(sentences)
        model.train(sentences)
    # elif name.startswith('wikipedia.deps'):
    #     target_fpath = os.path.join('res', 'model', name + '.txt')
    #     if not os.path.exists(target_fpath):
    #         build_wikipedia_corpus(name, max_news, n_proc, target_fpath)
    elif name.startswith('spanishEtiquetado'):
        target_fpath = os.path.join('res', 'model', name + '.txt')
        if not os.path.exists(target_fpath):
            path = os.path.join('res', 'model', 'spanishEtiquetado')
            max_pos_len = re.search(r'\d+', name)
            if max_pos_len:
                max_pos_len = int(max_pos_len.group(0))
            build_corpus(path, name.endswith('pos'), target_fpath, max_pos_len)
        sentences = word2vec.LineSentence(target_fpath)
        model.build_vocab(sentences)
        model.train(sentences)
    else:
        target_fpath = os.path.join('res', 'model', name + '.txt')
        file_to_lower(target_fpath)
        sentences = word2vec.LineSentence(target_fpath)
        model.build_vocab(sentences)
        model.train(sentences)

    model.save_word2vec_format(os.path.join('res', 'model', fname), binary=fname.endswith('.bin'))
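# Usage sketch: once create_model() has written e.g. res/model/news.bin (the
# file name here is illustrative), the vectors can be reloaded with gensim's
# KeyedVectors.load_word2vec_format, which reads the word2vec text/binary
# format saved above:
#
#     from gensim.models import KeyedVectors
#     vectors = KeyedVectors.load_word2vec_format(
#         os.path.join('res', 'model', 'news.bin'), binary=True)
#     print(vectors.most_similar('house', topn=5))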
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    # target_map = {c: i for i, c in enumerate(['null', 'true'])}
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)

    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    print('Preprocessing done!\nVocab size: {}'.format(len(feature_map)))

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = SeqTrainer(args, model, criterion)

    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # update lr
        trainer.lr_step()

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
            track_list.append({'epoch': epoch, 'loss': epoch_loss,
                               'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss,
                               'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch, 'loss': epoch_loss,
                               'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
def train_model_gensim_cross_validation(authors, label_type, pipeline, config="", token_level="word", verbose=1):
    '''
    Takes a doc2vec model and trains it on the specified corpus.
    Takes a classifier and trains it on the doc2vec document vectors.
    Runs a K-fold cross-validation to evaluate the quality of the overall model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)
    if not labels:
        abort_clean("Could not extract labels")
    if verbose:
        print("Labels extraction succeeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    # load doc2vec conf
    conf = []
    if config:
        conf = load_config(config)["extractors"][0]  # legacy conf files
    if conf and verbose:
        print("loading doc2vec config file from disk :")
        print(" - vector_size = " + str(conf["configuration"]["vector_size"]))
        print(" - window = " + str(conf["configuration"]["window"]))
        print(" - min_count = " + str(conf["configuration"]["min_count"]))

    # load the tokenizer
    tknzr = Tokenizer(token_level)
    if verbose:
        print("Selected token level : " + token_level + "\n")

    # K-fold parameters
    confusion = array([[0 for x in range(len(labels))] for y in range(len(labels))])
    best_f_score = 0
    best_pipeline = None
    best_model = None
    scores_micro = []
    scores_macro = []
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)

    # start K-fold cross-validation
    for train_indices, test_indices in k_fold.split(authors):

        # import gensim lib (heavy load)
        from gensim import models as gensim_models

        # get doc2vec models (distributed memory and distributed bag of words)
        model_dm = get_doc2vec(conf, 1, verbose)
        model_pv = get_doc2vec(conf, 0, verbose)

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors, label_type=label_type, verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # learn the vocabulary (tokenisation of each tweet)
        tweets = list(zip(train_corpus["labels"], train_corpus["tweets"]))
        processed_tweets = []
        idxs = [0 for l in labels]
        for t in tweets:
            prefix = t[0] + "_" + str(idxs[labels.index(t[0])])
            idxs[labels.index(t[0])] += 1
            processed_tweets.append(
                gensim_models.doc2vec.LabeledSentence(words=tknzr.tokenize(t[1]), tags=[prefix]))
        tweets = processed_tweets
        model_dm.build_vocab(tweets)
        model_pv.build_vocab(tweets)

        # train the doc2vec models (documents passed positionally for
        # compatibility across gensim versions)
        shuffle(tweets)
        model_dm.train(tweets, total_examples=model_dm.corpus_count, epochs=100, start_alpha=0.025, end_alpha=0.0025)
        model_dm.delete_temporary_training_data()
        model_pv.train(tweets, total_examples=model_pv.corpus_count, epochs=100, start_alpha=0.025, end_alpha=0.0025)
        model_pv.delete_temporary_training_data()

        # train dataset conversion (doc -> vectors)
        train_vectors = zeros((sum(idxs), model_dm.vector_size * 2))
        train_labels = []
        for i, tag in enumerate(model_dm.docvecs.doctags):
            train_vectors[i] = concatenate((model_dm.docvecs[tag], model_pv.docvecs[tag]), axis=0)
            train_labels.append(tag.split('_')[0])
        train_labels = array(train_labels)

        # train classifier
        pipeline.fit(train_vectors, train_labels)

        # test models
        truthes = []
        predictions = []

        for author in test_authors:
            # test dataset conversion (doc -> vectors)
            tweet_vectors = [concatenate((model_dm.infer_vector(tknzr.tokenize(tweet)),
                                          model_pv.infer_vector(tknzr.tokenize(tweet))), axis=0)
                             for tweet in author["tweets"]]
            author_tmp = {"tweets": tweet_vectors}
            var_classes, var_predictions = predict_author_proba(author=author_tmp, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes, predictions, labels=labels, average="micro")
        score_macro = f1_score(truthes, predictions, labels=labels, average="macro")
        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) + " macro_f1=" + str(score_macro))

        # store for averaging
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_model = [model_dm, model_pv]
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" % (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_model, best_pipeline, scores
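# Usage sketch (the paths, the label key 'gender', and the SVC settings are
# illustrative assumptions, not project defaults):
#
#     from sklearn.pipeline import Pipeline
#     from sklearn.svm import SVC
#     authors = parse_tweets_from_dir(input_dir='data/pan17', output_dir='tmp/',
#                                     label=True, aggregation=100, verbosity_level=1)
#     clf = Pipeline([('clf', SVC(kernel='linear', probability=True))])
#     model, best_pipeline, scores = train_model_gensim_cross_validation(
#         authors=authors, label_type='gender', pipeline=clf,
#         config='config/doc2vec.json', token_level='word')
#     print(scores['mean_score_macro'])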
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor on a specified
    list of parameters. Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("Label type not specified", "expected 'v' or 'g'")
    if not options["hyper-parameters"]:
        abort_clean("hyper parameters not specified")
    if not options["aggregation"]:
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimization parameters
    try:
        params = load_config(options["hyper-parameters"])
    except Exception:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(classifier_str=params["classifier-call"],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)
        optimize_corpus = build_corpus(authors=Authors,
                                       label_type=options["label-type"],
                                       verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(estimator=pipeline,
                                     param_grid=tuning_parameters,
                                     scoring='%s_macro' % score,
                                     fit_params=None,
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     iid=True,
                                     cv=None,
                                     refit=True,
                                     verbose=options["verbosity"],
                                     error_score='raise',
                                     return_train_score=True)

        # Start optimization
        clf_optimizer.fit(optimize_corpus["tweets"], optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            # renamed from `params` to avoid shadowing the config loaded above
            for mean, std, grid_params in zip(means, stds, clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, grid_params))

        # saving results
        save_optimisation_results(grid=clf_optimizer,
                                  output_dir=options["output-dir"],
                                  score=score,
                                  verbose=options["verbosity"])
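# Sketch of a hyper-parameter config consumed by optimize(). Only the keys read
# above ("classifier-call", "features-extractr-call", "scores") are grounded in
# the code; the grid under "parameters" is a hypothetical layout, since the
# real structure depends on get_opt_parameters():
#
#     {
#         "classifier-call": "svm",
#         "features-extractr-call": "tfidf",
#         "scores": ["f1"],
#         "parameters": {"classifier__C": [0.1, 1, 10]}
#     }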
def train_model_cross_validation(authors, label_type, pipeline, verbose=1):
    '''
    Takes a pipeline and trains it on the specified corpus.
    Runs a K-fold cross-validation to evaluate the quality of the model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)
    if not labels:
        abort_clean("Could not extract labels")
    if verbose:
        print("Labels extraction succeeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    confusion = array([[0 for x in range(len(labels))] for y in range(len(labels))])
    best_f_score = 0
    best_pipeline = None
    scores_micro = []
    scores_macro = []

    # start K-fold cross-validation
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)
    for train_indices, test_indices in k_fold.split(authors):

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors, label_type=label_type, verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus, pipeline=pipeline, verbose=0)

        # test model
        truthes = []
        predictions = []
        for author in test_authors:
            var_classes, var_predictions = predict_author_proba(author=author, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes, predictions, labels=labels, average="micro")
        score_macro = f1_score(truthes, predictions, labels=labels, average="macro")
        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) + " macro_f1=" + str(score_macro))

        # store for averaging
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" % (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_pipeline, scores
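# predict_author_proba is defined elsewhere; below is a minimal sketch of the
# aggregation it is assumed to perform (averaging per-tweet class probabilities
# over an author), for reference only. The name is hypothetical:
def predict_author_proba_sketch(author, model):
    # one probability row per tweet, averaged into an author-level profile
    probas = model.predict_proba(author["tweets"])
    mean_proba = probas.mean(axis=0)
    # returns (class labels, one averaged probability per class)
    return list(model.classes_), list(mean_proba)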
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors. Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")
    if not options["features"] and not options["gensim"]:
        abort_clean("Features not specified")
    if not options["classifier"]:
        abort_clean("Classifier not specified")
    if not options["aggregation"]:
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = None
    if not options["gensim"]:
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross-validate the results
    if options["cross-validation"]:
        if options["verbosity"]:
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)
        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" +
                               "-siz_" + str(model[0].vector_size) +
                               "-win_" + str(model[0].window) +
                               "-cnt_" + str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(get_features_extr_name(features_extr) +
                               "+" + get_classifier_name(classifier))
            save_scores(scores=scores,
                        output_dir=options["output-dir"],
                        filename=filename,
                        verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")
        if not options["output-dir"]:
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Save the resulting model
        if options["gensim"]:
            filename = "doc2vec+" + get_classifier_name(classifier)
        else:
            filename = str(get_features_extr_name(features_extr) +
                           "+" + get_classifier_name(classifier))
        save_model(pipeline=pipeline,
                   output_dir=options["output-dir"],
                   filename=filename,
                   verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
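# Minimal options dict accepted by train(); the keys mirror the lookups above,
# the values are illustrative only:
#
#     example_options = {
#         "label-type": "g",               # 'l', 'g' or 'v'
#         "features": ["tfidf"],           # options["features"][0] is passed on
#         "gensim": False,
#         "classifier": ["svm"],           # options["classifier"][0] is passed on
#         "aggregation": 100,
#         "input-dir": "data/pan17",
#         "processed-tweets-dir": "tmp/",
#         "output-dir": "out/",
#         "cross-validation": True,
#         "hyper-parameters": "",
#         "token-level": "word",
#         "verbosity": 1,
#     }
#     train(example_options)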
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    caseless = args.caseless
    batch_size = args.batch_size

    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
    else:
        print('No checkpoint file found: {}'.format(args.load_checkpoint))
        raise OSError

    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=True)
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]

    # preprocessing
    feature_map = checkpoint_file['f_map']
    target_map = checkpoint_file['t_map']
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)

    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)

    # loss
    criterion = utils.build_loss(args)

    # load states
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = SeqTrainer(args, model, criterion)

    if args.cuda:
        model.cuda()

    # prediction
    print('Predicting...')
    y_true, y_pred, att_weights = predict(trainer, test_loader, target_map, cuda=args.cuda)
    assert len(y_pred) == len(test_corpus), 'length of prediction is inconsistent with that of data set'

    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target != pred:
                size = len(tup.sent)
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
                f.write('{}\n\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))

    # attention
    print('Writing attention scores...')
    with open(args.att_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target == pred and target != 'null':
                size = len(tup.sent)
                f.write('{}\n'.format(target))
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
DATABASE = utils.get_file_path(cfg.DATABASE_FILE)
content = help_content.HelpContent(DATABASE)

should_rebuild = False

# ### Dictionary ###
dict_file = utils.get_file_path(cfg.DICT_BACKUP)
# dictionary = corpora.dictionary.Dictionary.load(dict_file)
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
corpus_file = utils.get_file_path(cfg.CORPUS_BACKUP)
# corpus = corpora.MmCorpus(corpus_file)
corpus = utils.build_corpus(dictionary, content, should_rebuild, cfg.CORPUS_BACKUP)

# ### LDA Model ###
# `query` is expected to be defined earlier in the script (e.g. read from user
# input) before it is mapped into the dictionary's bag-of-words space.
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
model = utils.build_model(dictionary, corpus, should_rebuild)
q_vec = model[bow]  # "query vector": list of (topic_id, weight) pairs
topic_details = model.print_topic(max(q_vec, key=lambda item: item[1])[0])

print('Dictionary Size = {}'.format(len(dictionary)))
print('Corpus Size = {}'.format(len(corpus)))
print('Topic Details: ')
print(topic_details)
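# The query flow above, repackaged as a reusable helper (same calls as in the
# script, no new behaviour):
def dominant_topic(query_text, dictionary, model):
    # map the cleaned query into the dictionary's bag-of-words space
    bow = dictionary.doc2bow(utils.get_cleaned_text(query_text.lower()).split())
    q_vec = model[bow]  # list of (topic_id, weight) pairs
    # keep the topic with the largest weight and describe it
    best_topic_id = max(q_vec, key=lambda item: item[1])[0]
    return model.print_topic(best_topic_id)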
    # (tail of build_datas: inflate the collected index features into
    # one-hot rows and return a sparse matrix)
    features_array.append(features)

    inflated_feats = []
    for dense in features_array:
        sparse = np.zeros(len(features_all))
        for i in dense:
            sparse[i] = 1
        inflated_feats.append(sparse)
    A = np.array(inflated_feats)
    return scipy.sparse.csr_matrix(A), np.array(labels), features_all


def save_model(clf, features):
    joblib.dump(clf, 'model.pkl')
    pickle.dump(features, open('feature.pkl', 'wb'))


if __name__ == '__main__':
    # start_time is presumably set at module import time in the original script
    corpus_file = sys.argv[1]
    annot_file = sys.argv[2]
    annotations = read_annotation_files(annot_file)
    print("Read annotation files " + str(passed_time(start_time)))
    sentences = ut.build_corpus(open(corpus_file, "r").read().split("\n"))
    print("Read the corpus " + str(passed_time(start_time)))
    features, tags, all_features = build_datas(annotations, sentences)
    print("Built the features " + str(passed_time(start_time)))
    clf = svm.LinearSVC()
    clf.fit(features, tags)
    save_model(clf, all_features)
    print("Saved model " + str(passed_time(start_time)))
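# Invocation sketch (the script and file names are placeholders):
#
#     python train_svm.py corpus.txt annotations.txt
#
# which reads the corpus and annotations, builds the sparse feature matrix,
# fits the LinearSVC, and writes model.pkl / feature.pkl to the working
# directory.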
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()
    model_path = args.load_checkpoint + '.model'
    args_path = args.load_checkpoint + '.json'
    with open(args_path, 'r') as f:
        _args = json.load(f)['args']
    for k, v in _args.items():
        setattr(args, k, v)
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    print(args)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]

    caseless = args.caseless
    batch_size = args.batch_size

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None

    # load datasets
    _, _, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    assert os.path.isfile(model_path), "Checkpoint not found!"
    print('Loading checkpoint file from {}...'.format(model_path))
    checkpoint_file = torch.load(model_path)
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    # predict
    y_true, y_pred, treelists, f1_by_len = predict(trainer, test_loader, target_map, cuda=args.cuda)

    # assign words to roots
    for tup, treelist in zip(test_raw_corpus, treelists):
        for t in treelist:
            t.idx = tup.sent[t.idx] if t.idx < len(tup.sent) else None

    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    def print_info(f, tup, target, pred, root):
        f.write('{}\n'.format(' '.join(tup.sent)))
        f.write('{}\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))
        f.write('{}\n\n'.format(root))

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred, treelists):
            if target != pred:
                print_info(f, tup, target, pred, treelist[-1])

    # correctly classified positive examples
    print('Writing correctly classified examples...')
    with open(args.correct_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred, treelists):
            if target == pred and target != 'null':
                print_info(f, tup, target, pred, treelist[-1])
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None

    train_loader, val_loader, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    best_f1 = float('-inf')
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    patience_count = 0
    start_time = time.time()
    q = mp.Queue()

    # set start method for multiprocessing
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # multi-process training, left disabled:
        # processes = []
        # for rank in range(args.num_processes):
        #     p = mp.Process(target=train, args=(train_loader, trainer, epoch, q))
        #     p.start()
        #     processes.append(p)
        # for p in processes:
        #     p.join()
        # epoch_loss = q.get()

        # update lr
        trainer.lr_step(epoch_loss)

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
            track_list.append({'epoch': epoch, 'loss': epoch_loss,
                               'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss,
                               'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch, 'loss': epoch_loss,
                               'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
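# Standard entry point, assuming this module is meant to be run directly (the
# excerpt above ends without one):
if __name__ == '__main__':
    main()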