def doc_test(log_file):
    """Test the model qualitatively on a document.

    Args:
        log_file: where to store the output document.
    """
    if os.path.isfile('./' + log_file + '_doc'):
        raise ValueError('log file already exists')
    # The temp file created here splits the input document into line by line
    # representations to make it easier to see how translations were done on a
    # sentence level.
    with tf.Session() as sess, open(log_file + '_doc', 'w+') as log, open(
            dc.NORMAL_DOC_PATH, 'r+') as doc_file, open('temp_doc', 'w+') as temp:
        doc_to_translate = doc_file.readline()
        sentences = [
            sentence + '.' for sentence in doc_to_translate.split('.')
        ]
        model = create_model(sess, True)
        model.batch_size = 1
        normal_vocab, _ = data_utils.get_vocabulary(dc.NORMAL_VOCAB_PATH)
        _, rev_simple_vocab = data_utils.get_vocabulary(dc.SIMPLE_VOCAB_PATH)
        for sentence in sentences:
            translation = pipe_sentence(sentence, normal_vocab,
                                        rev_simple_vocab, sess, model)
            log.write(translation + '\n')
            temp.write(sentence + '\n')
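
# The helper pipe_sentence() used by doc_test() and input_test() is not part of
# this excerpt. The sketch below only illustrates what such a helper typically
# does in a TF 1.x bucketed seq2seq setup (tokenize, pick a bucket, run one
# decode step, greedily map output ids back to words). The _buckets list and the
# data_utils.sentence_to_token_ids, data_utils.EOS_ID, model.get_batch and
# model.step names are assumptions about this project's API, not its confirmed
# implementation.
import numpy as np


def pipe_sentence_sketch(sentence, normal_vocab, rev_simple_vocab, sess, model):
    # Convert the raw sentence into token ids using the "normal" vocabulary.
    token_ids = data_utils.sentence_to_token_ids(sentence, normal_vocab)
    # Pick the smallest bucket whose encoder side fits the sentence.
    bucket_id = min(b for b in xrange(len(_buckets))
                    if _buckets[b][0] > len(token_ids))
    # Build a batch of size 1 and run a single forward/decode step.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # Greedy decoding: take the argmax token at each position, stop at EOS.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Map output ids back to words with the reversed "simple" vocabulary.
    return ' '.join(rev_simple_vocab[output] for output in outputs)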
def main():
    args, settings = parse_args_and_settings()
    logger = output_utils.Logger(args)
    logger.shout('python main.py ' + ' '.join(sys.argv[1:]))

    num_threads = 5
    torch.set_num_threads(num_threads)

    # Load entity map (needed for all phases; primarily for loading data):
    entity_idx_to_name, entity_name_to_idx = data_utils.load_entity_map(settings.data.entity_map)

    if args.phase == 'train' or args.phase == 'deploy':
        # Loading train data is needed for train, but also for deploy, namely if vocabulary doesn't exist yet:
        train_data = None
        if args.phase == 'train' or not os.path.exists(settings.data.vocabulary):
            train_data = data_utils.load_data(settings.data.dataset, entity_name_to_idx, with_keys=True,
                                              logger=logger)

        # Load vocabulary (and extract from train_data if vocabulary doesn't exist yet)
        vocabulary_idx_to_word, vocabulary_word_to_idx = data_utils.get_vocabulary(settings.data.vocabulary,
                                                                                   extract_from=train_data,
                                                                                   logger=logger)

        # Avoid loading/generating google news embeddings in deploy phase:
        if args.phase == 'deploy' and settings.model.token_emb == config_utils.data_paths["embeddings"]["google_news"]:
            settings.model.token_emb = 300  # Appropriate embeddings will be loaded anyway from saved .pt model file.
            # TODO: This won't generalize when using other embeddings.

        # Load embeddings if needed:
        if isinstance(settings.model.token_emb, str):
            settings.model.token_emb = embedding_loader.load_word_embeddings(settings.model.token_emb,
                                                                             settings.data.dataset, train_data, logger)
        if isinstance(settings.model.speaker_emb, str):
            settings.model.speaker_emb = embedding_loader.load_entity_embeddings(settings.model.speaker_emb,
                                                                                 settings.data.entity_map, logger)

        # Convenient to compute and store some dependent parameters:
        settings.model.vocabulary_size = len(vocabulary_idx_to_word)
        settings.model.num_entities = len(entity_idx_to_name)

    if args.phase == 'train':
        logger.save_config(settings.orig)
        logger.say(output_utils.bcolors.BOLD + 'Training on ' + settings.data.dataset)
        run_training(settings, train_data, vocabulary_idx_to_word, vocabulary_word_to_idx, logger, not args.no_cuda)

    if args.phase == 'deploy':
        logger.say(output_utils.bcolors.BOLD + 'Deploying ' + str(len(args.model)) + ' models (' + (
            args.run_name if len(args.model) > 1 else args.model[0]) + ')...\n ...on ' + (
            'folds of ' if not args.no_cv else '') + args.deploy_data)
        args.answer_file, with_keys = run_deploy(args.model, settings, args.deploy_data, vocabulary_idx_to_word,
                                                 vocabulary_word_to_idx, entity_name_to_idx, args.answers_per_fold,
                                                 args.no_cv, logger, not args.no_cuda)
        # After deploying, evaluate (unless not desired or data does not contain reference keys):
        if not args.no_eval:
            if with_keys is True:
                args.phase = 'evaluate'
            else:
                logger.shout('Warning: Model predictions will not be evaluated, '
                             'since given data does not contain reference labels.')

    if args.phase == 'evaluate':
        logger.say(output_utils.bcolors.BOLD + 'Evaluating ' +
                   ('(not SemEval style) ' if args.no_semeval else '(SemEval style) ') +
                   'predictions of ' + args.answer_file)
        run_evaluate(args.answer_file, args.deploy_data, entity_name_to_idx, entity_idx_to_name, args.no_semeval,
                     logger)
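
# main() expects parse_args_and_settings() to return an (args, settings) pair,
# but that function is not shown in this excerpt. The argparse sketch below only
# covers the attributes main() actually reads (phase, model, run_name,
# deploy_data, answer_file, answers_per_fold, no_cv, no_eval, no_semeval,
# no_cuda); the flag names and defaults are assumptions, and the nested settings
# object (settings.data.*, settings.model.*, settings.orig) is omitted rather
# than reconstructed.
import argparse


def parse_args_and_settings_sketch():
    parser = argparse.ArgumentParser(description='Train, deploy or evaluate a model.')
    parser.add_argument('phase', choices=['train', 'deploy', 'evaluate'])
    parser.add_argument('--model', nargs='+', default=[], help='Saved .pt model file(s) to deploy.')
    parser.add_argument('--run_name', default='run', help='Label used when deploying multiple models.')
    parser.add_argument('--deploy_data', default=None, help='Data to deploy on / evaluate against.')
    parser.add_argument('--answer_file', default=None, help='Predictions file to evaluate.')
    parser.add_argument('--answers_per_fold', type=int, default=1)
    parser.add_argument('--no_cv', action='store_true', help='Do not deploy on cross-validation folds.')
    parser.add_argument('--no_eval', action='store_true', help='Skip evaluation after deploying.')
    parser.add_argument('--no_semeval', action='store_true', help='Evaluate without SemEval-style scoring.')
    parser.add_argument('--no_cuda', action='store_true', help='Run on CPU.')
    args = parser.parse_args()
    # The real function also builds the nested settings object from a config
    # file (settings.data.*, settings.model.*, settings.orig); omitted here.
    settings = None
    return args, settings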
def load_word_embeddings(embeddings_fname, training_datapath, training_data, logger=None):
    """
    :param embeddings_fname: The name of the file containing pre-trained embeddings.
        E.g., the Google-news w2v embeddings.
    :param training_datapath: The name of the file containing the training data for
        a model which uses word embeddings (loaded from embeddings_fname).
    :param training_data: The loaded training data, used to extract the vocabulary
        if no vocabulary file exists yet.
    :param logger: Optional logger for progress messages.
    :return: A numpy array with one embedding row per vocabulary word.
    """
    # vocab_fname: The name of the file containing the relevant vocabulary.
    # Each line contains the word idx and the word, separated by tabs ("\t").
    vocab_fname = training_datapath.replace(".conll", ".vocab")
    word_emb_fname = data_utils.get_embeddings_path_for_vocab(
        embeddings_fname, vocab_fname)

    if os.path.exists(word_emb_fname):
        if logger:
            logger.whisper(
                "Loading token embedding from {0}".format(word_emb_fname))
        word_embeddings = np.load(word_emb_fname)
    else:
        vocabulary_idx_to_word, _ = data_utils.get_vocabulary(
            vocab_fname, extract_from=training_data, logger=logger)
        all_word_vectors = load_word2vec_embeddings(embeddings_fname)
        word_embeddings, _, _ = filter_embeddings(all_word_vectors,
                                                  vocabulary_idx_to_word)
        save_word_embeddings(word_embeddings, word_emb_fname)
    return word_embeddings
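
# load_word_embeddings() relies on filter_embeddings() to cut the full pre-trained
# matrix down to the rows needed for the model's vocabulary; that function is not
# shown here. The sketch below is a minimal, self-contained illustration of the
# usual approach (copy the pre-trained vector when a word is known, otherwise keep
# a small random vector), assuming the three return values are the embedding
# matrix plus the lists of known and unknown words. The real signature and return
# order may differ.
import numpy as np


def filter_embeddings_sketch(all_word_vectors, vocabulary_idx_to_word, dim=300):
    """Build an embedding matrix whose row i corresponds to vocabulary word i."""
    known, unknown = [], []
    word_embeddings = np.random.uniform(
        -0.1, 0.1, (len(vocabulary_idx_to_word), dim)).astype(np.float32)
    for idx, word in enumerate(vocabulary_idx_to_word):
        if word in all_word_vectors:
            word_embeddings[idx] = all_word_vectors[word]
            known.append(word)
        else:
            unknown.append(word)
    return word_embeddings, known, unknown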
def input_test():
    """Input your own sentence and see how the model interprets it."""
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        normal_vocab, _ = data_utils.get_vocabulary(dc.NORMAL_VOCAB_PATH)
        _, rev_simple_vocab = data_utils.get_vocabulary(dc.SIMPLE_VOCAB_PATH)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            translation = pipe_sentence(sentence, normal_vocab,
                                        rev_simple_vocab, sess, model)
            print translation
            print "> "
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def get_data(args):
    df = pd.read_csv(PROTO_TSV, sep='\t')

    # Sentences
    sent_ids = set(df['Sentence.ID'].tolist())
    print(f'There are {len(sent_ids)} unique sentences.')

    sents_path = os.path.join(PICKLED_DIR, 'sents.pkl')
    sents = None
    if os.path.exists(sents_path) and 'sents' not in args.init_list:
        with open(sents_path, 'rb') as f:
            sents = pickle.load(f)
    else:
        with open(sents_path, 'wb') as f:
            sents = data_utils.get_nltk_sents(sent_ids)
            pickle.dump(sents, f)

    # Dependency data
    dependencies_path = os.path.join(PICKLED_DIR, 'dependencies.pkl')
    if os.path.exists(dependencies_path) and 'deps' not in args.init_list:
        with open(dependencies_path, 'rb') as f:
            deps, deps_just_tokens = pickle.load(f)
    else:
        with open(dependencies_path, 'wb') as f:
            deps, deps_just_tokens = data_utils.get_dependencies(sent_ids)
            pickle.dump((deps, deps_just_tokens), f)
    sents['dependencies'] = deps
    sents['deps_just_tokens'] = deps_just_tokens

    # Instances
    instances_path = os.path.join(PICKLED_DIR, 'instances.pkl')
    proto_instances = None
    possible = None  # Data to compare to SPRL paper
    if os.path.exists(instances_path) and 'instances' not in args.init_list:
        with open(instances_path, 'rb') as f:
            proto_instances, possible = pickle.load(f)
    else:
        proto_instances, possible = data_utils.build_instance_list(df)
        data_utils.add_pred_args(proto_instances, sents['trees'])
        with open(instances_path, 'wb') as f:
            pickle.dump((proto_instances, possible), f)

    # Matching between raw and dependency data
    if args.model_type != 'lstm':
        data_utils.match_conllu_to_raw(sents['raw'], deps)
        # No corresponding overwrite here since running logreg on local machine
    else:
        data_utils.match_raw_to_conllu(proto_instances, sents['raw'],
                                       deps_just_tokens)
        with open(instances_path, 'wb') as f:
            pickle.dump((proto_instances, possible), f)

    # Word embedding data
    sent_ids = {}  # Redefining sent_ids for this section
    for split in SPLITS:
        sent_ids[split] = [pt['Sentence.ID'] for pt in proto_instances[split]]

    w2e = None
    glove_path = os.path.join(PICKLED_DIR, f'glove_{args.glove_d}.pkl')
    if os.path.exists(glove_path) and 'glove' not in args.init_list:
        with open(glove_path, 'rb') as f:
            w2e = pickle.load(f)
    else:
        vocab = data_utils.get_vocabulary(deps_just_tokens)
        w2e = data_utils.w2e_from_file(GLOVE_FILE[args.glove_d], vocab=vocab)
        with open(glove_path, 'wb') as f:
            pickle.dump(w2e, f)

    w2i, i2w = None, None
    emb_np = None
    X, y = None, None
    if args.model_type == 'lstm':
        dicts_path = os.path.join(PICKLED_DIR, 'dicts.pkl')
        if os.path.exists(dicts_path) and 'dicts' not in args.init_list:
            with open(dicts_path, 'rb') as f:
                w2i, i2w = pickle.load(f)
        else:
            w2i, i2w = data_utils.build_dicts(sents['deps_just_tokens'],
                                              sent_ids=sent_ids,
                                              glove_vocab=sorted(
                                                  list(w2e.keys())))
            with open(dicts_path, 'wb') as f:
                pickle.dump((w2i, i2w), f)

        emb_np_path = os.path.join(PICKLED_DIR, 'emb_np.pkl')
        if os.path.exists(emb_np_path) and 'emb_np' not in args.init_list:
            with open(emb_np_path, 'rb') as f:
                emb_np = pickle.load(f)
        else:
            emb_np = data_utils.build_emb_np(w2e, w2i=w2i, i2w=i2w)
            with open(emb_np_path, 'wb') as f:
                pickle.dump(emb_np, f)

        lstm_data_path = os.path.join(PICKLED_DIR, 'lstm_data.pkl')
        if os.path.exists(lstm_data_path) and 'lstm_data' not in args.init_list:
            with open(lstm_data_path, 'rb') as f:
                X, y = pickle.load(f)
        else:
            # Proto instances modified in-place here
            data_utils.get_arg_head_idx(proto_instances, sents['dependencies'],
                                        sents['deps_just_tokens'])
            with open(instances_path, 'wb') as f:
                pickle.dump((proto_instances, possible), f)

            numericalized = data_utils.numericalize(sents['deps_just_tokens'],
                                                    w2i)
            X = {}
            y = {}
            for split in SPLITS:
                X[split], y[split] = data_utils.get_ins_outs_lstm(
                    proto_instances[split], numericalized)
            with open(lstm_data_path, 'wb') as f:
                pickle.dump((X, y), f)

    num_instances = sum([len(x) for x in proto_instances.values()])
    print(f'There are {num_instances} instances.')

    return {
        'df': df,
        'proto_instances': proto_instances,
        'possible': possible,
        'sents': sents,
        'w2e': w2e,
        'sent_ids': sent_ids,
        'lstm_data': (X, y),
        'dicts': (w2i, i2w),
        'emb_np': emb_np
    }
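
# A hedged example of how get_data() might be driven from a command line. Only
# the attribute names (init_list, model_type, glove_d) are taken from the code
# above; the flag spellings, defaults, and the 'logreg' model type are
# assumptions for illustration.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Load or rebuild the proto-role datasets.')
    parser.add_argument('--model_type', default='logreg',
                        help="'lstm' triggers the dictionary/embedding/LSTM tensor branch.")
    parser.add_argument('--glove_d', type=int, default=100,
                        help='GloVe dimensionality; must be a key of GLOVE_FILE.')
    parser.add_argument('--init_list', nargs='*', default=[],
                        help="Pickles to rebuild instead of loading, e.g. 'sents', 'deps', "
                             "'instances', 'glove', 'dicts', 'emb_np', 'lstm_data'.")
    args = parser.parse_args()

    data = get_data(args)
    print(f"Loaded {len(data['df'])} annotations "
          f"across splits {sorted(data['proto_instances'].keys())}.")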