def decode(args: Dict[str, str]):
    """ Decode a test set with beam search and save the top hypothesis of each sentence.

    The source sentences come from ``args['TEST_SOURCE_FILE']`` and the model
    from ``args['MODEL_PATH']``.  When ``args['TEST_TARGET_FILE']`` is set, the
    references are loaded too and the corpus-level BLEU of the top hypotheses
    is printed to stderr.  Output goes to ``args['OUTPUT_FILE']``, one
    space-joined hypothesis per line.
    """
    src_path = args['TEST_SOURCE_FILE']
    print(f"load test source sentences from [{src_path}]", file=sys.stderr)
    test_data_src = read_corpus(src_path, source='src')

    tgt_path = args['TEST_TARGET_FILE']
    if tgt_path:
        print(f"load test target sentences from [{tgt_path}]", file=sys.stderr)
        test_data_tgt = read_corpus(tgt_path, source='tgt')

    model_path = args['MODEL_PATH']
    print(f"load model from {model_path}", file=sys.stderr)
    model = NMT.load(model_path)
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=int(args['--beam-size']),
        max_decoding_time_step=int(args['--max-decoding-time-step']),
    )

    if args['TEST_TARGET_FILE']:
        best = [candidates[0] for candidates in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, best)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as out:
        # zip() is kept so that output stops at the shorter of the two sequences
        for _, candidates in zip(test_data_src, hypotheses):
            tokens = candidates[0].value
            out.write(' '.join(tokens) + '\n')
def compute_pr(model):
    '''Takes in a model to produce tags, then prints its average precision and recall.

    model: value passed to ``models.run_model`` together with the corpus
           returned by ``utils.read_corpus()``.
    '''
    # Single-argument print(...) behaves the same under Python 2 and Python 3
    print("Evaluating %s" % (model,))
    (vocab, tf) = utils.read_corpus()
    top_words = models.run_model(model, vocab, tf)
    average_recall = avg_recall(top_words)
    average_precision = avg_precision(top_words)
    # Format the computed values: the previous code passed the avg_precision
    # function itself to %f and never printed the recall.
    print("Average Precision: %f" % average_precision)
    print("Average Recall: %f" % average_recall)
def main():
    """ Entry point: build an NMT model on the sanity-check data and run the
    sanity check selected on the command line (``1d``, ``1e`` or ``1f``).
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    torch_message = "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)
    # NOTE(review): this compares version strings lexicographically, not numerically
    assert (torch.__version__ >= "1.0.0"), torch_message

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    data_dir = './sanity_check_en_es_data/'
    train_data_src = read_corpus(data_dir + 'train_sanity_check.es', 'src')
    train_data_tgt = read_corpus(data_dir + 'train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    # Only the first shuffled batch is needed; the loop variables keep it
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
        break
    vocab = Vocab.load(data_dir + 'vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

    if args['1d']:
        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1e']:
        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1f']:
        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
    else:
        raise RuntimeError('invalid run mode')
def build_vocab(args):
    """Return the vocabulary stored at ``args.vocab_path``, creating it if missing.

    When the file does not exist, the corpus in ``args.train_data_dir`` is
    read, every label is mapped to its position in the label sequence, and
    the vocabulary built from them (limited by ``args.max_vocab_size`` and
    ``args.min_freq``) is saved to ``args.vocab_path`` before being returned.
    """
    if os.path.exists(args.vocab_path):
        return Vocab.load(args.vocab_path)

    src_sents, label_names = read_corpus(args.train_data_dir)
    label_to_idx = {name: position for position, name in enumerate(label_names)}
    built = Vocab.build(src_sents, label_to_idx, args.max_vocab_size, args.min_freq)
    built.save(args.vocab_path)
    return built
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score (printed to stderr, scaled by 100).

    For every source sentence two lines are written to ``args['OUTPUT_FILE']``:
    ``source sentence: ...`` and ``translation: ...``.  The output directory is
    created when it does not exist.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    output_file = args['OUTPUT_FILE']
    output_dir = os.path.dirname(output_file)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w+') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write('source sentence: ' + ' '.join(src_sent) + '\n')
            f.write('translation: ' + hyp_sent + '\n')
def _precision_recall_f1(tp, fp, fn):
    """ Return (precision, recall, f1) for the given counts; a metric whose
    denominator is zero is reported as 0.0 instead of raising. """
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1_score = (2 * tp) / (2 * tp + fp + fn) if 2 * tp + fp + fn else 0.0
    return precision, recall, f1_score


def test(args):
    """ Testing the model
    Args:
        args: parsed command-line options, read by attribute: SENT_VOCAB,
              TAG_VOCAB, TEST, MODEL, batch_size and log_every
    Prints running and final precision / recall / F1 computed from the
    true-positive, false-positive and false-negative counts returned by
    ``cal_statistics`` for every (gold, predicted) tag sequence.
    """
    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    sentences, tags = utils.read_corpus(args.TEST)
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    # NOTE(review): the device is fixed to the first GPU; args.cuda is not consulted
    device = torch.device('cuda:0')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)
    print('start testing...')
    print('using device', device)

    batch_size = int(args.batch_size)
    log_every = int(args.log_every)
    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0

    model.eval()
    with torch.no_grad():
        # Distinct batch names so the full corpus lists above are not shadowed
        for batch_sents, batch_tags in utils.batch_iter(test_data, batch_size=batch_size, shuffle=False):
            padded_sents, sent_lengths = utils.pad(batch_sents, sent_vocab[sent_vocab.PAD], device)
            predicted_tags = model.predict(padded_sents, sent_lengths)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(batch_tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % log_every == 0:
                precision, recall, f1_score = _precision_recall_f1(tp, fp, fn)
                print('log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec' %
                      (n_iter, num_words / (time.time() - start), precision, recall, f1_score,
                       time.time() - start))
                # Throughput is measured per logging window
                num_words = 0
                start = time.time()
    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision, recall, f1_score = _precision_recall_f1(tp, fp, fn)
    print('Precision: %f, Recall: %f, F1 score: %f' % (precision, recall, f1_score))
def decode(args: Dict[str, str]):
    """ Runs ``beam_search2`` with an NMT and a DPPNMT model loaded from the
    same checkpoint, over the whole test set or over a single sentence.

    When ``args['INDEX']`` is set, only the source/target pair at that index
    is passed on; otherwise the full source and target lists are.  Beam size
    and maximum decoding length are fixed at 5 and 70.
    @param args (Dict): args from cmd line
    @raises ValueError: if ``args['TEST_TARGET_FILE']`` is not given
    """
    # beam_search2 is always handed the target sentences; without them the
    # previous code failed later on an unbound local, after loading both models.
    if not args['TEST_TARGET_FILE']:
        raise ValueError("TEST_TARGET_FILE is required: target sentences are passed to beam_search2")

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    print("load test target sentences from [{}]".format(
        args['TEST_TARGET_FILE']), file=sys.stderr)
    test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model1 = NMT.load(args['MODEL_PATH'])
    model2 = DPPNMT.load(args['MODEL_PATH'])

    if args['INDEX']:
        index = int(args['INDEX'])
        src_sents, tgt_sents = [test_data_src[index]], [test_data_tgt[index]]
    else:
        src_sents, tgt_sents = test_data_src, test_data_tgt

    # NOTE(review): 5 and 70 are hard-coded; --beam-size and
    # --max-decoding-time-step are not read here.
    beam_search2(model1, model2, src_sents, 5, 70, tgt_sents)
def decode(args: Dict[str, str]):
    """ Decode a test set with beam search and write the top hypothesis of
    every sentence to ``args['OUTPUT_FILE']``.

    When ``args['TEST_TARGET_FILE']`` is given, two figures are printed to
    stderr: an accuracy that compares the first hypothesis token with the
    reference token at position 1, and the corpus BLEU scaled by 100.
    @param args (Dict): args from cmd line
    """
    source_file = args['TEST_SOURCE_FILE']
    print("load test source sentences from [{}]".format(source_file), file=sys.stderr)
    test_data_src = read_corpus(source_file, source='src')

    target_file = args['TEST_TARGET_FILE']
    if target_file:
        print("load test target sentences from [{}]".format(target_file), file=sys.stderr)
        test_data_tgt = read_corpus(target_file, source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=int(args['--beam-size']),
        max_decoding_time_step=int(args['--max-decoding-time-step']),
    )

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [candidates[0] for candidates in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)

        # accuracy (unigrams): first predicted token vs. reference token at index 1
        perfectly_correct = sum(
            hyp.value[0] == test_data_tgt[position][1]
            for position, hyp in enumerate(top_hypotheses)
        )
        print('Ignore accuracy for non unigrams')
        accuracy = perfectly_correct / len(test_data_tgt)
        print('Accuracy: {}'.format(accuracy), file=sys.stderr)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as out:
        for _, candidates in zip(test_data_src, hypotheses):
            out.write(' '.join(candidates[0].value) + '\n')
def decode(load_from=None):
    """ performs decoding on the test or dev set (chosen by ``gconfig.test``),
    saves the top hypothesis of every sentence to ``paths.decode_output`` and
    scores the file against the references with ``scripts/multi-bleu.perl``.

    load_from: optional model path; ``paths.model`` is used when it is None.
    """
    import subprocess  # local import: keeps the new dependency inside this block

    if gconfig.test:
        src_path, data_tgt_path = paths.test_source, paths.test_target
    else:
        src_path, data_tgt_path = paths.dev_source, paths.dev_target
    data_src = read_corpus(src_path, source='src')
    # The references are not loaded here: only the external BLEU script reads them.

    model_load_path = paths.model if load_from is None else load_from
    # Report the path that is actually loaded (previously always paths.model)
    print(f"load model from {model_load_path}", file=sys.stderr)
    model = NMTModel.load(model_load_path)
    if gconfig.cuda:
        model.to_gpu()
    model.eval()

    max_step = dconfig.max_decoding_time_step
    if gconfig.sanity:
        max_step = 2

    hypotheses = routine.beam_search(model, data_src, max_step, replace=dconfig.replace)

    lines = [hyps[0].value for _, hyps in zip(data_src, hypotheses)]
    write_sents(lines, paths.decode_output)

    # Argument list + stdin redirection instead of a shell string, so paths
    # containing spaces or shell metacharacters are passed through verbatim.
    try:
        with open(paths.decode_output, 'rb') as decoded:
            subprocess.run(["perl", "scripts/multi-bleu.perl", data_tgt_path], stdin=decoded)
    except OSError as err:
        # os.system never raised; keep scoring best-effort
        print(f"could not run multi-bleu.perl: {err}", file=sys.stderr)
def translate(model):
    """Translate the test corpus sentence by sentence and save the results.

    The corpus is selected by ``model.args.en_cn`` / ``model.args.en_es``.
    Each output line has the form ``translate: <prediction>     target: <reference>``,
    where the reference is printed without its first and last token.
    """
    options = model.args
    if options.en_cn:
        test_src, test_tgt = read_en_cn_corpus(
            options.test_data, source=options.source_language)
    elif options.en_es:
        test_src = read_corpus(options.test_src_data, False)
        test_tgt = read_corpus(options.test_tgt_data, True)
    else:
        print("invalid input")
        exit(0)

    test_pairs = list(zip(test_src, test_tgt))
    # The save path is read from the model's own options, like every other
    # setting here, instead of a bare module-level `args`.
    with open(options.translate_result_save_path, 'w') as f:
        for src_sent, tgt_sent in batch_iter(test_pairs, 1, shuffle=False):
            tgt_predict = model.translate(src_sent[0])
            tgt_predict = ' '.join(tgt_predict[0]['sentence'])
            # batch size is 1; drop the first and last token of the reference
            tgt = ' '.join(tgt_sent[0][1:-1])
            f.write("translate: " + tgt_predict + " " * 5 + "target: " + tgt + '\r\n')
    # no explicit close: the with-statement closes the file
def filter_subs(input_data_path: str, output_data_path: str):
    """Keep only short subtitle lines made of allowed characters.

    A line is kept when it consists solely of Latin or Cyrillic letters,
    digits, whitespace and the listed punctuation, has at most 50
    whitespace-separated tokens and at most 250 characters.  The surviving
    lines are saved to ``output_data_path``.
    """
    print('Reading data...')
    subs = read_corpus(input_data_path)
    # Raw string so \d and \s reach the regex engine untouched.  [A-Za-z]
    # replaces [A-z], which also admitted the ASCII characters between 'Z'
    # and 'a' ([ \ ] ^ _).  ё/Ё are listed because they lie outside А-я.
    pattern = re.compile(r"""^(?:[A-Za-z]|[А-я]|[ёЁ\d\s.,!:?\-––'"%$()`])+$""")

    print('Filtering...')
    filtered = [s for s in tqdm(subs) if pattern.match(s)]

    print('Removing too long sentences...')
    short = [s for s in tqdm(filtered) if len(s.split()) <= 50 and len(s) <= 250]

    print('Saving...')
    save_corpus(short, output_data_path)

    print('Done!')
def decodeOT(args: Dict[str, str]):
    """ performs decoding on a test set with ``beam_searchOT``, and save the
    best-scoring decoding results. If the target gold-standard sentences are
    given, the function also computes corpus-level BLEU score.

    The encoder and decoder are switched to eval mode while decoding and back
    to train mode afterwards, including when decoding fails.
    """
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    if os.path.exists(args['MODEL_PATH']):
        model = NMT.load(args['MODEL_PATH'])
    else:
        # NOTE(review): with no checkpoint a freshly constructed model is used.
        # pickle.load must only ever be given a trusted vocabulary file.
        with open('data/vocab.bin', 'rb') as vocab_file:
            model = NMT(256, 256, pickle.load(vocab_file))

    # Set models to eval (disables dropout)
    model.encoder.eval()
    model.decoder.eval()
    try:
        hypotheses = beam_searchOT(model, test_data_src,
                                   beam_size=int(args['--beam-size']),
                                   max_decoding_time_step=int(
                                       args['--max-decoding-time-step']))

        if args['TEST_TARGET_FILE']:
            top_hypotheses = [hyps[0] for hyps in hypotheses]
            bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
            print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

        with open(args['OUTPUT_FILE'], 'w') as f:
            for src_sent, hyps in zip(test_data_src, hypotheses):
                top_hyp = hyps[0]
                # the hypothesis value is a string here; its first and last
                # whitespace-separated tokens are dropped
                hyp_sent = ' '.join(top_hyp.value.split()[1:-1])
                f.write(hyp_sent + '\n')
    finally:
        # Back to train mode even if decoding raised
        model.encoder.train()
        model.decoder.train()
def decode(args: Dict[str, str]):
    """ performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    Reads ``--vocab``, ``--test-src``, ``--test-tgt``, ``--model-path``,
    ``--beam-size``, ``--max-decoding-time-step`` and ``--output-path``
    from ``args``.
    """
    # NOTE(review): pickle.load / torch.load must only be given trusted files
    with open(args['--vocab'], 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    srcEntry = vocab.src  # VocabEntry for src
    tgtEntry = vocab.tgt  # VocabEntry for tgt

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    test_data_src = read_corpus(args['--test-src'], source='src')
    test_data = Testset(srcEntry.words2indices(test_data_src))
    # batch size 1, original order
    test_loader = DataLoader(test_data, 1, shuffle=False)
    if args['--test-tgt']:
        test_data_tgt = read_corpus(args['--test-tgt'], source='tgt')

    print(f"load model from {args['--model-path']}", file=sys.stderr)
    model = torch.load(args['--model-path'])

    hypotheses = beam_search(model, test_loader,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']),
                             tgtEntry=tgtEntry, device=device)

    if args['--test-tgt']:
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['--output-path'], 'w') as f:
        # each entry of `hypotheses` is used directly as the top hypothesis
        for src_sent, top_hyp in zip(test_data_src, hypotheses):
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    ``--dpp`` loads a DPPNMT model instead of NMT; ``--sampling`` sets
    ``model.sampling = True`` on the NMT model.  The two are mutually exclusive.
    @param args (Dict): args from cmd line
    @raises ValueError: if both ``--dpp`` and ``--sampling`` are given
    """
    # Previously this case only printed a message and then crashed on the
    # never-assigned `model`; fail fast with an explicit error instead.
    if args['--dpp'] and args['--sampling']:
        raise ValueError("cannot specify --dpp and --sampling simultaneously")

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    if args['--dpp']:
        print("Loading DPP Model")
        model = DPPNMT.load(args['MODEL_PATH'])
    else:
        model = NMT.load(args['MODEL_PATH'])
        if args['--sampling']:
            model.sampling = True

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
def detokenize(input_file, output_file):
    """Detokenize every line of ``input_file`` and save the result to ``output_file``.

    Each line is split on whitespace and handed to the module-level
    ``detokenizer``; progress messages are printed along the way.
    """
    print('Reading data...')
    texts = read_corpus(input_file)
    print('Data reading finished!')

    print('Detokenizing...')
    detokenized = []
    for line in tqdm(texts):
        tokens = line.split()
        detokenized.append(detokenizer.detokenize(tokens))
    print('Detokenized!')

    print('Saving...')
    save_corpus(detokenized, output_file)
    print('Saved!')
def decode(args: Dict[str, str]):
    """ Decode a test set with beam search and save the top hypothesis of each sentence.

    ``read_corpus`` returns the sentences together with a list of failed ids;
    sentences whose position is in the failed ids of either side are dropped
    before decoding.  The target file is optional: when it is given, the
    corpus-level BLEU score is printed to stderr as well.
    """
    test_data_src, failed_ids_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    total_failed_ids = set(failed_ids_src)

    # Only read the references when a target file was given; the BLEU step
    # below already treats it as optional.
    has_target = bool(args['TEST_TARGET_FILE'])
    if has_target:
        test_data_tgt, failed_ids_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')
        total_failed_ids.update(failed_ids_tgt)
        test_data_tgt = [sent for i, sent in enumerate(test_data_tgt) if i not in total_failed_ids]
    test_data_src = [sent for i, sent in enumerate(test_data_src) if i not in total_failed_ids]

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])
    if args['--cuda']:
        # NOTE(review): the GPU index is hard-coded to 2
        model = model.to(torch.device("cuda:2"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if has_target:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
def greedy_test(args):
    """ Test function: greedy decoding of every sentence in ``args.decode_from_file``.

    For each source sentence the encoder runs once, then at most
    ``args.decode_max_steps`` target tokens are predicted one at a time by
    taking the arg-max of the scores at the last position.  Decoding stops at
    EOS or at the step limit; the tokens predicted before the stopping step
    are printed and written to ``args.decode_output_file``.
    """
    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    translator = Transformer(args, vocab)
    translator.eval()

    # load parameters
    translator.load_state_dict(torch.load(args.decode_model_path))
    if args.cuda:
        translator = translator.cuda()

    test_data = read_corpus(args.decode_from_file, source="src")
    max_steps = args.decode_max_steps

    # The with-statement closes the output file even if decoding fails;
    # no_grad avoids building autograd state during inference.
    with codecs.open(args.decode_output_file, "w", encoding="utf-8") as output_file, torch.no_grad():
        for test in test_data:
            # Fresh target prefix per sentence: <BOS> followed by padding.
            # (Previously every sentence shared one list object.)
            pred = [constants.PAD_WORD if i else constants.BOS_WORD for i in range(max_steps)]
            pred_output = [constants.PAD_WORD] * max_steps
            test_var = to_input_variable([test], vocab.src, cuda=args.cuda)

            # only need one time
            enc_output = translator.encode(test_var[0], test_var[1])
            for i in range(max_steps):
                pred_var = to_input_variable([pred[:i + 1]], vocab.tgt, cuda=args.cuda)

                scores = translator.translate(enc_output, test_var[0], pred_var)

                _, argmax_idxs = torch.max(scores, dim=-1)
                one_step_idx = argmax_idxs[-1].item()

                pred_output[i] = vocab.tgt.id2word[one_step_idx]
                if (one_step_idx == constants.EOS) or (i == max_steps - 1):
                    print("[Source] %s" % " ".join(test))
                    print("[Predict] %s" % " ".join(pred_output[:i]))
                    print()

                    output_file.write(" ".join(pred_output[:i]) + "\n")
                    output_file.flush()
                    break
                # feed the predicted word back in as the next decoder input
                pred[i + 1] = vocab.tgt.id2word[one_step_idx]
def decode(args, test_data_src, test_data_tgt=None):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    args: options read by attribute (model_path, cuda, beam_size,
          max_decoding_time_step, output_file)
    test_data_src: path of the source sentences
    test_data_tgt: optional path of the reference sentences
    """
    print("load test source sentences from [{}]".format(test_data_src))
    test_data_src = read_corpus(test_data_src, source='src')

    # Remember whether references were supplied before the name is rebound
    has_reference = bool(test_data_tgt)
    if has_reference:
        print("load test target sentences from [{}]".format(test_data_tgt))
        test_data_tgt = read_corpus(test_data_tgt, source='tgt')

    print("load model from {}".format(args.model_path))
    model = NMT.load(args.model_path)

    if args.cuda:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model, test_data_src,
        beam_size=args.beam_size,
        max_decoding_time_step=args.max_decoding_time_step)

    # Gate BLEU on the references actually passed in, not on args.test_tgt:
    # the two could disagree and leave test_data_tgt as None here.
    if has_reference:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100))

    if args.output_file:
        print("Saving predictions to " + args.output_file)
        with open(args.output_file, 'w') as f:
            for src_sent, hyps in zip(test_data_src, hypotheses):
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                f.write(hyp_sent + '\n')
    else:
        print("No output_file given, not saving predictions")
def main(args):
    """Build and save the source/target vocabulary unless the file already exists.

    The vocabulary file is ``args.vocab_file`` inside ``args.data_path``.
    When it is missing, the training corpora ``args.train_src`` and
    ``args.train_tgt`` are read from the same directory and a vocabulary
    limited by ``args.size`` and ``args.freq_cutoff`` is built and saved.
    """
    vocab_path = os.path.join(args.data_path, args.vocab_file)
    if os.path.isfile(vocab_path):
        print('vocab file exist in %s.' % vocab_path)
        return

    print('Reading in source sentence : %s ...' % args.train_src)
    src_sents = read_corpus(os.path.join(args.data_path, args.train_src), source='src')

    print('Reading in target sentence : %s ...' % args.train_tgt)
    tgt_sents = read_corpus(os.path.join(args.data_path, args.train_tgt), source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, vocab_size=args.size, freq_cutoff=args.freq_cutoff)
    sizes = (len(vocab.src), len(vocab.tgt))
    print('generated vocabulary, source %d words, target %d words' % sizes)

    vocab.save(vocab_path)
    print('vocabulary saved to %s' % vocab_path)
def decode(args: Dict[str, str], vocab):
    """ Decode a test set with beam search and save the top hypothesis of each sentence.

    The model is constructed from the hyper-parameters in ``args`` and the
    given ``vocab``, moved to the GPU when one is available, and then
    ``model.load(args['MODEL_PATH'])`` is called on it.  If a target file is
    given, the corpus-level BLEU score is printed to stderr.
    """
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    model_options = dict(
        embed_size=int(args['--embed-size']),
        hidden_size=int(args['--hidden-size']),
        dropout_rate=float(args['--dropout']),
        n_layers=int(args['--n_layers']),
        vocab=vocab,
        tie_weights=int(args['--tie-weights']),
        mha=int(args['--mha']),
    )
    model = NMT(**model_options)
    if torch.cuda.is_available():
        model = model.cuda()
    # the return value of load() is not used; the instance itself is kept
    model.load(args['MODEL_PATH'])

    beam_size = int(args['--beam-size'])
    max_steps = int(args['--max-decoding-time-step'])
    hypotheses = beam_search(model, test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_steps)

    if args['TEST_TARGET_FILE']:
        best = [candidates[0] for candidates in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, best)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as out:
        for _, candidates in zip(test_data_src, hypotheses):
            out.write(' '.join(candidates[0].value) + '\n')
def load_data(xml_file, vectorizer):
    """Read a labelled corpus and vectorize its texts.

    xml_file:   corpus passed to ``utils.read_corpus``, which yields
                ``(values, label)`` pairs; ``values[0]`` is used as the text.
    vectorizer: object whose ``fit_transform`` turns the texts into features.

    Returns a dict with the features under ``"data"`` and the labels under
    ``"label"``.
    """
    texts, labels = [], []
    for values, label in utils.read_corpus(xml_file):
        texts.append(values[0])
        labels.append(label)
    feas = vectorizer.fit_transform(texts)
    labeled_data = {"data": feas, "label": labels}
    # Single-argument print(...) gives the same output on Python 2 and 3
    # (the former print statement is a syntax error on Python 3).
    print("the feature of data: %s" % (feas.shape,))
    return labeled_data
def main():
    """Build the sentence and tag vocabularies from the training corpus and save them.

    Command-line options: ``TRAIN`` (corpus), ``--max-size``,
    ``--freq-cutoff``, ``SENT_VOCAB`` and ``TAG_VOCAB`` (output paths).
    """
    args = docopt(__doc__)
    sentences, tags = read_corpus(args['TRAIN'])

    sent_vocab = Vocab.build(
        sentences,
        int(args['--max-size']),
        int(args['--freq-cutoff']),
        is_tags=False,
    )
    tag_vocab = Vocab.build(
        tags,
        int(args['--max-size']),
        int(args['--freq-cutoff']),
        is_tags=True,
    )

    # both vocabularies are built before either one is written
    for vocabulary, key in ((sent_vocab, 'SENT_VOCAB'), (tag_vocab, 'TAG_VOCAB')):
        vocabulary.save(args[key])
def generate_sentiment_words(neg_input_path:str, pos_input_path:str, neg_output_path:str, pos_output_path:str, keep_n_most_popular_words:int=3000):
    """Extract words that are frequent in only one of two corpora.

    Words are lower-cased and counted per corpus; the
    ``keep_n_most_popular_words`` most common words of each corpus are
    taken, and the words appearing in only one of the two top lists are
    saved to the corresponding output path.
    """
    print('Reading data...')
    neg_lines = read_corpus(neg_input_path)
    pos_lines = read_corpus(pos_input_path)

    print('Counting words')
    neg_counter = Counter()
    for sentence in tqdm(neg_lines):
        neg_counter.update(word.lower() for word in sentence.split())
    pos_counter = Counter()
    for sentence in tqdm(pos_lines):
        pos_counter.update(word.lower() for word in sentence.split())

    print('Getting most popular')
    neg_top_words = {word for word, _ in neg_counter.most_common(keep_n_most_popular_words)}
    pos_top_words = {word for word, _ in pos_counter.most_common(keep_n_most_popular_words)}

    # set difference: frequent on one side, not among the frequent words of the other
    only_neg_top_words = neg_top_words.difference(pos_top_words)
    only_pos_top_words = pos_top_words.difference(neg_top_words)

    print('Saving')
    save_corpus(list(only_neg_top_words), neg_output_path)
    save_corpus(list(only_pos_top_words), pos_output_path)

    print('Done!')
def decode(args: Dict[str, str]):
    """ Run beam search over a test set and store the top hypothesis per sentence.

    Inputs are taken from ``args``: the source file, an optional target file
    (used to print the corpus-level BLEU score to stderr), the model path,
    the ``--cuda`` flag, the beam size and the maximum decoding time step.
    The hypotheses are written to ``args['OUTPUT_FILE']``, one per line.
    """
    source_file = args['TEST_SOURCE_FILE']
    print(f"load test source sentences from [{source_file}]", file=sys.stderr)
    test_data_src = read_corpus(source_file, source='src')

    target_file = args['TEST_TARGET_FILE']
    if target_file:
        print(f"load test target sentences from [{target_file}]", file=sys.stderr)
        test_data_tgt = read_corpus(target_file, source='tgt')

    checkpoint = args['MODEL_PATH']
    print(f"load model from {checkpoint}", file=sys.stderr)
    model = NMT.load(checkpoint)
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    search_options = {
        'beam_size': int(args['--beam-size']),
        'max_decoding_time_step': int(args['--max-decoding-time-step']),
    }
    hypotheses = beam_search(model, test_data_src, **search_options)

    if args['TEST_TARGET_FILE']:
        first_choices = [ranked[0] for ranked in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, first_choices)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as out:
        # pairing with the source keeps the number of lines bounded by both lists
        out.writelines(
            ' '.join(ranked[0].value) + '\n'
            for _, ranked in zip(test_data_src, hypotheses)
        )
def main():
    # Python 2 script entry point.  Reads pairs from stdin, NP heads from the
    # file named by sys.argv[1] and corpus tokens from the file named by
    # sys.argv[2]; collects a feature dict per head token and prints one
    # tab-separated row per pair.
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-stripped source — confirm it against the original file.
    head_funcs = {}
    pairs = parse_pairs(sys.stdin)
    head_np, np_head = parse_NPs(open(sys.argv[1]), pairs)
    # ids of head tokens seen in the current sentence, and its POS counts
    id_for_sent = []
    c = Counter()
    for token in read_corpus(open(sys.argv[2])):
        if token.id == 'SENT':
            # Sentence boundary: attach the per-sentence counts to every head
            # collected in this sentence, then reset the accumulators.
            for i in id_for_sent:
                head_funcs[i]['verbs'] = c['V']
                head_funcs[i]['oids'] = c['o']
                head_funcs[i]['conjs'] = c['C']
                head_funcs[i]['nouns'] = c['N']
            id_for_sent = []
            c.clear()
            continue
        add_pos_count(token, c)
        if token.id in head_np.keys():
            in_pair = _in_pair(token.id, head_np, pairs)
            if in_pair:
                id_for_sent.append(token.id)
                hf = {}
                hf['isNomn'], hf['isNpro'], hf['gnc'] = [hhf(token) for hhf in _funcs]
                hf['text'] = token.orig_text.encode('utf-8')
                hf['id'] = token.id
                hf['POS'] = '_'.join([str(_pos_codes[x]) for x in set(token.getPOStags())])  # there may be several parses; this is a poor way to handle that
                # overwrites the 'gnc' value set from _funcs above
                hf['gnc'] = gnc(token)
                hf['gender'], hf['number'], hf['case'] = hf['gnc']
                head_funcs[token.id] = hf
                if in_pair[0]:
                    # in_pair[1] / in_pair[2] are looked up in np_head to get
                    # the two heads whose gender/number/case are compared
                    # (presumably antecedent and anaphor — verify against _in_pair)
                    antc = np_head[in_pair[1]]
                    anph = np_head[in_pair[2]]
                    hf['c_agr'] = agreement(head_funcs[antc]['case'], head_funcs[anph]['case'])
                    hf['g_agr'] = agreement(head_funcs[antc]['gender'], head_funcs[anph]['gender'])
                    hf['n_agr'] = agreement(head_funcs[antc]['number'], head_funcs[anph]['number'])
                    hf['agreement'] = hf['g_agr'] and hf['c_agr'] and hf['n_agr']
                    hf['gn_agr'] = hf['g_agr'] and hf['n_agr']
                    hf['match'] = head_funcs[antc]['text'] == head_funcs[anph]['text']
                    head_funcs[token.id] = hf
    # names of the features that are printed
    func = 'POS isNomn isNpro g_agr n_agr c_agr agreement verbs oids conjs nouns'.split()
    for i, f in enumerate(pairs[0].keys()):
        hhf = head_funcs[np_head[f]]
        for a in pairs[0][f]:
            ana = head_funcs[np_head[a]]
            # header row: printed while processing the first key of pairs[0]
            if not i:
                print 'id' + '\t' + '\t'.join(str(x[0]) for x in sorted(filter(lambda x: x[0] in func, hhf.items() + ana.items()), key=lambda x: x[0]))
            # data row: "<f>_<a>" followed by the selected feature values of
            # both heads, sorted by feature name
            print str(f) + '_' + str(a) + '\t' + \
                '\t'.join(str(x[1]) for x in sorted(filter(lambda x: x[0] in func, hhf.items() + ana.items()), key=lambda x: x[0]))
def cut_long_dialogs(data_path:str, output_path:str, max_len:int=128):
    """Cut every dialog with ``cut_dialog`` and save the non-empty results.

    Each corpus line is one dialog whose parts are separated by ``|``.
    ``cut_dialog`` is applied with ``max_len``; dialogs that come back empty
    are dropped before saving to ``output_path``.
    """
    print('Reading data...')
    raw_dialogs = read_corpus(data_path)

    print('Splitting dialogs...')
    split_dialogs = [line.split('|') for line in raw_dialogs]

    print('Cutting data...')
    cut_dialogs = []
    for parts in tqdm(split_dialogs):
        cut_dialogs.append(cut_dialog(parts, max_len))
    num_dialogs_before = len(cut_dialogs)

    print('Saving data...')
    kept = [dialog for dialog in cut_dialogs if len(dialog)]
    save_corpus(kept, output_path)

    print('Done! Num dialogs reduced: {} -> {}'.format(num_dialogs_before, len(kept)))
def dialogs_from_lines(input_data_path:str, output_data_path:str, n_lines: int, eos:str, n_dialogs:int):
    """Build dialogs as sliding windows of ``n_lines`` consecutive corpus lines.

    Only the first ``n_dialogs * n_lines`` lines are used.  Each window is
    joined with ``eos`` and the resulting dialogs are saved to
    ``output_data_path``.
    """
    n_lines, n_dialogs = int(n_lines), int(n_dialogs) # TODO: argparse?

    print('Reading data...')
    lines = read_corpus(input_data_path)[:n_dialogs * n_lines]

    print('Generating dialogs')
    # NOTE(review): the last full window (starting at len(lines) - n_lines)
    # is not produced by this range — confirm whether that is intended.
    window_starts = range(len(lines) - n_lines)
    dialogs = [eos.join(lines[start:start + n_lines]) for start in window_starts]

    print('Saving corpus')
    save_corpus(dialogs, output_data_path)

    print('Done!')
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line

    NOTE(review): the visible body only prepares data, model and device;
    several locals (train_data, dev_data, clip_grad, ...) are not used
    here, so the function presumably continues beyond this view.
    """
    # Parallel training and validation corpora
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    # Pair every source sentence with its target sentence
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # Numeric options arrive as strings and are converted here
    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    # A non-zero --uniform-init re-initializes every parameter uniformly in
    # [-uniform_init, +uniform_init]
    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # One entry per target-vocabulary word, with the '<pad>' entry zeroed
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    The top hypothesis of every sentence is detokenized with a
    ``TreebankWordDetokenizer`` before being written to ``args['OUTPUT_FILE']``.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # Built once instead of once per sentence: the detokenizer and its
    # compiled pattern do not depend on the loop variable.
    detokenizer = TreebankWordDetokenizer()
    # replace '--' by itself, i.e. leave double dashes untouched
    detokenizer.DOUBLE_DASHES = (re.compile(r'--'), r'--')

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = detokenizer.detokenize(top_hyp.value)
            f.write(hyp_sent + '\n')
def main():
    """Parse the command line, build the vocabulary and RNNLM, and start training.

    The vocabulary is written to ``<output>.vocab``, one word per line.  When
    ``--output`` is not given, a ``/tmp/model<N>`` name with a random number
    is used.
    """
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)

    parser = argparse.ArgumentParser()
    for positional in ('corpus', 'dev_corpus'):
        parser.add_argument(positional)
    for option, default in (('--layers', 1), ('--emb_dim', 128),
                            ('--hidden_dim', 128), ('--minibatch_size', 1)):
        parser.add_argument(option, type=int, default=default)
    for flag in ('--tied', '--autobatch'):
        parser.add_argument(flag, action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if not args.output:
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    # train and dev corpora are read with the same vocabulary object
    vocab = Vocabulary()
    train_corpus = read_corpus(args.corpus, vocab)
    dev_corpus = read_corpus(args.dev_corpus, vocab)
    print('Vocab size:', len(vocab), file=sys.stderr)

    with open(args.output + '.vocab', 'w') as vocab_file:
        for word in vocab.i2w:
            print(word, file=vocab_file)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab), args.tied)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)

    harness.train(model, train_corpus, dev_corpus, optimizer, args)
def multi_parameter_tuning(args):
    """Grid-search learning rate, hidden size and learning-rate decay.

    For every combination a model is trained with ``run.train`` and saved
    under a name derived from the hyper-parameters; its perplexity on the
    dev set is recorded, and the name of the model with the lowest
    perplexity is printed at the end.

    args: mapping of training options; must contain ``dev_source`` and
          ``dev_target``.  It is copied for every run and never modified.
    """
    import itertools  # local import: keeps the new dependency inside this block

    lrs = [1e-2, 1e-3, 5e-3, 1e-4, 5e-4]
    hidden_sizes = [128, 256, 512]
    lr_decays = [0.9, 0.7, 0.5]
    valid_metric = {}  # stores the dev perplexity of every trained model
    dev_data_src = read_corpus(args['dev_source'], source='src')
    dev_data_tgt = read_corpus(args['dev_target'], source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    grid = itertools.product(lrs, hidden_sizes, lr_decays)
    for trial, (lr, hidden_size, lr_decay) in enumerate(grid):
        print('第%d次测试================================================='%trial)
        # Work on a copy so the caller's options are not overwritten
        arg_test = dict(args)
        arg_test['lr'], arg_test['hidden_size'], arg_test['lr_decay'] = lr, hidden_size, lr_decay
        arg_test['save_to'] = 'model_'+'lr_'+str(lr)+'hd_size_'+str(hidden_size)+'lr_dys_'+str(lr_decay)+'.bin'
        run.train(arg_test)
        model = NMT.load(arg_test['save_to'])
        dev_ppl = run.evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
        valid_metric[arg_test['save_to']] = dev_ppl
        print(arg_test['save_to'],' validation: iter %d, dev. ppl %f' % (trial, dev_ppl), file=sys.stderr)

    # key must be the bound method itself; calling .get() raised TypeError
    best_model = min(valid_metric, key=valid_metric.get)
    print('best_model is %s ,ppl is %f'%(best_model, valid_metric[best_model]))
def _length_normalised_losses(model_path, sentences):
    """Return, per sentence, the loss under the model at `model_path` divided by the sentence length."""
    print(f"load model from {model_path}", file=sys.stderr)
    model = NMT.load(model_path)
    # Replace the encoder's dropout with a zero-probability one for scoring.
    model.encoder.dropout = nn.Dropout(0.)

    losses = []
    with torch.no_grad():
        for sent in tqdm(sentences, desc='Decoding', file=sys.stdout):
            loss = model([sent]).item()
            losses.append(loss / len(sent))
    return losses


def decode(args: Dict[str, str]):
    """Filter the test sentences by the difference of their scores under two models.

    Every sentence of ``args['TEST_SOURCE_FILE']`` is scored (loss divided by
    sentence length) by the model at ``args['MODEL_PATH_I']`` and by the model
    at ``args['MODEL_PATH_N']``. Sentences whose score difference (I minus N)
    is below a fixed threshold of 2.0 are written, one per line, to
    ``args['OUTPUT_FILE']``; the number of selected sentences is printed.

    NOTE(review): the I/N suffixes presumably stand for in-domain and
    non-domain models -- confirm against the caller.
    """
    threshold = 2.0

    test_data = read_corpus(args['TEST_SOURCE_FILE'], source='src')

    ces_I = _length_normalised_losses(args['MODEL_PATH_I'], test_data)
    ces_N = _length_normalised_losses(args['MODEL_PATH_N'], test_data)

    selected = 0
    with open(args['OUTPUT_FILE'], 'w') as f:
        for words, ce_I, ce_N in zip(test_data, ces_I, ces_N):
            if ce_I - ce_N < threshold:
                selected += 1
                # Drop the first and last token, then re-insert a space in
                # front of every "▁" marker to rebuild the text line.
                inner = words[1:-1]
                sent = "".join(inner).replace("▁", " ▁").strip()
                f.write(sent + '\n')

    print("%d out of %d sentences selected." % (selected, len(test_data)))
def save(self, file_path):
    """Write the source and target word-to-id maps to `file_path` as JSON.

    The file holds one object with the keys ``src_word2id`` and
    ``tgt_word2id``, indented by two spaces.
    """
    payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
    # The context manager closes (and flushes) the handle even if
    # serialisation fails; the handle was previously never closed explicitly.
    with open(file_path, 'w') as f:
        json.dump(payload, f, indent=2)


@staticmethod
def load(file_path):
    """Rebuild a ``Vocab`` from a JSON file with ``src_word2id`` / ``tgt_word2id`` keys."""
    with open(file_path, 'r') as f:
        entry = json.load(f)
    src_word2id = entry['src_word2id']
    tgt_word2id = entry['tgt_word2id']

    return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))


def __repr__(self):
    """Return a short summary with the sizes of the source and target vocabularies."""
    return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


if __name__ == '__main__':
    # Script mode: build a vocabulary from the training corpora and save it.
    args = docopt(__doc__)

    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
try: a = stat[i][distance][context] except KeyError: continue bestrule = curr_rule top_rules[bestrule] = a return top_rules, bestscore, a def random_choice(corpus): for s in corpus: for token in s: try: if token.has_ambig(): token.disambiguate(random.choice(token.getPOStags().split('_'))) except: pass if __name__ == '__main__': inc = read_corpus(sys.stdin) #s = context_stats(inc, join_context=True) #print s #rs = scores(s, []) #pprint(rs[1:3]) '''for x in rs[1].keys(): print x.display() print rs[1][x] print rs[2]''' r = Rule('ADVB NOUN', 'NOUN', (1, 'PNCT'), 0)
#idf = np.log(float(N)/appCorpus) weights = np.zeros(tf.shape, dtype='float') # init array for weights for i in range(0, tf.shape[0]): lengthD = np.sum(tf[i,:]) weights[i,:] = idf * tf[i,:] * (k1+1) / (tf[i,:] + k1 * (1 - b + b * lengthD/avgD)) return weights def run_model(model_name, vocab, tf): speech_info = utils.read_speech_info() N = len(speech_info) # number of docs in corpus weights = [] if model_name == "bm25": weights = bm25(tf) elif model_name == "tfidf": weights = tfidf(tf) top_words = {} for i in range(N): gen_tags = utils.get_tags(vocab, weights, i) top_words[speech_info[i]] = gen_tags return top_words if __name__ == '__main__': (vocab,tf) = utils.read_corpus() #weights = tfidf(tf) #topWords = utils.get_tags(vocab, weights, 0) # get top 20 words for Obama's Acceptance Speech #print topWords run_model("tfidf", vocab, tf) #speech_info = utils.read_speech_info() #for si in speech_info: #print speech_info[si]
def train(args: Dict):
    """Train an NMT model by maximum likelihood with periodic dev-set validation.

    Args:
        args: option mapping. Keys read here: ``--train-src``, ``--train-tgt``,
            ``--dev-src``, ``--dev-tgt``, ``--batch-size``, ``--clip-grad``,
            ``--valid-niter``, ``--log-every``, ``--save-to``, ``--vocab``,
            ``--embed-size``, ``--hidden-size``, ``--dropout``,
            ``--input-feed``, ``--label-smoothing``, ``--uniform-init``,
            ``--cuda``, ``--lr``, ``--patience``, ``--max-num-trial``,
            ``--lr-decay`` and ``--max-epoch``.

    Every ``--valid-niter`` iterations the dev perplexity is computed. A new
    best score saves the model and the optimizer state; otherwise a patience
    counter is increased. When patience is exhausted the best checkpoint is
    restored and the learning rate is multiplied by ``--lr-decay``. The
    process exits (status 0) after ``--max-num-trial`` such restarts or once
    ``--max-epoch`` is reached at a validation step; the function never
    returns normally.
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                input_feed=args['--input-feed'],
                label_smoothing=float(args['--label-smoothing']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            # (batch_size)
            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient (in-place variant; the un-suffixed name is deprecated)
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                # start a fresh reporting window
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                             cum_loss / cum_examples,
                                                                                             np.exp(cum_loss / cum_tgt_words),
                                                                                             cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                # Negated so that a larger metric always means a better model.
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            sys.exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr (after restoring, since the saved state
                        # carries the old learning rate)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    sys.exit(0)
def t(f):
    """Read a corpus from `f`, pass it to `n`, and return the first element of the result."""
    corpus = read_corpus(f)
    result = n(corpus)
    return result[0]
args = p.parse_args() #name = os.path.split(args.corpus)[1] name = args.corpus path = '.' write = False n = 0 #out = sys.stdout copyfile(name, '%s.orig' % name) out = open('%s.rules' % name, 'w') i = 0 best_rules = [] best_score = 0 orig = open(args.corpus, 'r') inc = list(read_corpus(orig)) orig.close() while True: context_freq = context_stats(inc, f=args.f) scores_rule = scores(context_freq, best_rules, f=args.f) #ss = scores_rule[0] best_rule = scores_rule[0] for r in best_rule.keys(): best_rules.append(r) best_score = scores_rule[1] applied = scores_rule[2] if best_score <= 0: output = open('%s.final' % name, 'w')