def evaluate_files(infile, corr_file, sep=" "):
    """Score a file of predicted analyses against a gold file.

    Both files are loaded with ``read_infile``; the prediction field
    (``elem[1]``) is split on ``sep`` before scoring.

    :param infile: path to the file with predicted analyses
    :param corr_file: path to the file with gold (correct) analyses
    :param sep: separator between units in the prediction field
    :return: the result of ``evaluate`` on the two datasets
    """
    test_data = read_infile(infile)
    corr_data = read_infile(corr_file)
    for record in test_data:
        record[1] = record[1].split(sep)
    return evaluate(test_data, corr_data)
from argparse import ArgumentParser

from read import read_infile
from training import measure_quality
from write import write_output

# CLI for scoring predicted morpheme segmentations (BMES labels) against gold.
argument_parser = ArgumentParser()
argument_parser.add_argument("-g", "--gold", required=True)
argument_parser.add_argument("-p", "--pred", required=True)
argument_parser.add_argument("-s", "--sep", default="/")
argument_parser.add_argument("-S", "--pred_sep", default=None)
argument_parser.add_argument("-l", "--language", default=None)
argument_parser.add_argument("-o", "--output_file", default=None)

if __name__ == "__main__":
    args = argument_parser.parse_args()
    # By default, predictions use the same morph separator as the gold file.
    if args.pred_sep is None:
        args.pred_sep = args.sep
    gold_data = read_infile(args.gold, morph_sep=args.sep, language=args.language)
    pred_data = read_infile(args.pred, morph_sep=args.pred_sep, language=args.language)
    # The two files must list the same words in the same order.
    for i, (first, second) in enumerate(zip(gold_data, pred_data)):
        if first["word"] != second["word"]:
            # FIX: the original message had a stray literal `f` before each
            # placeholder (`f{second}` rendered as the letter "f" + the value).
            raise ValueError(f"Incorrect input {second} for instance {i}, {first} expected.")
    gold_labels = [elem["bmes_labels"] for elem in gold_data]
    pred_labels = [elem["bmes_labels"] for elem in pred_data]
    print(measure_quality(gold_labels, pred_labels, measure_last=False))
    if args.output_file is not None:
        words = [word_data["word"] for word_data in gold_data]
        write_output(words, gold_labels, pred_labels, args.output_file)
argument_parser.add_argument("--eval_every_n_batches", default=-1, type=int)

# Quality measures accumulated during training.
METRICS = ["accuracy", "P", "R", "F", "loss"]


def initialize_metrics():
    """Return a fresh metrics accumulator: zeroed counters plus empty label lists."""
    metrics = {key: 0 for key in METRICS + ["n", "n_batches"]}
    metrics.update({'labels': [], 'pred_labels': []})
    return metrics


def get_status(corr, pred):
    """Encode a (gold, prediction) pair as a two-letter status string.

    The first letter is "T" when the prediction matches the gold value,
    "F" otherwise; the second is "P" when the gold value is truthy, "N"
    otherwise.
    """
    match_letter = "T" if corr == pred else "F"
    sign_letter = "P" if corr else "N"
    return match_letter + sign_letter


if __name__ == '__main__':
    args = argument_parser.parse_args()
    train_data = read_infile(args.train_file)
    dev_data = read_infile(args.dev_file)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, return_token_type_ids=True)
    model = AutoModel.from_pretrained(args.model_name)
    # Both datasets are built with identical settings; only the raw data differs.
    shared_kwargs = dict(
        pos_label=args.pos_label, answer_field=args.answer_field,
        first_key=args.first_sentence, second_key=args.second_sentence,
        device="cuda:0",
    )
    train_dataset = make_dataset(tokenizer, train_data, **shared_kwargs)
    dev_dataset = make_dataset(tokenizer, dev_data, **shared_kwargs)
# NOTE(review): this chunk has lost its newlines/indentation. It contains:
# (1) the tail of an unseen method that, when self.to_rerank is set, reorders
#     the active candidate indices by their scores (np.argsort) before
#     collecting predictions into `answer` and returning it;
# (2) `rerank_with_lm`, which sends each instance's candidate words (plus
#     source[2] — presumably the feature/tag field, TODO confirm) through
#     self.rerank, then rebuilds each answer list in the reranked order,
#     keeping only the first stored prediction tuple per surviving word;
# (3) a `__main__` driver that trains a ParadigmChecker on a CoNLL-2018
#     SIGMORPHON shared-task file.
# Restore the original line structure before editing any logic here.
if self.to_rerank: curr_scores = scores[start + active_indexes] active_indexes = active_indexes[np.argsort(curr_scores)] answer.append([data[i][0][j] for j in active_indexes]) # print(answer[0]) # sys.exit() return answer def rerank_with_lm(self, answer, test_data): data_for_reranking = [ ([x[0] for x in predictions], source[2]) for source, predictions in zip(test_data, answer) ] reranked_predictions = self.rerank(data_for_reranking) new_answer = [] for elem, filtered_words in zip(answer, reranked_predictions): new_elem = [] for word in filtered_words: for prediction in elem: if prediction[0] == word: new_elem.append(prediction) break new_answer.append(new_elem) return new_answer if __name__ == "__main__": infile = "conll2018/task1/all/belarusian-train-medium" data = read_infile(infile) paradigm_checker = ParadigmChecker() paradigm_checker.train(data)
# NOTE(review): this chunk lost its newlines. The fragments that belong to
# definitions cut off outside the visible region are preserved below as
# comments so the complete definition in between stays readable.

# (fragment) tail of an unseen config-reading function:
#     return config


def append_model_number(s, index):
    """Insert ``_<index>`` before the extension of file name ``s``.

    ``("model.json", 2)`` -> ``"model_2.json"``; ``("model", 2)`` -> ``"model_2"``.

    FIX: the original concatenated ``stem + f"_{index}{suffix}"`` after
    ``rsplit(".")``, dropping the dot ("model.json" became "model_2json").
    """
    if "." in s:
        stem, suffix = s.rsplit(".", maxsplit=1)
        return f"{stem}_{index}.{suffix}"
    return f"{s}_{index}"


if __name__ == "__main__":
    args = argument_parser.parse_args()
    # reading data; each file is optional and loaded only when given
    train_data = read_infile(
        args.train_file, language=args.language, morph_sep=args.sep
    ) if args.train_file is not None else None
    dev_data = read_infile(
        args.dev_file, language=args.language, morph_sep=args.sep
    ) if args.dev_file is not None else None
    unimorph_data = read_unimorph_infile(
        args.unimorph_file, n=args.n_unimorph
    ) if args.unimorph_file is not None else None
    # fall back to the dev set when no separate test file is given
    test_data = read_infile(
        args.test_file, language=args.language, morph_sep=args.sep
    ) if args.test_file is not None else dev_data
    # reading model config
    config = read_config(args.config_path)
    bert_params = {key: config[key] for key in ["vocab", "embeddings"]}
    # (fragment) the chunk is cut off inside this dict literal:
    #     dataset_params = {
    #         key: config[key]
# NOTE(review): newlines/indentation were lost in this chunk. It holds the
# middle and tail of an n-gram analysis routine (the enclosing function header,
# where `lemma`, `word`, `max_length`, and the count arrays are introduced, is
# outside this view) plus a `__main__` driver. For each n in 2..max_length it
# counts word n-grams that occur neither in the source corpus nor in the lemma,
# then prints per-n totals; the driver runs `analyze_ngrams` on the Belarusian
# CoNLL-2018 low-resource train/dev split.
# NOTE: `has_bad_ngrams == min_bad_count[i - 2]` is deliberate, not a typo for
# `>=`: the counter grows by exactly 1 per bad n-gram, so the equality fires at
# most once — when the count first reaches the threshold — which counts each
# word with at least `min_bad_count` bad n-grams exactly once.
# Restore the original line structure before editing any logic here.
for i in range(2, max_length + 1): lemma_ngrams = extract_ngrams(lemma, i) word_ngrams = extract_ngrams(word, i) has_bad_ngrams = 0 for ngram in word_ngrams: total_ngram_count[i - 2] += 1 if ngram not in source_ngrams[i - 2] and ngram not in lemma_ngrams: bad_ngram_count[i - 2] += 1 has_bad_ngrams += 1 if has_bad_ngrams == min_bad_count[i - 2]: bad_word_count[i - 2] += 1 # has_bad_ngrams = True for L in range(2, max_length + 1): print("{} bad {}-grams out of {}".format(bad_ngram_count[L - 2], L, total_ngram_count[L - 2])) print("{} words with bad {}-grams out of {}".format( bad_word_count[L - 2], L, len(test_data))) if __name__ == "__main__": languages = ["belarusian"] for language in languages: source_data = read_infile( os.path.join("conll2018", "task1", "all", "{}-train-low".format(language))) dev_data = read_infile( os.path.join("conll2018", "task1", "all", "{}-dev".format(language))) print(language) analyze_ngrams(source_data, dev_data, 4, [1, 2, 2])
def transform(self, data):
    """Apply :meth:`transform_string` to every string in ``data``."""
    return [self.transform_string(item) for item in data]

def transform_string(self, x):
    """Greedily re-tokenize ``x`` with the trie of merged symbol pairs.

    The word is wrapped in 'BOW'/'EOW' markers and scanned left to right.
    The trie (``self.trie_nodes``) is followed as far as possible; when it
    stops, either the terminal id of the current node is emitted as
    '#<id>' (a merged unit) or the single symbol at the match start is
    emitted verbatim, and scanning resumes from the new start position.
    Any suffix still inside the trie when the input ends is flushed as
    plain symbols. Returns a tuple of emitted symbols.
    """
    symbols = ['BOW'] + list(x) + ['EOW']
    emitted = []
    node, pos, root_pos = 0, 0, 0
    while pos < len(symbols):
        child = self.trie_nodes[node].get(symbols[pos])
        if child is not None:
            # The trie accepts the next symbol: descend and keep matching.
            node, pos = child, pos + 1
            continue
        if self.is_node_terminal[node] is not None:
            # Longest match ends at a terminal node: emit the merged unit.
            emitted.append('#{}'.format(self.is_node_terminal[node]))
            root_pos = pos
        else:
            # No merged unit: emit one raw symbol and shift the window.
            emitted.append(symbols[root_pos])
            root_pos += 1
        node, pos = 0, root_pos
    if node != 0:
        # Input ended mid-match: flush the unconsumed tail verbatim.
        emitted.extend(symbols[root_pos:])
    return tuple(emitted)

if __name__ == "__main__":
    data = read_infile("conll2018/task1/all/belarusian-train-medium")
    data = [x for elem in data for x in elem[:2]]
    pair_encoder = PairEncoder(min_count=50, max_number=50)
    pair_encoder.fit(data[:])
    for elem in data[:20]:
        encoded = pair_encoder.transform_string(elem)
        print(elem, " ".join("_".join(pair_encoder.symbol_repr(x)) for x in encoded))
# NOTE(review): newlines/indentation were lost in this chunk. The leading
# "} }" closes a nested dict literal whose opening lies outside this view, and
# the chunk is cut off inside the final `for key, flag in zip(KEYS, FLAGS):`
# loop. In between is a `__main__` grid-search driver: it caps TF1 GPU memory
# at 30%, then for each language trains a NeuralLM for every combination of
# (use_bigram_loss, use_feats) and collects per-letter prediction scores.
# NOTE(review): `mode` is not defined anywhere in this visible scope — it must
# come from the part of the file above this chunk; confirm before running.
# NOTE(review): `"{}-dev".format(language, mode)` passes a second argument the
# format string never uses — harmless, but looks like a copy-paste leftover
# from the train-file line above it.
} } KEYS = ["labels", "feats", "reverse", "bigrams"] if __name__ == "__main__": config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.3 kbt.set_session(tf.Session(config=config)) languages = ["belarusian"] corr_dir = os.path.join("conll2018", "task1", "all") use_label, use_bigram_loss, reverse = True, True, True for language in languages: infile = os.path.join(corr_dir, "{}-train-{}".format(language, mode)) data = read_infile(infile, for_lm=True) dev_file = os.path.join(corr_dir, "{}-dev".format(language, mode)) dev_data = read_infile(dev_file, for_lm=True) for (use_bigram_loss, use_feats) in itertools.product([False, True], [False, True]): model = NeuralLM(use_bigram_loss=use_bigram_loss, use_label=use_label, use_feats=use_feats, nepochs=30, reverse=reverse) model.train(data, dev_data) answer = model.predict(dev_data, return_letter_scores=True) os.makedirs("dump", exist_ok=True) outfile = "probs" FLAGS = [use_label, use_feats, reverse, use_bigram_loss] for key, flag in zip(KEYS, FLAGS):
# NOTE(review): newlines/indentation were lost in this chunk. It starts inside
# a getopt-style option loop (the `elif opt == "-s"` branch) and is cut off
# inside the final `else:` branch, so both edges belong to code outside this
# view. The visible middle: discover (language, mode) pairs from the filenames
# in `corr_dir` when none were given explicitly, read run parameters, then for
# each pair load train/dev data and either restore a saved inflector from
# `load_dir` (overriding nepochs/batch_size from params) or fall through to
# building one (truncated).
# NOTE(review): in the comprehension filtering `languages`, `elem[1]` is
# evaluated BEFORE the `len(elem) >= 3` guard, so a directory entry without
# enough "-" separators (rsplit yields fewer than 2 parts) raises IndexError
# instead of being filtered out — confirm against the actual contents of
# `corr_dir`; the guard should come first.
# NOTE: `dev_data = test_data` makes the two names aliases of the same list.
to_rerank_with_lm = val elif opt == "-s": evaluate_on_submission = True if languages is None: languages = [elem.rsplit("-", maxsplit=2) for elem in os.listdir(corr_dir)] languages = [(elem[0], elem[2]) for elem in languages if elem[1] == "train" and len(elem) >= 3] params = read_params(args[0]) results = [] model_format_string = '{1}-{2}' if model_name is None else '{0}-{1}-{2}' print(sorted(languages)) metrics = [] for language, mode in sorted(languages): print(language, mode) infile = os.path.join(corr_dir, "{}-train-{}".format(language, mode)) test_file = os.path.join(corr_dir, "{}-dev".format(language)) data, dev_data, test_data = read_infile(infile), None, read_infile(test_file) dev_data = test_data # data_for_alignment = [elem[:2] for elem in data] # aligner = Aligner(n_iter=1, separate_endings=True, init="lcs", # init_params={"gap": 2, "initial_gap": 3}) # aligned_data = aligner.align(data_for_alignment, save_initial=False) filename = model_format_string.format(model_name, language, mode) load_file = os.path.join(load_dir, filename + ".json") if load_dir is not None else None if load_file and os.path.exists(load_file): inflector = load_inflector(load_file, verbose=0) for param in ["nepochs", "batch_size"]: value = params["model"].get(param) if value is not None: inflector.__setattr__(param, value) else: lm_dir = params.get("lm_dir")