def evaluate_files(infile, corr_file, sep=" "):
    test_data, corr_data = read_infile(infile), read_infile(corr_file)
    for elem in test_data:
        elem[1] = elem[1].split(sep)
    return evaluate(test_data, corr_data)
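
A minimal usage sketch (the file names are placeholders; read_infile and evaluate come from the surrounding project):

scores = evaluate_files("predictions.txt", "gold.txt", sep=" ")
print(scores)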
Example #2
from argparse import ArgumentParser

from read import read_infile
from training import measure_quality
from write import write_output

argument_parser = ArgumentParser()
argument_parser.add_argument("-g", "--gold", required=True)
argument_parser.add_argument("-p", "--pred", required=True)
argument_parser.add_argument("-s", "--sep", default="/")
argument_parser.add_argument("-S", "--pred_sep", default=None)
argument_parser.add_argument("-l", "--language", default=None)
argument_parser.add_argument("-o", "--output_file", default=None)

if __name__ == "__main__":
    args = argument_parser.parse_args()
    if args.pred_sep is None:
        args.pred_sep = args.sep
    gold_data = read_infile(args.gold, morph_sep=args.sep, language=args.language)
    pred_data = read_infile(args.pred, morph_sep=args.pred_sep, language=args.language)
    for i, (first, second) in enumerate(zip(gold_data, pred_data)):
        if first["word"] != second["word"]:
            raise ValueError(f"Incorrect input f{second} for instance f{i}, f{first} expected.")
    gold_labels = [elem["bmes_labels"] for elem in gold_data]
    pred_labels = [elem["bmes_labels"] for elem in pred_data]
    print(measure_quality(gold_labels, pred_labels, measure_last=False))
    if args.output_file is not None:
        words = [word_data["word"] for word_data in gold_data]
        write_output(words, gold_labels, pred_labels, args.output_file)
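
The argparse flags above drive a small evaluation script; a hypothetical invocation (the script name and file names are assumptions) could look like:

# python evaluate.py --gold gold.txt --pred pred.txt --sep "/" --output_file errors.txt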
Example #3
argument_parser.add_argument("--eval_every_n_batches", default=-1, type=int)

METRICS = ["accuracy", "P", "R", "F", "loss"]

def initialize_metrics():
    metrics = {key: 0 for key in METRICS + ["n", "n_batches"]}
    metrics.update({'labels': [], 'pred_labels': []})
    return metrics

def get_status(corr, pred):
    return ("T" if corr == pred else "F") + ("P" if corr else "N")


if __name__ == '__main__':
    args = argument_parser.parse_args()
    train_data = read_infile(args.train_file)
    dev_data = read_infile(args.dev_file)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, return_token_type_ids=True)
    model = AutoModel.from_pretrained(args.model_name)

    train_dataset = make_dataset(tokenizer, train_data, pos_label=args.pos_label, 
                                 answer_field=args.answer_field, 
                                 first_key=args.first_sentence,
                                 second_key=args.second_sentence,
                                 device="cuda:0")
    dev_dataset = make_dataset(tokenizer, dev_data, pos_label=args.pos_label, 
                               answer_field=args.answer_field, 
                               first_key=args.first_sentence,
                               second_key=args.second_sentence,
                               device="cuda:0")
Example #4
            if self.to_rerank:
                curr_scores = scores[start + active_indexes]
                active_indexes = active_indexes[np.argsort(curr_scores)]
            answer.append([data[i][0][j] for j in active_indexes])
        # print(answer[0])
        # sys.exit()
        return answer

    def rerank_with_lm(self, answer, test_data):
        # Pair each source entry's third field (source[2]) with the candidate
        # word forms (prediction[0]) produced for it.
        data_for_reranking = [
            ([x[0] for x in predictions], source[2])
            for source, predictions in zip(test_data, answer)
        ]
        reranked_predictions = self.rerank(data_for_reranking)
        new_answer = []
        # Reorder the original predictions to follow the reranked word order,
        # keeping the first matching prediction for every surviving word.
        for elem, filtered_words in zip(answer, reranked_predictions):
            new_elem = []
            for word in filtered_words:
                for prediction in elem:
                    if prediction[0] == word:
                        new_elem.append(prediction)
                        break
            new_answer.append(new_elem)
        return new_answer


if __name__ == "__main__":
    infile = "conll2018/task1/all/belarusian-train-medium"
    data = read_infile(infile)
    paradigm_checker = ParadigmChecker()
    paradigm_checker.train(data)
Example #5
    return config


def append_model_number(s, index):
    # Insert the model index before the file extension, if there is one.
    if "." in s:
        stem, suffix = s.rsplit(".", maxsplit=1)
        suffix = "." + suffix
    else:
        stem, suffix = s, ""
    return f"{stem}_{index}{suffix}"


if __name__ == "__main__":
    args = argument_parser.parse_args()
    # reading data
    train_data = read_infile(
        args.train_file, language=args.language,
        morph_sep=args.sep) if args.train_file is not None else None
    dev_data = read_infile(
        args.dev_file, language=args.language,
        morph_sep=args.sep) if args.dev_file is not None else None
    unimorph_data = read_unimorph_infile(
        args.unimorph_file,
        n=args.n_unimorph) if args.unimorph_file is not None else None
    test_data = read_infile(
        args.test_file, language=args.language,
        morph_sep=args.sep) if args.test_file is not None else dev_data
    # reading model config
    config = read_config(args.config_path)
    bert_params = {key: config[key] for key in ["vocab", "embeddings"]}
    dataset_params = {
        key: config[key]
Example #6
        for i in range(2, max_length + 1):
            lemma_ngrams = extract_ngrams(lemma, i)
            word_ngrams = extract_ngrams(word, i)
            has_bad_ngrams = 0
            for ngram in word_ngrams:
                total_ngram_count[i - 2] += 1
                if ngram not in source_ngrams[i - 2] and ngram not in lemma_ngrams:
                    bad_ngram_count[i - 2] += 1
                    has_bad_ngrams += 1
                    if has_bad_ngrams == min_bad_count[i - 2]:
                        bad_word_count[i - 2] += 1
                        # has_bad_ngrams = True
    for L in range(2, max_length + 1):
        print("{} bad {}-grams out of {}".format(bad_ngram_count[L - 2], L,
                                                 total_ngram_count[L - 2]))
        print("{} words with bad {}-grams out of {}".format(
            bad_word_count[L - 2], L, len(test_data)))


if __name__ == "__main__":
    languages = ["belarusian"]
    for language in languages:
        source_data = read_infile(
            os.path.join("conll2018", "task1", "all",
                         "{}-train-low".format(language)))
        dev_data = read_infile(
            os.path.join("conll2018", "task1", "all",
                         "{}-dev".format(language)))
        print(language)
        analyze_ngrams(source_data, dev_data, 4, [1, 2, 2])
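
extract_ngrams is not defined in this excerpt; a plausible stand-in, assuming it should return the contiguous character n-grams of a word, is:

def extract_ngrams(word, n):
    # All contiguous substrings of length n (empty for words shorter than n).
    return {word[i:i + n] for i in range(len(word) - n + 1)}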
Example #7
    def transform(self, data):
        return [self.transform_string(x) for x in data]

    def transform_string(self, x):
        # Walk the trie from the current start position; when the walk can no
        # longer be extended, emit the merged-symbol id ('#<id>') if the current
        # node is terminal, otherwise emit one raw character and retry from the
        # next position.
        x = ['BOW'] + list(x) + ['EOW']
        answer = []
        curr, pos, root_pos = 0, 0, 0
        while pos < len(x):
            child = self.trie_nodes[curr].get(x[pos])
            if child is None:
                if self.is_node_terminal[curr] is not None:
                    answer.append('#{}'.format(self.is_node_terminal[curr]))
                    root_pos = pos
                else:
                    answer.append(x[root_pos])
                    root_pos += 1
                curr, pos = 0, root_pos
            else:
                curr, pos = child, pos + 1
        if curr != 0:
            # The word ended in the middle of a match: flush the remaining
            # characters unchanged.
            answer.extend(x[root_pos:])
        return tuple(answer)

if __name__ == "__main__":
    data = read_infile("conll2018/task1/all/belarusian-train-medium")
    data = [x for elem in data for x in elem[:2]]
    pair_encoder = PairEncoder(min_count=50, max_number=50)
    pair_encoder.fit(data[:])
    for elem in data[:20]:
        print(elem, " ".join("_".join(pair_encoder.symbol_repr(x))
                             for x in pair_encoder.transform_string(elem)))
Example #8

KEYS = ["labels", "feats", "reverse", "bigrams"]

if __name__ == "__main__":
    config = tf.ConfigProto()
    # Allow this process to use at most 30% of the GPU memory.
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    kbt.set_session(tf.Session(config=config))
    languages = ["belarusian"]

    corr_dir = os.path.join("conll2018", "task1", "all")
    use_label, use_bigram_loss, reverse = True, True, True
    for language in languages:
        infile = os.path.join(corr_dir, "{}-train-{}".format(language, mode))
        data = read_infile(infile, for_lm=True)
        dev_file = os.path.join(corr_dir, "{}-dev".format(language))
        dev_data = read_infile(dev_file, for_lm=True)
        for (use_bigram_loss, use_feats) in itertools.product([False, True],
                                                              [False, True]):
            model = NeuralLM(use_bigram_loss=use_bigram_loss,
                             use_label=use_label,
                             use_feats=use_feats,
                             nepochs=30,
                             reverse=reverse)
            model.train(data, dev_data)
            answer = model.predict(dev_data, return_letter_scores=True)
            os.makedirs("dump", exist_ok=True)
            outfile = "probs"
            FLAGS = [use_label, use_feats, reverse, use_bigram_loss]
            for key, flag in zip(KEYS, FLAGS):
Example #9
        to_rerank_with_lm = val
    elif opt == "-s":
        evaluate_on_submission = True
if languages is None:
    languages = [elem.rsplit("-", maxsplit=2) for elem in os.listdir(corr_dir)]
    languages = [(elem[0], elem[2]) for elem in languages if len(elem) >= 3 and elem[1] == "train"]
params = read_params(args[0])
results = []
model_format_string = '{1}-{2}' if model_name is None else '{0}-{1}-{2}'
print(sorted(languages))
metrics = []
for language, mode in sorted(languages):
    print(language, mode)
    infile = os.path.join(corr_dir, "{}-train-{}".format(language, mode))
    test_file = os.path.join(corr_dir, "{}-dev".format(language))
    data, dev_data, test_data = read_infile(infile), None, read_infile(test_file)
    dev_data = test_data
    # data_for_alignment = [elem[:2] for elem in data]
    # aligner = Aligner(n_iter=1, separate_endings=True, init="lcs",
    #                   init_params={"gap": 2, "initial_gap": 3})
    # aligned_data = aligner.align(data_for_alignment, save_initial=False)
    filename = model_format_string.format(model_name, language, mode)
    load_file = os.path.join(load_dir, filename + ".json") if load_dir is not None else None
    if load_file and os.path.exists(load_file):
        inflector = load_inflector(load_file, verbose=0)
        for param in ["nepochs", "batch_size"]:
            value = params["model"].get(param)
            if value is not None:
                setattr(inflector, param, value)
    else:
        lm_dir = params.get("lm_dir")