示例#1
0
def creat_word_rel_dict(r_file, *q_files):
    """Build a word Dictionary from question files and a relation file.

    Parameters
    ----------
    r_file : str
        Path to a pickled iterable of relation strings.
    *q_files : str
        Paths to pickled QA datasets; each element exposes a ``question``
        attribute holding a space-separated token string.

    Returns
    -------
    Dictionary
        Vocabulary seeded with the <unk>/<pad>/<start> special tokens,
        plus every question token and every relation word.
    """
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()

    # Add every whitespace-separated token from each question.
    for q_file in q_files:
        # `with` closes the handle promptly; the original leaked it via
        # pickle.load(open(...)).
        with open(q_file, 'rb') as f:
            qa_data = pickle.load(f)
        for data in qa_data:
            for token in data.question.split(' '):
                word_dict.add(token)
    print(len(word_dict))

    # Add relation words: skip the first 3 characters (presumably a fixed
    # namespace prefix such as "fb:" -- TODO confirm against the data),
    # then split each component on '.' and '_'.
    with open(r_file, 'rb') as f:
        rels = pickle.load(f)
    for rel in rels:
        for part in rel[3:].split('.'):
            for word in part.split('_'):
                word_dict.add(word)
    print(len(word_dict))
    return word_dict
示例#2
0
        # Record one comparison row per example: the tagged prediction vs.
        # the gold subject name, plus the similarity score and tokens.
        print_data.append({
            'Predicted Subject Name': tagged_subject_name,
            'Original Subject Name': row['subject_name'],
            'Normalized Edit Distance': normalized_edit_distance,
            'Question Tokens': row['question_tokens'],
        })

    # Rows where the prediction matched the gold name exactly (a normalized
    # edit distance of 1.0 indicates identical strings here).
    # NOTE(review): exact_match is not used in the visible code below --
    # presumably consumed or reported further down; verify.
    exact_match = [
        d for d in print_data if d['Normalized Edit Distance'] == 1.0
    ]

    # get the word dictionary
    # Seed the vocabulary with the standard special tokens plus "<e>",
    # which looks like an entity-marker token -- TODO confirm semantics.
    word_vocab = Dictionary()
    word_vocab.add_unk_token()
    word_vocab.add_pad_token()
    word_vocab.add_start_token()
    word_vocab.add_end_token()
    word_vocab.add("<e>")

    # Fold the tokens of each data split into the vocabulary; add_word is
    # defined elsewhere and presumably mutates word_vocab -- verify.
    add_word(df_dev)
    add_word(df_test)
    add_word(df_train)

    # Persist the vocabulary for later training/evaluation runs.
    torch.save(word_vocab, "../../data/vocab/word_vocab.pt")

    # get the training data and test data
    # First argument 128 appears to be a batch/chunk size -- TODO confirm;
    # each call writes one formatted .pt dataset file for its split.
    get_formatted_examples(128, '../../data/subject_recognition/dev.pt',
                           df_dev)
    get_formatted_examples(128, '../../data/subject_recognition/test.pt',
                           df_test)
    get_formatted_examples(128, '../../data/subject_recognition/train.pt',