示例#1
0
def show_instances(class_name='', name='train'):
    """Print every sentence whose triples fall into the requested category.

    The raw dataset file is parsed, tokenized and converted to ids with the
    project's ``utils`` helpers; each sentence whose triples satisfy the
    selected predicate is printed (as words) followed by its id triples and
    a separator line.

    Args:
        class_name: Which predicate to filter with. ``'normal'`` selects
            ``utils.is_normal_triple``, ``'single_entity_overlap'`` selects
            ``utils.is_over_lapping``; any other value (including the
            default ``''``) selects ``utils.is_multi_label``.
        name: Which raw file to read, ``'train'`` (default, the value that
            used to be hard-coded) or ``'dev'``.

    Raises:
        ValueError: If ``name`` is neither ``'train'`` nor ``'dev'``.
    """
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    else:
        raise ValueError("name must be 'train' or 'dev', got %r" % (name,))

    # Context manager guarantees the handle is closed even if reading fails.
    with open(filename, 'r') as f:
        content = f.readlines()
    html_doc = ' '.join(content)

    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    # Return value is unused; presumably called for the statistics it
    # reports -- TODO confirm against utils.triples_type.
    utils.triples_type(sentence_triples_id)

    if class_name == 'normal':
        func = utils.is_normal_triple
    elif class_name == 'single_entity_overlap':
        func = utils.is_over_lapping
    else:
        func = utils.is_multi_label

    # Invert the vocabulary so word ids can be rendered as text again.
    words2id = utils.load_words2id()
    id2words = {v: k for k, v in words2id.items()}
    for sent_words_id, triples_id in zip(sentences_word_id,
                                         sentence_triples_id):
        if func(triples_id, is_relation_first=False):
            # Single-argument print(...) behaves identically on Python 2
            # and Python 3.
            print(' '.join([id2words[x] for x in sent_words_id]))
            print(triples_id)
            print('-----------------------------------')
示例#2
0
def prepare(name):
    """Convert one raw dataset file into id-encoded JSON (and NYT-style) files.

    The raw file is parsed, tokenized, aligned with entity positions and
    converted to ids through the project's ``utils`` helpers. What is
    written depends on ``name``:

    * ``'train'``: the data is split with ``utils.split`` into a validation
      and a training part; both are dumped to ``Const.valid_filename`` /
      ``Const.train_filename`` and exported with
      ``utils.instances2nyt_style``.
    * ``'dev'``: dumped to ``Const.dev_filename`` and exported to
      ``Const.nyt_style_raw_test_filename``.
    * ``'example'``: dumped to ``Const.example_filename`` only.

    Args:
        name: One of ``'train'``, ``'dev'`` or ``'example'``.

    Raises:
        ValueError: If ``name`` is not one of the supported dataset names
            (previously this surfaced as an UnboundLocalError when opening
            the file).
    """
    print(name)
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    elif name == 'example':
        filename = Const.origin_example_filename
    else:
        raise ValueError(
            "name must be 'train', 'dev' or 'example', got %r" % (name,))

    print(Const.triple_len)
    # Context manager closes the input handle deterministically.
    with open(filename, 'r') as f:
        print(filename)
        content = f.readlines()
    html_doc = ' '.join(content)

    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)

    if name == 'train':
        # Split the train file into a validation set and a train set.
        valid_part, train_part = utils.split(sentences_word_id,
                                             sentence_triples_id)
        valid_sentences_word_id, valid_sentence_triples_id = valid_part
        train_sentences_word_id, train_sentence_triples_id = train_part

        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)

        # Each output file is closed (and therefore flushed) before moving
        # on, instead of relying on garbage collection.
        with open(Const.train_filename, 'w') as out:
            json.dump([train_sentences_word_id, train_sentence_triples_id],
                      out)
        with open(Const.valid_filename, 'w') as out:
            json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                      out)

        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        with open(Const.dev_filename, 'w') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        # Only 'example' can reach this branch after the validation above.
        utils.triples_type(sentence_triples_id)
        with open(Const.example_filename, 'w') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
示例#3
0
def prepare(name):
    """Convert one raw dataset file into id-encoded JSON (and NYT-style) files.

    The result of ``utils.parse`` is cached in ``<name>_parse.json`` in the
    current working directory: it is computed and written when that file
    does not exist, and loaded from it otherwise. The parsed data is then
    tokenized, aligned with entity positions and converted to ids through
    the project's ``utils`` helpers. What is written depends on ``name``:

    * ``'train'``: the data is split with ``utils.split`` into a validation
      and a training part; both are dumped to ``Const.valid_filename`` /
      ``Const.train_filename`` and exported with
      ``utils.instances2nyt_style``.
    * ``'dev'``: dumped to ``Const.dev_filename`` and exported to
      ``Const.nyt_style_raw_test_filename``.
    * ``'example'``: dumped to ``Const.example_filename`` only.

    Args:
        name: One of ``'train'``, ``'dev'`` or ``'example'``.

    Raises:
        ValueError: If ``name`` is not one of the supported dataset names
            (previously this surfaced as an UnboundLocalError when opening
            the file).
    """
    print(name)
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    elif name == 'example':
        filename = Const.origin_example_filename
    else:
        raise ValueError(
            "name must be 'train', 'dev' or 'example', got {!r}".format(name))

    print(Const.triple_len)
    # Context manager closes the input handle deterministically.
    with open(filename, 'r', encoding='utf-8') as f:
        print(filename)
        content = f.readlines()
    html_doc = ' '.join(content)

    # Reuse the cached parse result when present; otherwise parse and cache.
    cache_path = name + '_parse.json'
    if not os.path.isfile(cache_path):
        sentences_string, triples_string = utils.parse(html_doc)
        with open(cache_path, 'w', encoding='utf-8') as cache:
            json.dump([sentences_string, triples_string], cache)
    else:
        with open(cache_path, 'r', encoding='utf-8') as cache:
            sentences_string, triples_string = json.load(cache)

    sentences_words = utils.sentence_tokenize(sentences_string)
    # Record the index of the last word of each entity:
    # (e1_end, e2_end, relation).
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    # Convert sentences and relation types to ids.
    sentences_word_id, sentence_triples_id = utils.turn2id(sentences_words,
                                                           position_triples)

    if name == 'train':
        # Split the train file into a validation set and a train set.
        valid_part, train_part = utils.split(sentences_word_id,
                                             sentence_triples_id)
        valid_sentences_word_id, valid_sentence_triples_id = valid_part
        train_sentences_word_id, train_sentence_triples_id = train_part

        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)

        # Each output file is closed (and therefore flushed) before moving
        # on, instead of relying on garbage collection.
        with open(Const.train_filename, 'w', encoding='utf-8') as out:
            json.dump([train_sentences_word_id, train_sentence_triples_id],
                      out)
        with open(Const.valid_filename, 'w', encoding='utf-8') as out:
            json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                      out)

        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        with open(Const.dev_filename, 'w', encoding='utf-8') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        # Only 'example' can reach this branch after the validation above.
        utils.triples_type(sentence_triples_id)
        with open(Const.example_filename, 'w', encoding='utf-8') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)