예제 #1
0
                break

            # if id < 200000:
            #     output_dir = config['baseline_data_path'] + '/maui/ke20k/train(200k)/'
            #     if not os.path.exists(output_dir):
            #         os.makedirs(output_dir)
            #     with open(output_dir+ str(id) + '.txt', 'w') as rf:
            #         rf.write(record['title']+' \n '+record['abstract'])
            #     with open(output_dir+ str(id) + '.key', 'w') as rf:
            #         for k in record['keyword'].split(';'):
            #             rf.write('%s\t1\n' % k)


if __name__ == '__main__':
    # Script entry point: build the full-keyphrase experiment configuration
    # and export the dataset in the format expected by the Maui extractor.
    # NOTE(review): the `config` module is rebound here to the config dict it
    # returns — confirm nothing later in this file still needs the module.
    # config = config.setup_keyphrase_all()
    config = config.setup_keyphrase_all()

    export_data_for_maui()

    '''
    examine the data
    '''
    # Commented-out alternative pipeline: load train/test sets plus the
    # word<->index vocabularies, pickle them, and dump a sorted word-frequency
    # list for manual inspection. Kept for reference; not executed.
    # start_time = time.clock()
    # train_set, test_set, idx2word, word2idx = load_data_and_dict(config['training_dataset'], config['testing_dataset'])
    # serialize_to_file([train_set, test_set, idx2word, word2idx], config['dataset'])
    # print('Finish processing and dumping: %d seconds' % (time.clock()-start_time))
    #
    # # export vocabulary to file for manual check
    # wordfreq = sorted(wordfreq.items(), key=lambda a: a[1], reverse=True)
    # serialize_to_file(wordfreq, config['voc'])
    # with open(config['path']+'/dataset/keyphrase/voc_list.json', 'w') as voc_file:
예제 #2
0
        :return:
        '''
        count_1 = 0
        count_5 = 0
        count_10 = 0
        total = 0
        max = 0.
        with open(filename, 'r') as f:
            for line in f:
                list = numpy.array(json.loads(line))
                list = list.ravel()
                for e in list:
                    total += 1
                    if abs(e) > 1:
                        count_1 += 1
                    if abs(e) > 5:
                        count_5 += 1
                    if abs(e) > 10:
                        count_10 += 1
                        print(e)
                    if abs(e) > max:
                        max = abs(e)
                        print('new max = %f' % e)
        print('total = %d' % total)
        print('count < 1/5/10 = %d / %d / %d' % (count_1, count_5, count_10))
        print('max = %f' % max)

if __name__ == '__main__':
    # Script entry point: build the experiment config (swap in
    # setup_keyphrase_inspec for the Inspec-only variant, per the inline
    # comment), then load the model's weights from the JSON dump.
    # NOTE(review): rebinds the `config` module to the returned config dict.
    config = config.setup_keyphrase_all()  # setup_keyphrase_inspec
    agent = Model()
    agent.load_weight_json(config['weight_json'])
예제 #3
0
def check_data():
    """For each configured test dataset, count the ground-truth keyphrases
    and how many of them survive config['target_filter'] based on whether
    the (stemmed) phrase literally appears in the (stemmed) document text.

    Prints per-dataset totals; returns nothing.
    """
    config = setup_keyphrase_all()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)

        number_groundtruth = 0          # all target phrases seen for this dataset
        number_present_groundtruth = 0  # targets kept after filtering

        loader = testing_data_loader(dataset_name,
                                     kwargs=dict(basedir=config['path']))

        # NUS documents are loaded abstract-only; other corpora use full text.
        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()

        # NOTE(review): loop variable `id` shadows the builtin of the same name.
        for id, doc in enumerate(docs):

            # Tokenize title + body as one sequence so phrases spanning either
            # part can be matched.
            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' +
                                                   doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]
            print('[%d] length= %d' % (id, len(doc.text)))

            # Stem and lowercase the document tokens so the comparison below
            # happens in stemmed space on both sides.
            stemmed_input = [
                stemmer.stem(t).strip().lower() for t in text_tokens
            ]

            # Re-join the gold phrases with ';' and re-split them through
            # process_keyphrase, then stem every word of every target phrase.
            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target]
                       for target in phrases]

            present_targets = []

            for target in targets:
                keep = True
                # whether do filtering on groundtruth phrases. if config['target_filter']==None, do nothing
                # Tri-state match flag: True = phrase found in the document,
                # False = at least one window tried but none matched,
                # None = no window was ever tried (input shorter than target).
                match = None
                # Naive sliding-window search of `target` over stemmed_input.
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    # Relies on `j` leaking out of the inner loop: the window
                    # matched fully iff we reached the last index without
                    # setting match = False.
                    # NOTE(review): raises NameError if a target is empty
                    # (j never bound on the first window) — presumably
                    # process_keyphrase never yields empty phrases; confirm.
                    if j == len(target) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if match and 'appear-only', keep this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if not match and 'appear-only', discard this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # if not match and 'non-appear-only', keep this phrase
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True
                # NOTE(review): when match is still None (target longer than
                # the document), neither branch runs and the phrase is kept
                # regardless of the filter — confirm this is intended.

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth=' + str(number_groundtruth))
        print('number_present_groundtruth=' + str(number_present_groundtruth))
        '''
예제 #4
0
    for i in range(len(source_text) - 1):
        for j in range(i + 1, len(source_text)):
            if j - i > max_len:
                continue
            if j - i == 1 and (source_text[i:j] == '<digit>'
                               or len(source_text[i:j][0]) == 1):
                continue
            tagseq = ''.join(source_postag[i:j])
            if re.match(np_regex, tagseq):
                np_list.append((source_text[i:j], source_postag[i:j]))

    print('Text: \t\t %s' % str(source_text))
    print('None Phrases:[%d] \n\t\t\t%s' %
          (len(np_list),
           str('\n\t\t\t'.join(
               [str(p[0]) + '[' + str(p[1]) + ']' for p in np_list]))))

    return np_list


if __name__ == '__main__':
    # Load the preprocessed SemEval test split and run noun-phrase candidate
    # extraction over every document's POS-tagged token sequence.
    config = setup_keyphrase_all()

    # Assemble the pickle path once instead of inline in the call.
    pickle_path = (config['path'] +
                   '/dataset/keyphrase/' +
                   config['data_process_name'] +
                   'semeval.testing.pkl')
    test_set = db.deserialize_from_file(pickle_path)

    # tagged_source holds (token, tag) pairs per document; keep only the tag
    # sequences to walk alongside the raw token strings.
    tag_sequences = [[pair[1] for pair in doc]
                     for doc in test_set['tagged_source']]

    # s_index is unpacked for parity with the dataset fields but unused.
    for s_index, s_str, s_tag in zip(test_set['source'],
                                     test_set['source_str'],
                                     tag_sequences):
        get_none_phrases(s_str, s_tag, config['max_len'])