Example #1
    def __call__(self, idx2word, word2idx, type=1):
        self.get_docs()

        pairs = []

        for doc in self.doclist:
            try:
                title = utils.get_tokens(doc.title, type)
                text = utils.get_tokens(doc.text, type)
                if type == 0:
                    title.append('<eos>')
                elif type == 1:
                    title.append('.')

                title.extend(text)
                text = title

                # truncate: many texts are too long and would lead to out-of-memory errors
                if len(text) > 1500:
                    text = text[:1500]

                keyphrases = [utils.get_tokens(k, type) for k in doc.phrases]
                pairs.append((text, keyphrases))

            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % doc.name)
            # print(text)
            # print(keyphrases)
            # print('*'*50)
        dataset = utils.build_data(pairs, idx2word, word2idx)

        return dataset, self.doclist
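
A minimal usage sketch for the method above, assuming `__call__` belongs to a document-loader class whose `get_docs()` fills `self.doclist`; the class name `KeyphraseCorpusLoader` and the surrounding variables are hypothetical:

# Usage sketch -- 'KeyphraseCorpusLoader' is a hypothetical stand-in for the class
# that owns the __call__ above; idx2word/word2idx are a previously built vocabulary.
loader = KeyphraseCorpusLoader(basedir=config['path'])
dataset, docs = loader(idx2word, word2idx, type=1)   # type=1 appends '.' between title and text
print('%d documents mapped to index sequences' % len(docs))
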
def load_additional_testing_data(testing_names, idx2word, word2idx, config, postagging=True, process_type=1):
    test_sets           = {}

    # load each testing dataset from its cached pickle if present, otherwise build and serialize it
    for dataset_name in testing_names:

        test_set_path = config['path'] + '/dataset/keyphrase/' + config['data_process_name'] + dataset_name + '.testing.pkl'

        if os.path.exists(test_set_path):
            test_set = deserialize_from_file(test_set_path)
            print('Loading testing dataset %s from %s' % (dataset_name, test_set_path))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, test_set_path))
            dataloader          = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records             = dataloader.get_docs()
            records, pairs, _   = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set            = utils.build_data(pairs, idx2word, word2idx)

            test_set['record']  = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source']   = [[t[1] for t in s] for s in tagged_sources]

                if getattr(dataloader, 'text_postag_dir', None) is not None:
                    print('Exporting postagged data to %s' % (dataloader.text_postag_dir))
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(dataloader.text_postag_dir + '/' + r_['name'] + '.txt', 'w') as f:
                            output_str = ' '.join([w+'_'+t for w,t in s_])
                            f.write(output_str)
                else:
                    print('text_postag_dir not found, no export of postagged data')

            serialize_to_file(test_set, test_set_path)

        test_sets[dataset_name] = test_set

    return test_sets
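
A hedged call-site example for load_additional_testing_data, assuming config carries the keys used above ('path', 'data_process_name', 'testing_datasets') and that idx2word/word2idx were built beforehand:

# Usage sketch (config, idx2word and word2idx are assumed to be in scope)
test_sets = load_additional_testing_data(config['testing_datasets'],
                                         idx2word, word2idx, config,
                                         postagging=True, process_type=1)
for name, test_set in test_sets.items():
    print('%s: %d test documents' % (name, len(test_set['record'])))
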
def load_data_and_dict(training_dataset):
    '''
    Note: the dictionary is built on both the training and testing datasets, which may not be
    appropriate since testing data should remain unseen.
    :param training_dataset: path to the training dataset (a JSON file)
    :return: train_set, validation_set, test_set, idx2word, word2idx
    '''
    # load training dataset
    print('Loading training dataset')
    with open(training_dataset, 'r') as f:
        training_records = json.load(f)
    title_dict = dict([(r['title'].strip().lower(), r)
                       for r in training_records])
    print('#(Training Data)=%d' % len(title_dict))

    # load testing dataset
    print('Loading testing dataset')
    # only these datasets may overlap with the training data
    testing_names = config['testing_datasets']
    testing_records = {}

    # drop training records whose titles also appear in the testing datasets: 'inspec', 'krapivin', 'nus', 'semeval'
    print('Filtering testing dataset from training data')
    for dataset_name in testing_names:
        print(dataset_name)

        testing_records[dataset_name] = testing_data_loader(
            dataset_name, kwargs=dict(basedir=config['path'])).get_docs()

        for r in testing_records[dataset_name]:
            title = r['title'].strip().lower()
            if title in title_dict:
                title_dict.pop(title)

    print('Process the data')
    training_records, train_pairs, wordfreq = dataset_utils.load_pairs(
        title_dict.values(), do_filter=True)
    print('#(Training Data after Filtering Noises)=%d' % len(training_records))

    print('Preparing development data')
    training_records = numpy.asarray(training_records)
    train_pairs = numpy.asarray(train_pairs)
    # keep a copy of validation data
    if 'validation_id' in config and os.path.exists(config['validation_id']):
        validation_ids = deserialize_from_file(config['validation_id'])
        validation_records = training_records[validation_ids]
        serialize_to_file(
            validation_records, config['path'] + '/dataset/keyphrase/' +
            config['data_process_name'] + 'validation_record_' +
            str(config['validation_size']) + '.pkl')
        exit()
    else:
        validation_ids = numpy.random.randint(0, len(training_records),
                                              config['validation_size'])
        serialize_to_file(validation_ids, config['validation_id'])

    validation_records = training_records[validation_ids]
    validation_pairs = train_pairs[validation_ids]
    training_records = numpy.delete(training_records, validation_ids, axis=0)
    train_pairs = numpy.delete(train_pairs, validation_ids, axis=0)

    #
    # target_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/ke20k/train/'
    # for r_id, r in enumerate(validation_records):
    #     with open(target_dir+r_id+'.txt', 'w') as textfile:
    #         textfile.write(r.title+'\n'+r.text)
    #     with open(target_dir + r_id + '.key', 'w') as phrasefile:
    #         for p in r.phrases:
    #             phrasefile.write('%s\t1\n' % p)

    print('#(Training Data after Filtering Validate & Test data)=%d' %
          len(train_pairs))

    print('Preparing testing data KE20k')
    # keep a copy of testing data
    if 'testing_id' in config and os.path.exists(config['testing_id']):
        testing_ids = deserialize_from_file(config['testing_id'])
        testing_ids = list(filter(lambda x: x < len(training_records), testing_ids))
    else:
        testing_ids = numpy.random.randint(0, len(training_records),
                                           config['validation_size'])
        serialize_to_file(testing_ids, config['testing_id'])
    testing_records['ke20k'] = training_records[testing_ids]
    training_records = numpy.delete(training_records, testing_ids, axis=0)
    train_pairs = numpy.delete(train_pairs, testing_ids, axis=0)

    # path = '/home/memray/Project/deep_learning/seq2seq-keyphrase/dataset/keyphrase/baseline-data/ke20k/'
    # keyphrase_count = 0
    # for i,r in enumerate(testing_records['ke20k']):
    #     with open(path+'text/'+ str(i) +'.txt', 'w') as f:
    #         f.write(r['title']+'. \n'+r['abstract'])
    #     with open(path+'keyphrase/'+ str(i) +'.txt', 'w') as f:
    #         keyphrases = re.sub(r'\(.*?\)', ' ', r['keyword'])
    #         keyphrases = re.split('[,;]',keyphrases)
    #         keyphrase_count += len(keyphrases)
    #         f.write('\n'.join(keyphrases))
    #
    # print('length of testing ids: %d' % len(testing_ids))
    # print('length of actually testing samples: %d' % len(testing_records['ke20k']))
    # print('average number of keyphrases: %f' % (float(keyphrase_count)/ float(len(testing_records['ke20k']))))
    # exit()

    test_pairs = dict([(k, dataset_utils.load_pairs(v, do_filter=False)[1])
                       for (k, v) in testing_records.items()])

    print('Building dicts')
    # if a vocabulary file is specified in config, load it and overwrite wordfreq
    if 'voc' in config:
        print('Loading dicts from %s' % config['voc'])
        wordfreq = dict(deserialize_from_file(config['voc']))
    idx2word, word2idx = build_dict(wordfreq)

    # use character-based model [on]
    # use word-based model     [off]
    print('Mapping tokens to indexes')
    train_set = dataset_utils.build_data(train_pairs, idx2word, word2idx)
    validation_set = dataset_utils.build_data(validation_pairs, idx2word,
                                              word2idx)
    test_set = dict([(k, dataset_utils.build_data(v, idx2word, word2idx))
                     for (k, v) in test_pairs.items()])

    print('Train samples      : %d' % len(train_pairs))
    print('Validation samples : %d' % len(validation_pairs))
    print('Test samples       : %d' %
          sum([len(test_pair) for test_pair in test_pairs.values()]))
    print('Dict size          : %d' % len(idx2word))

    return train_set, validation_set, test_set, idx2word, word2idx
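
A rough end-to-end sketch of calling load_data_and_dict; the config key 'training_dataset' used for the JSON path is an assumption:

# Usage sketch -- config['training_dataset'] is an assumed key holding the path to
# the JSON training dump that load_data_and_dict reads.
train_set, validation_set, test_set, idx2word, word2idx = \
    load_data_and_dict(config['training_dataset'])
print('vocabulary size : %d' % len(idx2word))
print('test splits     : %s' % ', '.join(test_set.keys()))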