Example #1
    def load(self, filename):
        logger.info('load the weights.')

        # the hdf5 module seems to work abnormally, so load with deserialize_from_file instead
        # weights = dd.io.load(filename)
        weights = deserialize_from_file(filename)
        print len(weights)
        self.set_weights(weights)
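Every example on this page revolves around deserialize_from_file and its counterpart serialize_to_file from emolga.dataset.build_dataset. For reference, a minimal sketch of what such helpers typically look like, assuming they are thin pickle wrappers; the project's actual implementation may differ:

import pickle

def serialize_to_file(obj, path):
    # dump an arbitrary Python object to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def deserialize_from_file(path):
    # load the object back; returns whatever was dumped (often a tuple)
    with open(path, 'rb') as f:
        return pickle.load(f)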
Example #2
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])

        test_sets = load_additional_testing_data(config['testing_datasets'],
                                                 idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
        test_data_plain = zip(*(test_set['source'], test_set['target']))

        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]

            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            # Add other jars from Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)
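A portability note on the classpath handling above: ':'.join(stanford_jars) assumes a POSIX path separator. A hedged alternative using os.pathsep, reusing the jar and pos_tagger objects created above:

import os
from nltk.internals import find_jars_within_path

# join the Stanford jars with the platform's path separator (':' on POSIX, ';' on Windows)
stanford_dir = os.path.dirname(jar)
stanford_jars = find_jars_within_path(stanford_dir)
pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)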
Example #3
def build_evaluation(train_set, segment):
    _, _, idx2word, word2idx = deserialize_from_file(train_set)
    pairs   = []
    f       = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r')
    line    = f.readline().strip()
    lines   = 0
    segment = segment
    while line:
        if '<human_label>' in line:
            score   = int(line[13])
            if score >= 3:
                f.readline()
                summary = f.readline().strip().decode('utf-8')
                if segment:
                    summary = [w for w in jb.cut(summary)]
                target  = []
                for w in summary:
                    if w not in word2idx:
                        word2idx[w] = len(word2idx)
                        idx2word[len(idx2word)] = w
                    target += [word2idx[w]]

                f.readline()
                f.readline()
                text    = f.readline().strip().decode('utf-8')
                if segment:
                    text = [w for w in jb.cut(text)]
                source  = []
                for w in text:
                    if w not in word2idx:
                        word2idx[w] = len(word2idx)
                        idx2word[len(idx2word)] = w
                    source += [word2idx[w]]

                pair    = (text, summary, score, source, target)
                pairs.append(pair)
                lines  += 1
                if lines % 1000 == 0:
                    print lines
        line = f.readline().strip()
    print 'lines={}'.format(len(pairs))
    return pairs, word2idx, idx2word
Example #4
def load_additional_testing_data(testing_names, idx2word, word2idx, config, postagging=True, process_type=1):
    test_sets           = {}

    # rule out the ones that appear in the testing data
    for dataset_name in testing_names:

        if os.path.exists(config['path'] + '/dataset/keyphrase/'+config['data_process_name']+dataset_name+'.testing.pkl'):
            test_set = deserialize_from_file(config['path'] + '/dataset/keyphrase/'+config['data_process_name']+dataset_name+'.testing.pkl')
            print('Loading testing dataset %s from %s' % (dataset_name, config['path'] + '/dataset/keyphrase/'+config['data_process_name']+dataset_name+'.testing.pkl'))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, config['path'] + '/dataset/keyphrase/' + config[
                'data_process_name'] + dataset_name + '.testing.pkl'))
            dataloader          = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records             = dataloader.get_docs()
            records, pairs, _   = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set            = utils.build_data(pairs, idx2word, word2idx)

            test_set['record']  = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source']   = [[t[1] for t in s] for s in tagged_sources]

                if getattr(dataloader, 'text_postag_dir', None) is not None:
                    print('Exporting postagged data to %s' % (dataloader.text_postag_dir))
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(dataloader.text_postag_dir+ '/' + r_['name'] + '.txt', 'w') as f:
                            output_str = ' '.join([w+'_'+t for w,t in s_])
                            f.write(output_str)
                else:
                    print('text_postag_dir not found, no export of postagged data')

            serialize_to_file(test_set, config['path'] + '/dataset/keyphrase/'+config['data_process_name']+dataset_name+'.testing.pkl')

        test_sets[dataset_name] = test_set

    return test_sets
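The branch above implements a cache-or-build pattern: reuse the pickled test set when it exists, otherwise build it and serialize it for the next run. A minimal, hedged generalization of that pattern; load_or_build and build_fn are illustrative names, not part of the project:

import os
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file

def load_or_build(cache_path, build_fn):
    # return the cached object if present, otherwise build and persist it
    if os.path.exists(cache_path):
        return deserialize_from_file(cache_path)
    obj = build_fn()
    serialize_to_file(obj, cache_path)
    return obj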
Example #5
def export_krapivin_maui():
    # prepare logging.
    config = keyphrase.config.setup_keyphrase_all()  # load settings.

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word, word2idx, config)

    # keep the first 400 in krapivin
    dataset = test_sets['krapivin']

    train_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/train/'
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_texts = dataset['source_str'][401:]
    train_targets = dataset['target_str'][401:]
    for i, (train_text,
            train_target) in enumerate(zip(train_texts, train_targets)):
        print('train ' + str(i))
        with open(train_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(train_text))
        with open(train_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in train_target]))

    test_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/test/'
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    test_texts = dataset['source_str'][:400]
    test_targets = dataset['target_str'][:400]
    for i, (test_text, test_target) in enumerate(zip(test_texts,
                                                     test_targets)):
        print('test ' + str(i))
        with open(test_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(test_text))
        with open(test_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in test_target]))
Example #6
def export_UTD():
    # prepare logging.
    config = keyphrase.config.setup_keyphrase_all()  # load settings.

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word, word2idx, config)

    for dataset_name, dataset in test_sets.items():
        print('Exporting %s' % str(dataset_name))

        # keep the first 400 in krapivin
        if dataset_name == 'krapivin':
            dataset['tagged_source'] = dataset['tagged_source'][:400]

        for i, d in enumerate(
                zip(dataset['tagged_source'], dataset['target_str'])):
            source_postag, target = d
            print('[%d/%d]' % (i, len(dataset['tagged_source'])))

            output_text = ' '.join(
                [sp[0] + '_' + sp[1] for sp in source_postag])

            output_dir = config['baseline_data_path'] + dataset_name + '/text/'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)

            output_text = '\n'.join([' '.join(t) for t in target])
            tag_output_dir = config[
                'baseline_data_path'] + dataset_name + '/keyphrase/'
            if not os.path.exists(tag_output_dir):
                os.makedirs(tag_output_dir)
            with open(tag_output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)
Example #7

# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] +
                      '/experiments.CopyWeibo.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2**30))
logger.info('Start!')

train_set, test_set, idx2word, word2idx = deserialize_from_file(
    config['dataset'])

if config['voc_size'] == -1:  # not use unk
    config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
    config['dec_voc_size'] = config['enc_voc_size']
else:
    config['enc_voc_size'] = config['voc_size']
    config['dec_voc_size'] = config['enc_voc_size']

samples = len(train_set['source'])
logger.info('build dataset done. ' + 'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(
                config['dec_voc_size'], config['batch_size']))


def build_data(data):
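The max(zip(*word2idx.items())[1]) + 1 idiom above only works on Python 2, where zip returns a list. An equivalent that runs on both Python 2 and 3, assuming word2idx maps words to integer indices:

# the largest word index plus one gives the vocabulary size
config['enc_voc_size'] = max(word2idx.values()) + 1
config['dec_voc_size'] = config['enc_voc_size']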
Example #8
def check_data():
    config = setup_keyphrase_all()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)

        number_groundtruth = 0
        number_present_groundtruth = 0

        loader = testing_data_loader(dataset_name,
                                     kwargs=dict(basedir=config['path']))

        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()

        for id, doc in enumerate(docs):

            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' +
                                                   doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]
            print('[%d] length= %d' % (id, len(doc.text)))

            stemmed_input = [
                stemmer.stem(t).strip().lower() for t in text_tokens
            ]

            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target]
                       for target in phrases]

            present_targets = []

            for target in targets:
                keep = True
                # whether to filter groundtruth phrases; if config['target_filter'] is None, do nothing
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if match and 'appear-only', keep this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if not match and 'appear-only', discard this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # if not match and 'non-appear-only', keep this phrase
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth=' + str(number_groundtruth))
        print('number_present_groundtruth=' + str(number_present_groundtruth))
        '''
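The nested loop above checks whether each stemmed target phrase occurs as a contiguous token run inside the stemmed document, and config['target_filter'] then decides whether matching or non-matching phrases are kept. A hedged, simplified sketch of the same containment test; contains_sublist is an illustrative helper, not from the project:

def contains_sublist(tokens, phrase):
    # True if phrase appears as a contiguous run inside tokens
    n = len(phrase)
    if n == 0:
        return False
    return any(tokens[i:i + n] == phrase
               for i in range(len(tokens) - n + 1))

# 'appear-only'     -> keep only phrases found in the text
# 'non-appear-only' -> keep only phrases absent from the text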
Example #9
word2idx['<eol>'] = 0
word2idx['<unk>'] = 1
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>', '<unk>'] + voc

# word2idx['X']      = len(voc)
# idx2word[len(voc)] = 'X'
# voc               += ['X']
#
# word2idx['Y']      = len(voc)
# idx2word[len(voc)] = 'Y'
# voc               += ['Y']
# print word2idx['X'], word2idx['Y']

# load the dataset
Rules, _ = deserialize_from_file(
    '/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl')
num = 200
repeats = 100
maxleg = 15
Lmax = len(idx2word)
rules = dict(source=Rules['source'][:num], target=Rules['target'][:num])


def ftr(v):
    if v < 10:
        return '00' + str(v)
    elif v < 100:
        return '0' + str(v)
    else:
        return str(v)
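ftr above left-pads a number to three digits; the same can be written with str.zfill (a hedged drop-in whose behaviour differs only for negative inputs):

def ftr(v):
    # zero-pad to width 3, e.g. 7 -> '007', 42 -> '042', 123 -> '123'
    return str(v).zfill(3)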
Example #10
    for i in range(len(source_text) - 1):
        for j in range(i + 1, len(source_text)):
            if j - i > max_len:
                continue
            if j - i == 1 and (source_text[i:j] == '<digit>'
                               or len(source_text[i:j][0]) == 1):
                continue
            tagseq = ''.join(source_postag[i:j])
            if re.match(np_regex, tagseq):
                np_list.append((source_text[i:j], source_postag[i:j]))

    print('Text: \t\t %s' % str(source_text))
    print('Noun Phrases:[%d] \n\t\t\t%s' %
          (len(np_list),
           str('\n\t\t\t'.join(
               [str(p[0]) + '[' + str(p[1]) + ']' for p in np_list]))))

    return np_list


if __name__ == '__main__':
    config = setup_keyphrase_all()
    test_set = db.deserialize_from_file(config['path'] +
                                        '/dataset/keyphrase/' +
                                        config['data_process_name'] +
                                        'semeval.testing.pkl')
    for s_index, s_str, s_tag in zip(test_set['source'],
                                     test_set['source_str'],
                                     [[s[1] for s in d]
                                      for d in test_set['tagged_source']]):
        get_none_phrases(s_str, s_tag, config['max_len'])
Example #11
                              config['task_name'], config['timemark']))

    n_rng = np.random.RandomState(config['seed'])
    np.random.seed(config['seed'])
    rng = RandomStreams(n_rng.randint(2**30))

    logger.info('*' * 20 + '  config information  ' + '*' * 20)
    # print config information
    for k, v in config.items():
        logger.info("\t\t\t\t%s : %s" % (k, v))
    logger.info('*' * 50)

    # the data is too large to dump into a file, so it has to be loaded from the raw dataset directly
    # train_set, test_set, idx2word, word2idx = keyphrase_dataset.load_data_and_dict(config['training_dataset'], config['testing_dataset'])

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = keyphrase_test_dataset.load_additional_testing_data(
        ['inspec'], idx2word, word2idx, config, postagging=False)

    logger.info('#(training paper)=%d' % len(train_set['source']))
    logger.info('#(training keyphrase)=%d' %
                sum([len(t) for t in train_set['target']]))
    logger.info(
        '#(testing paper)=%d' %
        sum([len(test_set['target']) for test_set in test_sets.values()]))

    logger.info('Load data done.')

    if config['voc_size'] == -1:  # not use unk
        config['enc_voc_size'] = max(list(zip(*word2idx.items()))[1]) + 1
        config['dec_voc_size'] = config['enc_voc_size']
Example #12
                r_array.append(0)
                # print(entry[0], entry[0], entry[1])
        ndcg = ndcg_at_k(r_array, k)
        # print(r_array)
        # print(ndcg)
        ndcg_total += ndcg
    return float(ndcg_total) / len(testing_data)


if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings.

    training_data_dict, testing_data_list = load_evaluation_data(
        config['path'] + '/dataset/textbook_linking/ir/')

    docs = deserialize_from_file(config['path'] +
                                 '/dataset/textbook_linking/docs.pkl')

    iir_docs = [d for d in docs if d['name'].startswith('iir')]
    mir_docs = [
        d for d in docs
        if d['name'].startswith('mir') and d['name'] != 'mir_10_5_2'
    ]

    mir_name_map = load_mir_names()

    for d in mir_docs:
        # print('%s -> %s' % (d['name'], mir_name_map[d['name']]))
        d['name'] = mir_name_map[d['name']]

    similarity_matrix = {}
    encoding_name = 'backward'
Example #13
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] +
                      '/emolga.RHM.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2**30))

logger.info('Start!')

# load the dataset and build a fuel-dataset.
idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info(
    'build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
        config['dec_voc_size'], config['batch_size']))

# training & valid & testing set.
train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
valid_set, valid_size = build_fuel(
    deserialize_from_file(config['dataset_test']))  # use test set for a try

# weights save file.
savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)

# build the agent
Example #14
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
from keyphrase.config import *

config = setup_keyphrase_all  # setup_keyphrase_all_testing

__author__ = "Rui Meng"
__email__ = "*****@*****.**"

if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings.

    loader = testing_data_loader('irbooks',
                                 kwargs=dict(basedir=config['path']))
    docs = loader.get_docs(return_dict=True)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word,
                                             word2idx,
                                             config,
                                             postagging=False,
                                             process_type=2)

    test_set, test_s_list, test_t_list, test_s_o_list, test_t_o_list, input_encodings, predictions, scores, output_encodings, idx2word \
        = deserialize_from_file(config['predict_path'] + 'predict.{0}.{1}.pkl'.format(config['predict_type'], 'irbooks'))

    do_stem = False

    # Evaluation
    outs, overall_score = keyphrase_utils.evaluate_multiple(
        config,
Example #15
    return logging


# prepare logging.
config  = setup()   # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])
tmark   = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
logger  = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark))
n_rng   = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng     = RandomStreams(n_rng.randint(2 ** 30))
logger.info('Start!')

segment = config['segment']
word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl')

if segment:
    eval_set           = word_set
    word2idx, idx2word = word_voc
else:
    eval_set           = char_set
    word2idx, idx2word = char_voc

if config['voc_size'] == -1:   # not use unk
    config['enc_voc_size'] = len(word2idx)
    config['dec_voc_size'] = config['enc_voc_size']
else:
    config['enc_voc_size'] = config['voc_size']
    config['dec_voc_size'] = config['enc_voc_size']
Example #16
# prepare logging.
tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
config = setup()  # load settings.
for w in config:
    print '{0}={1}'.format(w, config[w])

logger = init_logging(config['path_log'] +
                      '/experiments.Copy.id={}.log'.format(tmark))
n_rng = np.random.RandomState(config['seed'])
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2**30))
logger.info('Start!')

idx2word, word2idx, idx2word_o, word2idx_o \
        = deserialize_from_file(config['voc'])
idx2word_o[0] = '<eol>'
word2idx_o['<eol>'] = 0

source, target, origin = deserialize_from_file(config['dataset'])
samples = len(source)

config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
logger.info('build dataset done. ' + 'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(
                config['dec_voc_size'], config['batch_size']))


def build_data(source, target):
    # create fuel dataset.
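Examples #7, #16 and #17 are all cut off just as build_data starts wrapping the index arrays in a Fuel dataset (the IndexableDataset call begins in Example #17 below). A minimal, hedged sketch of how such a function might continue, assuming the Fuel library; the batching scheme and return values are assumptions rather than the project's actual code:

from collections import OrderedDict
from fuel import datasets
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

def build_data(data, batch_size=32):
    # create fuel dataset.
    dataset = datasets.IndexableDataset(indexables=OrderedDict([
        ('source', data['source']),
        ('target', data['target'])]))
    # shuffle the examples and serve them as mini-batches of index sequences
    stream = DataStream(dataset,
                        iteration_scheme=ShuffledScheme(
                            examples=dataset.num_examples,
                            batch_size=batch_size))
    return stream, dataset.num_examples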
Example #17
np.random.seed(config['seed'])
rng = RandomStreams(n_rng.randint(2**30))
logger.info('Start!')

# the vocabulary
tmp = [chr(x) for x in range(48, 58)]  # '0', '1', ..., '9'
voc = [
    tmp[a] + tmp[b] + tmp[c] for c in xrange(10) for b in xrange(10)
    for a in xrange(10)
]
word2idx = {voc[k]: k + 1 for k in xrange(len(voc))}
word2idx['<eol>'] = 0
idx2word = {word2idx[w]: w for w in word2idx}
voc = ['<eol>'] + voc

train_set, test_set = deserialize_from_file(config['dataset'])

config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
config['dec_voc_size'] = config['enc_voc_size']
samples = len(train_set['source'])

logger.info('build dataset done. ' + 'dataset size: {} ||'.format(samples) +
            'vocabulary size = {0}/ batch size = {1}'.format(
                config['dec_voc_size'], config['batch_size']))


def build_data(data):
    # create fuel dataset.
    dataset = datasets.IndexableDataset(indexables=OrderedDict([(
        'source', data['source']), (
            'target', data['target']), (