Пример #1
0
def annotate_data():
    """Annotate every dependency tree against its SST counterpart.

    Loads the raw data, parses it, builds the SST and dependency tree
    collections, and calls ``compare_and_annotate`` on the trees that
    share the same dataset key and position. Every pair whose index is a
    multiple of 100 is printed so the output can be inspected by eye.

    NOTE(review): ``compare_and_annotate`` presumably writes the
    ``annotation`` attribute on the dependency tree nodes in place --
    confirm against its definition.

    Returns:
        The dependency tree collection returned by ``get_dep_trees``,
        which is also pickled to ``glovar.PKL_DIR`` as
        ``annotated_dep_trees.pkl``.
    """
    row_format = '%s\t%s\t%s'

    raw_data = get_raw_data()
    parsed_data = get_parsed_data(raw_data)
    # Per the original author: text is combined at the nodes inside
    # these two tree builders.
    sst_trees = get_sst_trees(raw_data)
    dep_trees = get_dep_trees(parsed_data)

    for dataset, dep_set in dep_trees.items():
        sst_set = sst_trees[dataset]
        for index in range(len(dep_set)):
            compare_and_annotate(sst_set[index], dep_set[index])

            # Only report periodically; skip everything else.
            if index % 100 != 0:
                continue

            print('ORIGINAL')
            for node in sst_set[index].node_list:
                print(row_format % (node.id, node.tag, node.text_at_node))
            print('DEP')
            for node in dep_set[index].node_list:
                print(row_format % (
                    node.id, node.annotation, node.text_at_node))

    # Persist the annotated trees before handing them back.
    pickling.save(dep_trees, glovar.PKL_DIR, 'annotated_dep_trees.pkl')
    return dep_trees
Пример #2
0
"""For pre-processing the data."""
from ext import vocab_emb, pickling
from data import sst, nli
import glovar
import os


# Make sure the pickle and checkpoint directories exist before anything
# is written to them. exist_ok=True lets makedirs tolerate a directory
# that is already there, which removes the check-then-create race of
# testing os.path.exists() first.
os.makedirs(glovar.PKL_DIR, exist_ok=True)
os.makedirs(glovar.CKPT_DIR, exist_ok=True)


# Build the vocab dictionary from the corpus text and pickle it.
print('Creating vocab dict...')
# Only the NLI text is used here; the SST text (sst.get_text()) is not
# part of the vocabulary input.
nli_text = nli.get_text()
all_text = ' '.join((nli_text,))
# create_vocab_dict returns a pair; only the first element is kept.
vocab_dict, _ = vocab_emb.create_vocab_dict(all_text)
pickling.save(vocab_dict, glovar.PKL_DIR, 'vocab_dict.pkl')
print('Success.')


# Build the GloVe embedding matrix for the vocab dictionary and pickle it.
print('Creating GloVe embeddings...')
# NOTE(review): 300 is presumably the embedding dimensionality -- confirm
# against vocab_emb.create_embeddings.
embedding_mat = vocab_emb.create_embeddings(
    vocab_dict,
    300,
    glovar.GLOVE_DIR,
)
pickling.save(embedding_mat, glovar.PKL_DIR, 'glove_embeddings.pkl')
print('Success.')
Пример #3
0
 def save(self):
     """Pickle this object into glovar.PKL_DIR as 'history_<name>.pkl'.

     The file name is derived from ``self.name``.
     """
     file_name = 'history_%s.pkl' % self.name
     pickling.save(self, glovar.PKL_DIR, file_name)