Example #1
0
# Configure TensorFlow log verbosity from the CLI argument.
set_tf_log_level(args.tf_ll)

# One entry per input feature: how raw tokens are vectorized, and which
# pre-trained embedding file backs the feature.
feature_desc = {
    'word': {
        'vectorizer': baseline.Token1DVectorizer(
            mxlen=100, transform_fn=baseline.lowercase),
        'embed': {
            'file': args.embeddings,
            'type': 'default',
            'unif': 0.25,
        },
    },
}

# Pull just the vectorizer out of each feature description.
vectorizers = {name: desc['vectorizer'] for name, desc in feature_desc.items()}
reader = baseline.TSVSeqLabelReader(
    vectorizers, clean_fn=baseline.TSVSeqLabelReader.do_clean)

train_file, valid_file, test_file = args.train, args.valid, args.test

# This builds a set of counters
vocabs, labels = reader.build_vocab([train_file, valid_file, test_file])

# This builds a set of embeddings objects, these are typically not DL-specific
# but if they happen to be addons, they can be
embeddings = dict()  # feature name -> loaded embeddings object
# Load the embeddings declared under each feature's 'embed' config.
for k, v in feature_desc.items():
    embed_config = v['embed']
    # NOTE(review): this call is truncated in this excerpt — its remaining
    # arguments and the rest of the loop body are not visible here.
    embeddings_for_k = baseline.embeddings.load_embeddings(
        'word',
Example #2
0
    batch_y = []
    # Width of a single embedding vector.
    dsz = embeddings.get_dsz()
    # Load the dataset already batched; only the 'word' feature is vectorized.
    ts = reader.load(file, vocabs={'word': embeddings.vocab}, batchsz=batchsz)
    pg = bl.create_progress_bar(len(ts))
    for batch in pg(ts):
        # x holds token ids; the unpack below implies shape (batch, time).
        x = batch['word']
        B, T = x.shape
        # Flatten so one fancy-index lookup fetches every token's vector,
        # then restore the (B, T, dsz) batch layout.
        flat_x = x.reshape(B * T)
        dense = embeddings.weights[flat_x]
        dense = dense.reshape(B, T, dsz)
        # NOTE(review): batch_x is initialized above this excerpt — the
        # enclosing def (and the types of `file`/`batchsz`) are not visible.
        batch_x.append(dense)
        batch_y.append(batch['y'])
    # Stacking requires every batch to have identical dimensions — presumably
    # guaranteed by the reader's fixed batchsz/mxlen; confirm against caller.
    return np.stack(batch_x), np.stack(batch_y)


# Reader that cleans and vectorizes the TSV label/text rows.
reader = bl.TSVSeqLabelReader(
    VECTORIZERS, clean_fn=bl.TSVSeqLabelReader.do_clean)

# Resolve the three dataset splits relative to the base path.
train_file, valid_file, test_file = (
    os.path.join(BP, split) for split in (TRAIN, VALID, TEST))

# This builds a set of counters
vocabs, labels = reader.build_vocab([train_file, valid_file, test_file])

# Persist the label inventory discovered during the vocab pass.
print('Writing {}'.format(LABELS))
bl.write_json(labels, LABELS)

# This builds a set of embeddings objects, these are typically not DL-specific
# but if they happen to be addons, they can be
embeddings = bl.PretrainedEmbeddingsModel(
    W2V_GN_300,
    known_vocab=vocabs['word'],
    embed_type='default',
    unif=0.0)