Example #1
# Example 1 setup: define torchtext fields, load the train/valid TSV splits,
# build the vocabulary, and load the pretrained embedding matrix.
jieba.load_userdict("../txt/dict.txt")  # custom word list for jieba segmentation

# Id and Label are raw integer columns; Text columns are tokenized with jieba
# and wrapped with <BOS>/<EOS> markers, padded with <PAD>.
ID = data.Field(sequential=False, batch_first=True, use_vocab=False)
TEXT = data.Field(sequential=True,
                  lower=True,
                  eos_token='<EOS>',
                  init_token='<BOS>',
                  pad_token='<PAD>',
                  fix_length=None,
                  batch_first=True,
                  use_vocab=True,
                  tokenize=jieba.lcut)
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

# Column layout shared by both splits (was duplicated verbatim before).
_FIELDS = [('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)]

train = data.TabularDataset(path='../../data/train.tsv',
                            format='tsv',
                            fields=_FIELDS,
                            skip_header=True)
valid = data.TabularDataset(path='../../data/valid.tsv',
                            format='tsv',
                            fields=_FIELDS,
                            skip_header=True)

# Vocabulary must exist before the embedding lookup below, which is keyed
# on TEXT.vocab.itos. Words seen fewer than 3 times map to <unk>.
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')
word_matrix = datahelper.wordlist_to_matrix("../txt/embedding_300d.bin",
                                            TEXT.vocab.itos, device,
                                            embedding_dim)

train_iter = data.BucketIterator(
Example #2
# Example 2 setup: choose input/output paths, rebuild the vocabulary from the
# training split, and construct the wide & deep model over the test iterator.
if SUBMIT:
    # Submission mode: paths are supplied on the command line.
    infile = sys.argv[1]
    outfile = sys.argv[2]
else:
    # Local debugging: predict against the training file.
    infile = "../../data/train.tsv"
    outfile = "../../data/predict.tsv"

print('Reading data..')
jieba.load_userdict("../txt/dict.txt")  # custom word list for jieba segmentation

# Id is a raw integer column; Text columns are jieba-tokenized with
# <BOS>/<EOS> markers and <PAD> padding.
ID = data.Field(sequential=False, batch_first=True, use_vocab=False)
TEXT = data.Field(sequential=True, lower=True, eos_token='<EOS>', init_token='<BOS>',
                  pad_token='<PAD>', fix_length=None, batch_first=True, use_vocab=True, tokenize=jieba.lcut)
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

# Train split carries a Label column; the test split does not.
# NOTE(review): no skip_header=True here, although the companion training
# script reads the same train.tsv with skip_header=True — confirm whether
# these TSVs have header rows; a header row would be ingested as data.
train = data.TabularDataset(
        path='../../data/train.tsv', format='tsv',
        fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)])

test = data.TabularDataset(
        path=infile, format='tsv',
        fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT)])

# Vocabulary must exist before the embedding lookup below, which is keyed
# on TEXT.vocab.itos. Words seen fewer than 3 times map to <unk>.
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')
word_matrix = datahelper.wordlist_to_matrix("../txt/embedding_300d.bin", TEXT.vocab.itos, device, embedding_dim)

# shuffle=False keeps iteration order deterministic so predictions stay
# aligned with the Id column.
test_iter = data.Iterator(dataset=test, batch_size=batch_size, device=device, shuffle=False, repeat=False)
test_dl = datahelper.BatchWrapper(test_iter, ["Id", "Text1", "Text2"])

MODEL = wide_deep(len(TEXT.vocab), embedding_dim, hidden_dim, batch_size, word_matrix, bidirectional=bidirectional)
MODEL.to(device)