jieba.load_userdict("../txt/dict.txt") ID = data.Field(sequential=False, batch_first=True, use_vocab=False) TEXT = data.Field(sequential=True, lower=True, eos_token='<EOS>', init_token='<BOS>', pad_token='<PAD>', fix_length=None, batch_first=True, use_vocab=True, tokenize=jieba.lcut) LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False) train = data.TabularDataset(path='../../data/train.tsv', format='tsv', fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)], skip_header=True) valid = data.TabularDataset(path='../../data/valid.tsv', format='tsv', fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)], skip_header=True) TEXT.build_vocab(train, min_freq=3) print('Building vocabulary Finished.') word_matrix = datahelper.wordlist_to_matrix("../txt/embedding_300d.bin", TEXT.vocab.itos, device, embedding_dim) train_iter = data.BucketIterator(
# --- Inference-side pipeline: arg handling, fields, datasets, vocab, model ---
if SUBMIT:
    # Submission mode: input/output paths come from the command line.
    infile = sys.argv[1]
    outfile = sys.argv[2]
else:
    # Local debugging: predict on the training file itself.
    infile = "../../data/train.tsv"
    outfile = "../../data/predict.tsv"

print('Reading data..')
jieba.load_userdict("../txt/dict.txt")

# Field definitions mirror the training script exactly so that the rebuilt
# vocabulary indexes tokens the same way the model was trained with.
ID = data.Field(sequential=False, batch_first=True, use_vocab=False)
TEXT = data.Field(sequential=True, lower=True,
                  eos_token='<EOS>', init_token='<BOS>', pad_token='<PAD>',
                  fix_length=None, batch_first=True, use_vocab=True,
                  tokenize=jieba.lcut)
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

# FIX: skip_header=True was missing here although the training script reads
# this same file with skip_header=True — without it the header row leaks into
# the examples and shifts the min_freq-filtered vocabulary, so token indices
# at predict time would not match those seen during training.
train = data.TabularDataset(
    path='../../data/train.tsv', format='tsv',
    fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)],
    skip_header=True)

# NOTE(review): no skip_header here — assumes submission input has no header
# row; if infile follows the train.tsv layout its header becomes a bogus
# example. TODO confirm against the submission harness before changing.
test = data.TabularDataset(
    path=infile, format='tsv',
    fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT)])

# Rebuild the vocabulary from the training split, as during training.
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')

# Pre-trained 300-d embedding matrix aligned with TEXT.vocab index order.
word_matrix = datahelper.wordlist_to_matrix("../txt/embedding_300d.bin",
                                            TEXT.vocab.itos, device,
                                            embedding_dim)

# Plain (non-bucketing) iterator: order must be preserved for prediction.
test_iter = data.Iterator(dataset=test, batch_size=batch_size, device=device,
                          shuffle=False, repeat=False)
test_dl = datahelper.BatchWrapper(test_iter, ["Id", "Text1", "Text2"])

MODEL = wide_deep(len(TEXT.vocab), embedding_dim, hidden_dim, batch_size,
                  word_matrix, bidirectional=bidirectional)
MODEL.to(device)