Example #1
def predict(prices, check_ml):
	sent_analysis = SentimentAnalysis()
	model, _, _ = sent_analysis.get_model()
	# restore the trained weights and switch the model to inference mode
	model.load_state_dict(torch.load('server/models/portfolio/rnn_20.pkl'))
	model.eval()
	preds = model(prices)
	predict_loader = TextClassDataLoader(preds, batch_size=1, predict=True, check_ml=check_ml)
	_, preds = predict_loader.predict_batches
	return preds
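A hedged usage sketch for Example #1: prices is assumed to be whatever tensor the sentiment model accepts (the function passes it straight into the model), and check_ml is assumed to be a boolean flag forwarded to the loader; the placeholder shape below is illustrative only.

import torch

prices = torch.randn(1, 10)              # placeholder input; the real shape depends on the model
preds = predict(prices, check_ml=True)
print(preds)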
Example #2
def get_sentence():
    train_loader = TextClassDataLoader('data/input.txt',
                                       d_word_index,
                                       batch_size=1)
    arr = []
    for i, (seq, target, seq_lengths) in enumerate(train_loader):
        print(seq)
        print(target)
        print(seq_lengths)
        output = model(seq, seq_lengths)
        arr = output[0].data.numpy().tolist()
        print(arr)
        print(arr.index(max(arr)))
    return arr.index(max(arr))
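A tighter variant of Example #2 is sketched below; it assumes the same (seq, target, seq_lengths) batch format that TextClassDataLoader yields here and a model callable as model(seq, seq_lengths), and uses torch.argmax instead of round-tripping the output through a Python list.

import torch

def get_sentence_idx(loader, model):
    # Return the predicted class index for the last batch produced by `loader`.
    pred = None
    with torch.no_grad():                     # inference only, no gradient tracking
        for seq, target, seq_lengths in loader:
            output = model(seq, seq_lengths)  # (batch, num_classes) scores
            pred = torch.argmax(output[0]).item()
    return pred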
Example #3
	def get_trainer(self):
		print('Creating dataloaders...')
		train_loader = TextClassDataLoader('server/models/portfolio/data/test.csv', batch_size = self.batch_size)
		val_loader = TextClassDataLoader('server/models/portfolio/data/test.csv', batch_size = self.batch_size)
		return train_loader, val_loader
Example #4
def run_model(domain):
    # create vocab
    print("===> creating vocabs for domain..." + domain)
    end = time.time()
    domain_d = 'reviews/leave_out_' + domain
    lda_model = models.LdaModel.load(domain_d + '/lda_model/lda_' + domain)
    lda_dict = gensim.corpora.Dictionary.load(domain_d + '/lda_model/dict_' +
                                              domain)
    print(domain_d)
    v_builder = VocabBuilder(path_file=domain_d + '/train.csv',
                             min_sample=args.min_samples)
    d_word_index = v_builder.get_word_index()
    vocab_size = len(d_word_index)
    word2id = {v: k for k, v in d_word_index.items()}
    #print (word2id)
    embeddings = load_glove_embeddings(
        '/home/DebanjanChaudhuri/topic_lstm_torch/word_vecs/glove.6B.50d.txt',
        d_word_index)
    if not os.path.exists('gen_' + domain):
        os.mkdir('gen_' + domain)

    joblib.dump(d_word_index,
                'gen_' + domain + '/d_word_index.pkl',
                compress=3)
    print('===> vocab creating: {t:.3f}'.format(t=time.time() - end))

    # create trainer
    print("===> creating dataloaders ...")
    end = time.time()
    train_loader = TextClassDataLoader(domain_d + '/train.csv',
                                       d_word_index,
                                       batch_size=args.batch_size)
    val_loader = TextClassDataLoader(domain_d + '/val.csv',
                                     d_word_index,
                                     batch_size=args.batch_size)
    test_loader = TextClassDataLoader(domain_d + '/test.csv',
                                      d_word_index,
                                      batch_size=args.batch_size)
    print('===> Dataloader creating: {t:.3f}'.format(t=time.time() - end))

    # create model
    print("===> creating rnn model ...")
    if args.mit_topic:
        print("with topic vectors.")
        model = RNNTopic(vocab_size=vocab_size,
                         embed_size=args.embedding_size,
                         num_output=args.classes,
                         topic_size=50,
                         hidden_size=args.hidden_size,
                         num_layers=args.layers,
                         batch_first=True,
                         use_gpu=args.cuda,
                         embeddings=embeddings,
                         emb_drop=args.emb_drop,
                         fc_size=args.fc_layer)
    else:
        model = RNN(vocab_size=vocab_size,
                    embed_size=args.embedding_size,
                    num_output=args.classes,
                    hidden_size=args.hidden_size,
                    num_layers=args.layers,
                    batch_first=True,
                    use_gpu=args.cuda,
                    embeddings=embeddings,
                    emb_drop=args.emb_drop,
                    fc_size=args.fc_layer)

    print(model)

    # optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()
    print(optimizer)
    print(criterion)

    if args.cuda:
        torch.backends.cudnn.enabled = True
        cudnn.benchmark = True
        model.cuda()
        criterion = criterion.cuda()

    #List for checking early stopping
    val_acc = []
    for epoch in range(1, args.epochs + 1):

        adjust_learning_rate(args.lr, optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch, lda_model,
              lda_dict, word2id)
        print("getting performance on validation set!")
        v_acc = validate(val_loader, model, criterion, lda_model, lda_dict,
                         word2id)
        print(len(val_acc), args.early_stopping)
        #if len(val_acc) > args.early_stopping:
        print("checking early stopping.")
        if earlystop(val_acc, v_acc):
            print("Early stopping!")
            break
        val_acc.append(v_acc)

        # save current model
        if epoch % args.save_freq == 0:
            name_model = 'rnn_{}.pkl'.format(epoch)
            path_save_model = os.path.join('gen_' + domain + '/', name_model)
            joblib.dump(model.float(), path_save_model, compress=2)
    print("Results on test set for leave-out-domain!" + domain)
    test_acc = test(test_loader, model, criterion, lda_model, lda_dict,
                    word2id)
    return test_acc
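Example #4 calls an earlystop(val_acc, v_acc) helper that is not shown in the snippet. The sketch below is one plausible, hypothetical implementation, assuming it should return True once the new validation accuracy no longer improves on the best value of the last few epochs; the helper in the original project may differ.

def earlystop(history, current, patience=3):
    # Hypothetical early-stopping check (assumption): stop once `current` fails to
    # beat the best validation accuracy seen over the last `patience` epochs.
    if len(history) < patience:
        return False
    return current <= max(history[-patience:])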
Example #5
parser.add_argument('--clip',
                    type=float,
                    default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

gen = args.gen + str(args.embedding_size) + 'v'
# load vocab
d_word_index, model = None, None
if os.path.exists(gen + '/d_word_index.pkl'):
    d_word_index = joblib.load(gen + '/d_word_index.pkl')

# create tester
print("===> creating dataloaders ...")
val_loader = TextClassDataLoader('data/test_pdtb.tsv',
                                 d_word_index,
                                 batch_size=args.batch_size)

# load model
if os.path.exists(gen + '/rnn_50.pkl'):
    model = joblib.load(gen + '/rnn_50.pkl')

# optimizer and loss
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

criterion = nn.CrossEntropyLoss()
print(optimizer)
print(criterion)
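Example #5 sets up the loader, model, optimizer, and loss but does not show the evaluation pass itself. A minimal sketch follows, assuming the (seq, target, seq_lengths) batch format used by TextClassDataLoader in the other examples and a model callable as model(seq, seq_lengths); accuracy here is plain top-1 accuracy.

import torch

model.eval()                                         # inference mode
correct, total = 0, 0
with torch.no_grad():
    for seq, target, seq_lengths in val_loader:
        output = model(seq, seq_lengths)             # (batch, num_classes) scores
        correct += (output.argmax(dim=1) == target).sum().item()
        total += target.size(0)
print('validation accuracy: {t:.3f}'.format(t=correct / max(total, 1)))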
Example #6
else:
    v_builder = VocabBuilder(path_file='data/train1.csv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)

if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
print('===> vocab creating: {t:.3f}'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/train1.csv',
                                   d_word_index,
                                   batch_size=args.batch_size)
val_loader = TextClassDataLoader('data/test1.csv',
                                 d_word_index,
                                 batch_size=args.batch_size)
print('===> dataloader creating: {t:.3f}'.format(t=time.time() - end))

# create model
print("===> creating rnn model ...")
vocab_size = len(d_word_index)
model = RNN(vocab_size=vocab_size,
            embed_size=args.embedding_size,
            num_output=args.classes,
            rnn_model=args.rnn,
            use_last=(not args.mean_seq),
            hidden_size=args.hidden_size,
Example #7
try:
    os.makedirs('models/' + args.name)
except FileExistsError:
    pass
with codecs.open('models/' + args.name + '/classify_stat.pkl', 'wb') as fout:
    pickle.dump(d_word_index, fout)
# joblib.dump(d_word_index, 'models/' + args.name + '/d_word_index.pkl', compress=3)
print('===> vocab creating: {t:.3f}'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader(args.train,
                                   d_word_index,
                                   batch_size=args.batch_size)
val_loader = TextClassDataLoader(args.test,
                                 d_word_index,
                                 batch_size=args.batch_size)
print('===> dataloader creating: {t:.3f}'.format(t=time.time() - end))

# create model
print("===> creating rnn model ...")
vocab_size = len(d_word_index)
model = RNN(vocab_size=vocab_size,
            embed_size=args.embedding_size,
            num_output=args.classes,
            rnn_model=args.rnn,
            use_last=(not args.mean_seq),
            hidden_size=args.hidden_size,
Example #8
model_dir = os.path.join('checkpoints', args.model)
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(d_word_index,
            os.path.join(model_dir, 'd_word_index.pkl'),
            compress=3)
print('===> vocab creating: {t:.3f}'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
if not args.multi_label:
    train_loader = TextClassDataLoader(train_file,
                                       d_word_index,
                                       batch_size=args.batch_size)
    val_loader = TextClassDataLoader(val_file,
                                     d_word_index,
                                     batch_size=args.batch_size)
    test_loader = TextClassDataLoader(test_file,
                                      d_word_index,
                                      batch_size=args.batch_size)
else:
    train_loader = TextClassDataLoader_multi(train_file,
                                             d_word_index,
                                             batch_size=args.batch_size)
    val_loader = TextClassDataLoader_multi(val_file,
                                           d_word_index,
                                           batch_size=args.batch_size)
    test_loader = TextClassDataLoader_multi(test_file,
Example #9
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file='data/train.tsv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
print('===> vocab creating: {t:.3f}'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/aminer_train.tsv',
                                   d_word_index,
                                   batch_size=args.batch_size)
val_loader = TextClassDataLoader('data/aminer_test.tsv',
                                 d_word_index,
                                 batch_size=args.batch_size)
print('===> dataloader creating: {t:.3f}'.format(t=time.time() - end))

# create model
print("===> creating rnn model ...")
vocab_size = len(d_word_index)
model = RNN(vocab_size=vocab_size,
            embed_size=args.embedding_size,
            num_output=args.classes,
            rnn_model=args.rnn,
            use_last=(not args.mean_seq),
            hidden_size=args.hidden_size,