Example #1
def create_word_index(model,
                      glove_path,
                      embedding_size,
                      min_samples,
                      pdtb_category=''):
    if os.path.exists(glove_path):
        v_builder = GloveVocabBuilder(path_glove=glove_path)
        d_word_index, embed = v_builder.get_word_index()
        ed_size = embed.size(1)
        is_glove = True
    else:
        v_builder = VocabBuilder(path_file=PROCESSED_DATA_PATH + '/train.tsv')
        d_word_index, embed = v_builder.get_word_index(min_sample=min_samples)
        ed_size = embedding_size
        is_glove = False

    results_path = get_results_path(model, is_glove, ed_size, pdtb_category)
    joblib.dump(d_word_index, results_path + '/d_word_index.pkl', compress=3)

    return (v_builder, d_word_index, embed, ed_size, results_path)
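# Usage sketch (illustrative only; the argument values below are assumptions
# and not part of the original snippet):
v_builder, d_word_index, embed, ed_size, results_path = create_word_index(
    model='rnn',
    glove_path='glove/glove.6B.300d.txt',
    embedding_size=300,
    min_samples=5)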
Example #2
parser.add_argument('--cuda', default=False, action='store_true', help='use cuda')
parser.add_argument('--fasttext-tensor', default='data/fasttext.pt', help='path to fasttext embeddings tensor')
parser.add_argument('--fasttext-voc', default='data/fasttext_voc.pkl', help='path to fasttext vocabulary pickle')
parser.add_argument('--train-path', default="data/en-ud-train.csv", help='path to train data csv')
parser.add_argument('--dev-path', default="data/en-ud-dev.csv", help='path to dev data csv')
parser.add_argument('--clip', type=float, default=5, help='gradient clipping')
args = parser.parse_args()

print()
# create vocab
print("===> creating word, tag, char, dep_rel vocabs and loading pre-trained embeddings ...")

start = time.time()
fasttext_embed = torch.load(args.fasttext_tensor)
with open(args.fasttext_voc, 'rb') as f:
    fasttext_word_to_index = pickle.load(f)
w_builder = VocabBuilder(path_file=args.train_path)
word_to_index, words = w_builder.get_word_index(min_sample=args.min_samples)
char_builder = CharBuilder(path_file=args.train_path)
char_to_index, chars = char_builder.get_char_index()
pos_builder = TagBuilder(args.train_path, "POS")
pos_to_index, pos_tags = pos_builder.get_tag_index_padded()
xpos_builder = TagBuilder(args.train_path, "XPOS")
xpos_to_index, xpos_tags = xpos_builder.get_tag_index_padded()
rel_builder = TagBuilder(args.train_path, "Drel")
rel_to_index, rel_tags = rel_builder.get_tag_index()

if not os.path.exists('gen'):
    os.mkdir('gen')
with open("gen/parser_model.params", 'wb') as paramsfp:
    pickle.dump((word_to_index, char_to_index, pos_to_index,
                 xpos_to_index, rel_to_index), paramsfp)
print('===> vocab created in: {t:.3f}s'.format(t=time.time()-start))
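# The saved parameter tuple can later be restored with the standard pickle API
# (a sketch mirroring the dump above):
with open("gen/parser_model.params", 'rb') as paramsfp:
    (word_to_index, char_to_index, pos_to_index,
     xpos_to_index, rel_to_index) = pickle.load(paramsfp)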
Example #3
parser.add_argument('--clip',
                    type=float,
                    default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file='data/train1.csv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)

if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
print('===> vocab created in: {t:.3f}s'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/train1.csv',
                                   d_word_index,
                                   batch_size=args.batch_size)
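# At inference time the vocabulary dumped above can be reloaded with joblib
# (sketch):
d_word_index = joblib.load('gen/d_word_index.pkl')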
Example #4
def run_model(domain):
    # create vocab
    print("===> creating vocabs for domain..." + domain)
    end = time.time()
    domain_d = 'reviews/leave_out_' + domain
    lda_model = models.LdaModel.load(domain_d + '/lda_model/lda_' + domain)
    lda_dict = gensim.corpora.Dictionary.load(domain_d + '/lda_model/dict_' +
                                              domain)
    print(domain_d)
    v_builder = VocabBuilder(path_file=domain_d + '/train.csv',
                             min_sample=args.min_samples)
    d_word_index = v_builder.get_word_index()
    vocab_size = len(d_word_index)
    word2id = {v: k for k, v in d_word_index.items()}
    #print (word2id)
    embeddings = load_glove_embeddings(
        '/home/DebanjanChaudhuri/topic_lstm_torch/word_vecs/glove.6B.50d.txt',
        d_word_index)
    if not os.path.exists('gen_' + domain):
        os.mkdir('gen_' + domain)

    joblib.dump(d_word_index,
                'gen_' + domain + '/d_word_index.pkl',
                compress=3)
    print('===> vocab created in: {t:.3f}s'.format(t=time.time() - end))

    # create trainer
    print("===> creating dataloaders ...")
    end = time.time()
    train_loader = TextClassDataLoader(domain_d + '/train.csv',
                                       d_word_index,
                                       batch_size=args.batch_size)
    val_loader = TextClassDataLoader(domain_d + '/val.csv',
                                     d_word_index,
                                     batch_size=args.batch_size)
    test_loader = TextClassDataLoader(domain_d + '/test.csv',
                                      d_word_index,
                                      batch_size=args.batch_size)
    print('===> Dataloaders created in: {t:.3f}s'.format(t=time.time() - end))

    # create model
    print("===> creating rnn model ...")
    if args.mit_topic:
        print("with topic vectors.")
        model = RNNTopic(vocab_size=vocab_size,
                         embed_size=args.embedding_size,
                         num_output=args.classes,
                         topic_size=50,
                         hidden_size=args.hidden_size,
                         num_layers=args.layers,
                         batch_first=True,
                         use_gpu=args.cuda,
                         embeddings=embeddings,
                         emb_drop=args.emb_drop,
                         fc_size=args.fc_layer)
    else:
        model = RNN(vocab_size=vocab_size,
                    embed_size=args.embedding_size,
                    num_output=args.classes,
                    hidden_size=args.hidden_size,
                    num_layers=args.layers,
                    batch_first=True,
                    use_gpu=args.cuda,
                    embeddings=embeddings,
                    emb_drop=args.emb_drop,
                    fc_size=args.fc_layer)

    print(model)

    # optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()
    print(optimizer)
    print(criterion)

    if args.cuda:
        torch.backends.cudnn.enabled = True
        cudnn.benchmark = True
        model.cuda()
        criterion = criterion.cuda()

    #List for checking early stopping
    val_acc = []
    for epoch in range(1, args.epochs + 1):

        adjust_learning_rate(args.lr, optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch, lda_model,
              lda_dict, word2id)
        print("getting performance on validation set!")
        v_acc = validate(val_loader, model, criterion, lda_model, lda_dict,
                         word2id)
        print(len(val_acc), args.early_stopping)
        #if len(val_acc) > args.early_stopping:
        print("checking early stopping.")
        if earlystop(val_acc, v_acc):
            print("Early stopping!")
            break
        val_acc.append(v_acc)

        # save current model
        if epoch % args.save_freq == 0:
            name_model = 'rnn_{}.pkl'.format(epoch)
            path_save_model = os.path.join('gen_' + domain + '/', name_model)
            joblib.dump(model.float(), path_save_model, compress=2)
    print("Results on test set for leave-out-domain!" + domain)
    test_acc = test(test_loader, model, criterion, lda_model, lda_dict,
                    word2id)
    return test_acc
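# Hypothetical driver (the domain names below are assumptions, not taken from
# the snippet): run the leave-one-domain-out experiment for each domain.
test_results = {}
for left_out_domain in ['books', 'dvd', 'electronics', 'kitchen']:
    test_results[left_out_domain] = run_model(left_out_domain)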
Example #5
# create vocab
print("===> creating vocabs ...")
end = datetime.datetime.now()

v_builder, d_word_index, embed = None, None, None
train_path = args.train_data
test_path = args.test_data
dic_name = os.path.join('gen', args.weight_name + '.pkl')
weight_save_model = os.path.join('gen', args.weight_name)

try:
    d_word_index = joblib.load(dic_name)
    embed = torch.load(weight_save_model)
    print('loaded existing embedding vectors, name is ', args.weight_name)
except Exception:
    v_builder = VocabBuilder(path_file=train_path)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
    print('creating new embedding vectors')

if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, dic_name, compress=3)

end = datetime.datetime.now()
train_loader = Word2vecLoader(train_path,
                              d_word_index,
                              batch_size=args.batch_size)
val_loader = Word2vecLoader(test_path,
                            d_word_index,
                            batch_size=args.batch_size)
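# Sketch (an assumption about intent, not in the original snippet): cache a
# freshly built embedding tensor so the torch.load in the try block above
# succeeds on the next run.
torch.save(embed, weight_save_model)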
Example #6
parser.add_argument('--clip',
                    type=float,
                    default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file=args.train)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
if not os.path.exists('gen'):
    os.mkdir('gen')
try:
    os.makedirs('models/' + args.name)
except FileExistsError:
    pass
with codecs.open('models/' + args.name + '/classify_stat.pkl', 'wb') as fout:
    pickle.dump(d_word_index, fout)
# joblib.dump(d_word_index, 'models/' + args.name + '/d_word_index.pkl', compress=3)
print('===> vocab created in: {t:.3f}s'.format(t=time.time() - end))
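# For later inference, the pickled vocabulary can be restored the same way
# (sketch):
with codecs.open('models/' + args.name + '/classify_stat.pkl', 'rb') as fin:
    d_word_index = pickle.load(fin)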

print('args: ', args)

# create trainer
Example #7
#!/usr/bin/env python
# encoding: utf-8

from vocab import VocabBuilder
from dataloader import DataLoader
from model import RNN

filepath = "./dataset/dataset.csv"
vocab_obj = VocabBuilder(filepath=filepath)

word_to_index = vocab_obj.word_to_index
label_to_index = vocab_obj.label_to_index

index_to_label = {}
for label, index in label_to_index.items():
    index_to_label[index] = label
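# Equivalent one-line inversion of the label mapping (an alternative to the
# loop above):
# index_to_label = {index: label for label, index in label_to_index.items()}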

loader = DataLoader(filepath=filepath,
                    word_to_index=word_to_index,
                    label_to_index=label_to_index,
                    batch_size=128)

vocab_size = len(word_to_index)
embedding_size = 128
num_output = len(label_to_index)

model = RNN(vocab_size=vocab_size,
            embed_size=embedding_size,
            num_output=num_output,
            rnn_model="LSTM",
            use_last=True)
Example #8
    val_file = os.path.join(args.data, 'val.csv')
    test_file = os.path.join(args.data, 'test.csv')
else:
    train_file = os.path.join(args.data, 'trainval.tsv')
    val_file = os.path.join(args.data, 'val.tsv')
    test_file = os.path.join(args.data, 'test.tsv')

v_builder, d_word_index, embed = None, None, None
#if os.path.exists(args.glove):
if args.use_glove:
    glove_file = 'glove/glove.6B.{}d.txt'.format(args.glove)
    v_builder = GloveVocabBuilder(path_glove=glove_file)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file=train_file)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
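# Note: in this variant args.glove holds the GloVe dimensionality (e.g. 50,
# 100, 200 or 300 for the glove.6B vectors) used to format glove_file, whereas
# the other examples treat args.glove as a file path checked with
# os.path.exists.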

#d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)

model_dir = os.path.join('checkpoints', args.model)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(d_word_index,
            os.path.join(model_dir, 'd_word_index.pkl'),
            compress=3)
print('===> vocab created in: {t:.3f}s'.format(t=time.time() - end))

print('args: ', args)

# create trainer
Example #9
parser.add_argument('--clip',
                    type=float,
                    default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file='data/train_pdtb.tsv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)

gen = args.gen + str(args.embedding_size) + 'v'
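# e.g. a --gen value of 'gen_pdtb_' with an embedding size of 300 resolves to
# 'gen_pdtb_300v' (illustrative values only, not from the snippet)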
if not os.path.exists(gen):
    os.makedirs(gen)
joblib.dump(d_word_index, gen + '/d_word_index.pkl', compress=3)
print('===> vocab created in: {t:.3f}s'.format(t=time.time() - end))

print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/train_pdtb.tsv',
                                   d_word_index,