def predict(text, name):
    en_text = clean(text, 'en')
    en_words = en_text.split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)
    encode = map_item(name + '_encode', models)
    decode = map_item(name + '_decode', models)
    with torch.no_grad():
        encode.eval()
        state = encode(en_sent)
        decode.eval()
        return search(decode, state, en_sent, cand=3)
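`search` is a beam-search helper defined elsewhere in the repository. As an illustration only, a greedy (cand=1) stand-in might look like the sketch below; the decoder call signature and the `bos_ind`/`eos_ind` values are assumptions, not the repository's actual API.

import torch

def greedy_search(decode, state, bos_ind=1, eos_ind=2, max_len=30):
    # hypothetical greedy stand-in for the repo's search(decode, state, ..., cand=3);
    # assumes decode(prefix, state) returns logits of shape (1, steps, vocab)
    prefix = torch.LongTensor([[bos_ind]])
    pred_inds = []
    for _ in range(max_len):
        logits = decode(prefix, state)
        next_ind = int(torch.argmax(logits[0, -1]))
        if next_ind == eos_ind:
            break
        pred_inds.append(next_ind)
        prefix = torch.cat([prefix, torch.LongTensor([[next_ind]])], dim=1)
    return pred_inds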
Example #2
def predict(text, name, mode):
    text1 = clean(text)
    sent1 = ' '.join([text1, eos])
    seq1 = word2ind.texts_to_sequences([sent1])[0]
    pad_seq1 = pad_sequences([seq1],
                             maxlen=seq_len,
                             padding='pre',
                             truncating='pre')
    encode = map_item(name + '_encode', models)
    state = encode.predict(pad_seq1)
    decode = map_item(name + '_decode', models)
    func = map_item(mode, funcs)
    return func(decode, state, cand=3)
Example #3
def load_model(name, embed_mat, device, mode):
    embed_mat = torch.Tensor(embed_mat)
    model = torch.load(map_item(name, paths), map_location=device)
    full_dict = model.state_dict()
    arch = map_item('_'.join([name, mode]), archs)
    part = arch(embed_mat).to(device)
    part_dict = part.state_dict()
    for key, val in full_dict.items():
        key = '.'.join(key.split('.')[1:])
        if key in part_dict:
            part_dict[key] = val
    part.load_state_dict(part_dict)
    return part
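A hedged usage sketch: assuming the module-level `paths` and `archs` dicts this snippet relies on and an `embed_mat` array already in scope ('s2s' is a hypothetical name key), the two halves of a saved seq2seq model can be restored separately.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encode = load_model('s2s', embed_mat, device, 'encode')
decode = load_model('s2s', embed_mat, device, 'decode')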
Example #4
def define_encode(name, embed_mat, seq_len):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num,
                      output_dim=embed_len,
                      input_length=seq_len,
                      name='embed')
    input = Input(shape=(seq_len, ))
    embed_input = embed(input)
    func = map_item(name, funcs)
    output = func(embed_input)
    model = Model(input, output)
    # only draw the architecture diagram when run as a script
    if __name__ == '__main__':
        plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    return model
Example #5
def test(name, sent1s, sent2s, labels):
    encode = map_item(name + '_encode', models)
    states = encode.predict(sent1s)
    decode = map_item(name + '_decode', models)
    probs = decode.predict([sent2s, states])
    len_sum, log_sum = 0, 0
    for sent2, label, prob in zip(sent2s, labels, probs):
        bound = sum(sent2 > 0)
        len_sum = len_sum + bound
        sent_log = 0
        for i in range(bound):
            sent_log = sent_log + np.log(prob[i][label[i]])
        log_sum = log_sum + sent_log
    print('\n%s %s %.2f' % (name, 'perp:', np.exp(-log_sum / len_sum)))  # natural logs above, so base e here
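Since `sent_log` accumulates natural logs, the matching perplexity base is e; a tiny self-contained check:

import numpy as np

# perplexity = exp(-mean token log-likelihood)
token_probs = np.array([0.5, 0.25, 0.125])
print('%.2f' % np.exp(-np.mean(np.log(token_probs))))  # 4.00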
Example #6
def load_model(name, embed_mat, pos_mat, att_mat, device, mode):
    embed_mat = torch.Tensor(embed_mat)
    model = torch.load(map_item(name, paths), map_location=device)
    full_dict = model.state_dict()
    arch = map_item('_'.join([name, mode]), archs)
    if mode == 'decode':
        part = arch(embed_mat, pos_mat, att_mat, head, stack).to(device)
    else:
        part = arch(embed_mat, pos_mat, head, stack).to(device)
    part_dict = part.state_dict()
    for part_key in part_dict.keys():
        full_key = '.'.join([mode, part_key])
        if full_key in full_dict:
            part_dict[part_key] = full_dict[full_key]
    part.load_state_dict(part_dict)
    return part
Example #7
def rnn_predict(words, name):
    seq = word2ind.texts_to_sequences([' '.join(words)])[0]
    pad_seq = pad_sequences([seq], maxlen=seq_len)
    model = map_item(name, models)
    probs = model.predict(pad_seq)[0]
    bound = min(len(words), seq_len)
    return np.argmax(probs, axis=1)[-bound:]
Example #8
def test(name, sents, labels):
    model = map_item(name, models)
    probs = model.predict(sents)
    preds = np.argmax(probs, axis=1)
    precs = precision_score(labels, preds, average=None)
    recs = recall_score(labels, preds, average=None)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for i in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (ind_labels[i], precs[i], recs[i]))
    f1 = f1_score(labels, preds, average='weighted')
    print('\n%s f1: %.2f - acc: %.2f\n' % (name, f1, accuracy_score(labels, preds)))
    if detail:
        for text, label, pred in zip(texts, labels, preds):
            if label != pred:
                print('{}: {} -> {}'.format(text, ind_labels[label], ind_labels[pred]))
Example #9
def predict(text, name):
    text = clean(text)
    seq = word2ind.texts_to_sequences([text])[0]
    pad_seq = pad_sequences([seq], maxlen=seq_len)
    model = map_item(name, models)
    probs = model.predict(pad_seq)[0]
    sort_probs = sorted(probs, reverse=True)
    sort_inds = np.argsort(-probs)
    sort_preds = [ind_labels[ind] for ind in sort_inds]
    formats = list()
    for pred, prob in zip(sort_preds, sort_probs):
        formats.append('{} {:.3f}'.format(pred, prob))
    if name == 'adnn':
        core = map_item(name + '_core', models)
        atts = core.predict(pad_seq)[0]
        plot_att(text, atts[-len(text):])
    return ', '.join(formats)
Example #10
def fit(name, max_epoch, en_embed_mat, zh_embed_mat, path_feats, detail):
    tensors = tensorize(load_feat(path_feats), device)
    bound = len(tensors) // 2
    train_loader, dev_loader = get_loader(tensors[:bound]), get_loader(
        tensors[bound:])
    en_embed_mat, zh_embed_mat = torch.Tensor(en_embed_mat), torch.Tensor(
        zh_embed_mat)
    arch = map_item(name, archs)
    model = arch(en_embed_mat, zh_embed_mat, pos_mat, att_mat, head,
                 stack).to(device)
    loss_func = CrossEntropyLoss(ignore_index=0, reduction='sum')
    learn_rate, min_rate = 1e-3, 1e-5
    min_dev_loss = float('inf')
    trap_count, max_count = 0, 3
    print('\n{}'.format(model))
    train, epoch = True, 0
    while train and epoch < max_epoch:
        epoch = epoch + 1
        model.train()
        # rebuilt each epoch so a decayed learn_rate takes effect
        optim = Adam(model.parameters(), lr=learn_rate)
        start = time.time()
        train_loss, train_acc = batch_train(model, loss_func, optim,
                                            train_loader, detail)
        delta = time.time() - start
        with torch.no_grad():
            model.eval()
            dev_loss, dev_acc = batch_dev(model, loss_func, dev_loader)
        extra = ''
        if dev_loss < min_dev_loss:
            extra = ', val_loss reduced by {:.3f}'.format(min_dev_loss -
                                                          dev_loss)
            min_dev_loss = dev_loss
            trap_count = 0
            torch.save(model, map_item(name, paths))
        else:
            trap_count = trap_count + 1
            if trap_count > max_count:
                learn_rate = learn_rate / 10
                if learn_rate < min_rate:
                    extra = ', early stop'
                    train = False
                else:
                    extra = ', learn_rate divided by 10'
                    trap_count = 0
        epoch_print(epoch, delta, train_loss, train_acc, dev_loss, dev_acc,
                    extra)
Example #11
def compile(name, embed_mat, seq_len):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len,
                      weights=[embed_mat], input_length=seq_len, trainable=True)
    input1 = Input(shape=(seq_len,))
    input2 = Input(shape=(seq_len,))
    input3 = Input(shape=(seq_len,))
    embed_input1 = embed(input1)
    embed_input2 = embed(input2)
    embed_input3 = embed(input3)
    func = map_item(name, funcs)
    output = func(embed_input1, embed_input2, embed_input3)
    model = Model([input1, input2, input3], output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss=triple_loss, optimizer=Adam(lr=0.001), metrics=[triple_acc])
    return model
Example #12
def fit(name, epoch, embed_mat, triples, margin):
    sents, pos_sents, neg_sents = triples
    margins = np.ones(len(sents)) * margin
    seq_len = len(sents[0])
    model = compile(name, embed_mat, seq_len)
    check_point = ModelCheckpoint(map_item(name, paths), monitor='val_loss', verbose=True, save_best_only=True)
    model.fit([sents, pos_sents, neg_sents], margins, batch_size=batch_size, epochs=epoch,
              verbose=True, callbacks=[check_point], validation_split=0.2)
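`triple_loss` and `triple_acc` are custom objects defined elsewhere; note that `fit` above feeds the margin itself as the target. One plausible formulation consistent with that, offered purely as an assumption, is a hinge over the distance gap:

from keras import backend as K

def triple_loss(y_true, y_pred):
    # sketch only: y_pred assumed to be pos_dist - neg_dist from the model head,
    # y_true the margin passed to fit(); the hinge keeps pos closer than neg by the margin
    return K.mean(K.maximum(y_true + y_pred, 0.0))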
Example #13
def fit(path_train):
    with open(path_train, 'rb') as f:
        sents = pk.load(f)
    for name, func in funcs.items():
        model = func(sents, id2word=word2ind, num_topics=topic_num)
        topics = model.show_topics(num_words=key_num)
        save_dict(name, topics)
        with open(map_item(name, paths), 'wb') as f:
            pk.dump(model, f)
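`funcs` is defined elsewhere; given the `func(sents, id2word=word2ind, num_topics=topic_num)` call, it plausibly maps names to gensim model classes, e.g. (an assumption, not confirmed by the source):

from gensim.models import LdaModel, LsiModel

funcs = {'lda': LdaModel,
         'lsi': LsiModel}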
Example #14
def ml_predict(text1, text2, name):
    text1, text2 = clean(text1), clean(text2)
    text = [text1, text2]
    sent = bow.transform(text)
    sent = svd.transform(sent)
    sent = merge(sent)
    model = map_item(name, models)
    prob = model.predict_proba(sent)[0][1]
    return '{:.3f}'.format(prob)
Example #15
def compile(name, embed_mat, seq_len, class_num):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num,
                      output_dim=embed_len,
                      weights=[embed_mat],
                      input_length=seq_len,
                      trainable=True)
    input = Input(shape=(seq_len, ))
    embed_input = embed(input)
    func = map_item(name, funcs)
    output = func(embed_input, class_num)
    model = Model(input, output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model
Example #16
def xgb_fit(sents, labels):
    model = XGBC(max_depth=5,
                 learning_rate=0.1,
                 objective='multi:softmax',
                 n_estimators=100,
                 booster='gbtree')
    model.fit(sents, labels)
    with open(map_item('xgb', paths), 'wb') as f:
        pk.dump(model, f)
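Round-trip sketch for the pickled classifier, reusing the module's `map_item`/`paths` lookup:

import pickle as pk

with open(map_item('xgb', paths), 'rb') as f:
    model = pk.load(f)
preds = model.predict(sents)  # same feature-matrix layout as in xgb_fit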
Example #17
def nn_predict(text1, text2, name):
    text1, text2 = clean(text1), clean(text2)
    seq1 = word2ind.texts_to_sequences([text1])[0]
    seq2 = word2ind.texts_to_sequences([text2])[0]
    pad_seq1 = pad_sequences([seq1], maxlen=seq_len)
    pad_seq2 = pad_sequences([seq2], maxlen=seq_len)
    model = map_item(name, models)
    prob = model.predict([pad_seq1, pad_seq2])[0][0]
    return '{:.3f}'.format(prob)
Example #18
def svm_fit(sents, labels):
    model = SVC(C=1.0,
                kernel='linear',
                max_iter=1000,
                probability=True,
                class_weight='balanced',
                verbose=True)
    model.fit(sents, labels)
    with open(map_item('svm', paths), 'wb') as f:
        pk.dump(model, f)
Example #19
def test(name, sents, labels):
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
    with torch.no_grad():
        model.eval()
        probs = F.softmax(model(sents), dim=1)
    preds = torch.max(probs, dim=1)[1]
    precs = precision_score(labels, preds, average=None)
    recs = recall_score(labels, preds, average=None)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for i in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (ind_labels[i], precs[i], recs[i]))
    f1 = f1_score(labels, preds, average='weighted')
    print('\n%s f1: %.2f - acc: %.2f\n' % (name, f1, accuracy_score(labels, preds)))
    if detail:
        for text, label, pred in zip(texts, labels.numpy(), preds.numpy()):
            if label != pred:
                print('{}: {} -> {}'.format(text, ind_labels[label], ind_labels[pred]))
Example #20
def compile(name, embed_mat, seq_len, class_num):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num,
                      output_dim=embed_len,
                      weights=[embed_mat],
                      input_length=seq_len,
                      trainable=True)
    input = Input(shape=(seq_len, ))
    embed_input = embed(input)
    func = map_item(name, funcs)
    crf = CRF(class_num)
    output = func(embed_input, crf)
    model = Model(input, output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss=crf.loss_function,
                  optimizer=Adam(lr=0.001),
                  metrics=[crf.accuracy])
    return model
Example #21
def test_pair(name, pairs, flags, thre):
    model = map_item(name, models)
    sent1s, sent2s = pairs
    dists = model.predict([sent1s, sent2s])
    dists = np.reshape(dists, (1, -1))[0]
    preds = dists > thre
    print('\n%s %s %.2f\n' % (name, 'acc:', accuracy_score(flags, preds)))
    for flag, dist, text1, text2, pred in zip(flags, dists, text1s, text2s,
                                              preds):
        if flag != pred:
            print('{} {:.3f} {} | {}'.format(flag, dist, text1, text2))
Example #22
def save_dict(name, topics):
    topic_pairs = list()
    for ind, all_str in topics:
        pair_strs = all_str.split(' + ')
        pairs = [pair_str.split('*') for pair_str in pair_strs]
        pair_dict = dict()
        for score, key in pairs:
            pair_dict[key[1:-1]] = float(score)
        topic_pairs.append(pair_dict)
    with open(map_item(name + '_dict', paths), 'w') as f:
        json.dump(topic_pairs, f, ensure_ascii=False, indent=4)
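For reference, gensim's `show_topics` returns `(index, string)` pairs, which the parser above splits on ' + ' and '*'; a small standalone trace (scores and words made up):

all_str = '0.030*"price" + 0.025*"market"'
pairs = [pair_str.split('*') for pair_str in all_str.split(' + ')]
# pairs -> [['0.030', '"price"'], ['0.025', '"market"']]
# key[1:-1] then strips the surrounding quotes before the float(score) cast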
Example #23
def test(name, sents, labels, thre):
    model = map_item(name, models)
    if name == 'svm' or name == 'xgb':
        probs = model.predict_proba(sents)[:, 1]
    else:
        sent1s, sent2s = sents
        probs = model.predict([sent1s, sent2s])
    preds = probs > thre
    f1 = f1_score(labels, preds)
    print('\n%s f1: %.2f - acc: %.2f' %
          (name, f1, accuracy_score(labels, preds)))
Example #24
def cache(sents, labels):
    sent_mat, label_mat = split(sents, labels)
    for name, model in models.items():
        encode_mat = list()
        for part_sents in sent_mat:  # avoid shadowing the sents argument
            encode_mat.append(model.predict(part_sents))
        encode_mat, label_mat = clean(encode_mat, label_mat)
        core_sents, core_labels = merge(encode_mat, label_mat)
        path_cache = map_item(name + '_cache', paths)
        with open(path_cache, 'wb') as f:
            pk.dump((core_sents, core_labels), f)
Example #25
def define_model(name, embed_mat, seq_len, class_num):
    vocab_num, embed_len = embed_mat.shape
    if name == 'cnn_crf':
        seq_len = seq_len + win_len - 1
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, input_length=seq_len)
    input = Input(shape=(seq_len,))
    embed_input = embed(input)
    func = map_item(name, funcs)
    crf = CRF(class_num)
    output = func(embed_input, crf)
    return Model(input, output)
Example #26
def define_model(name, embed_mat, seq_len):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, input_length=seq_len)
    input1 = Input(shape=(seq_len,))
    input2 = Input(shape=(seq_len,))
    input3 = Input(shape=(seq_len,))
    embed_input1 = embed(input1)
    embed_input2 = embed(input2)
    embed_input3 = embed(input3)
    func = map_item(name, funcs)
    output = func(embed_input1, embed_input2, embed_input3)
    return Model([input1, input2, input3], output)
Example #27
def compile(name, embed_mat, seq_len):
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num,
                      output_dim=embed_len,
                      weights=[embed_mat],
                      input_length=seq_len,
                      trainable=True,
                      name='embed')
    input1 = Input(shape=(seq_len, ))
    input2 = Input(shape=(seq_len, ))
    embed_input1 = embed(input1)
    embed_input2 = embed(input2)
    func = map_item(name, funcs)
    output = func(embed_input1, embed_input2)
    model = Model([input1, input2], output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model
Example #28
def predict(text, name):
    en_text = clean(text, 'en')
    en_text = ' '.join([en_text, eos])
    en_words = en_text.split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, 'pre', keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)
    encode = map_item(name + '_encode', models)
    decode = map_item(name + '_decode', models)
    with torch.no_grad():
        encode.eval()
        state = encode(en_sent)
        decode.eval()
        zh_pred = search(decode, state, cand=3)
        if name == 'att' and __name__ == '__main__':
            zh_text = bos + zh_pred
            zh_pad_seq = sent2ind(zh_text, zh_word_inds, seq_len, 'post', keep_oov=True)
            zh_sent = torch.LongTensor([zh_pad_seq]).to(device)
            core = map_item(name + '_core', models)
            atts = core(zh_sent, state)[0]
            plot_att(en_words[:-1], zh_text[1:] + eos, atts)
        return zh_pred
Example #29
def dnn_predict(words, name):
    seq = word2ind.texts_to_sequences([' '.join(words)])[0]
    trunc_wins = list()
    buf = [0] * ((win_len - 1) // 2)
    buf_seq = buf + seq + buf
    for u_bound in range(win_len, len(buf_seq) + 1):
        l_bound = u_bound - win_len
        trunc_wins.append(buf_seq[l_bound:u_bound])
    trunc_wins = np.array(trunc_wins)
    model = map_item(name, models)
    probs = model.predict(trunc_wins)
    return np.argmax(probs, axis=1)
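A standalone trace of the windowing above with toy indices (win_len assumed to be 3): padding (win_len - 1) / 2 zeros on each side yields exactly one window per token.

seq, win_len = [4, 7, 9], 3
buf = [0] * ((win_len - 1) // 2)
buf_seq = buf + seq + buf
wins = [buf_seq[u - win_len:u] for u in range(win_len, len(buf_seq) + 1)]
# wins -> [[0, 4, 7], [4, 7, 9], [7, 9, 0]]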
Example #30
def test(name, sents, labels, thre):
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
    with torch.no_grad():
        model.eval()
        probs = torch.sigmoid(model(sents))
    probs = torch.squeeze(probs, dim=-1)
    mask = labels > -1
    mask_probs, mask_labels = probs.masked_select(mask), labels.masked_select(mask)
    mask_preds = mask_probs > thre
    f1 = f1_score(mask_labels, mask_preds)
    print('\n%s f1: %.2f - acc: %.2f' % (name, f1, accuracy_score(mask_labels, mask_preds)))
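The -1 labels mark padding; `masked_select` drops them before scoring. A minimal trace:

import torch

labels = torch.tensor([1, 0, -1])
probs = torch.tensor([0.9, 0.2, 0.5])
mask = labels > -1
print(probs.masked_select(mask))  # tensor([0.9000, 0.2000])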