def vectorize(path_data, path_sent, path_label, mode):
    """Convert the 'text'/'label' columns of path_data into model inputs.

    In 'train' mode the embedding matrix and the label-to-index mapping
    are (re)built first; afterwards texts and labels are aligned and
    written to path_sent / path_label.
    """
    texts = flat_read(path_data, 'text')
    tags = flat_read(path_data, 'label')
    is_train = mode == 'train'
    if is_train:
        embed(texts, path_word2ind, path_word_vec, path_embed)
        label2ind(tags, path_label_ind)
    align(texts, tags, path_sent, path_label)
# Example no. 2
def statistic(path_train):
    """Write vocab, text-length and label frequency tables for the train set."""
    texts = flat_read(path_train, 'text')
    tags = flat_read(path_train, 'label')
    corpus = ''.join(texts)
    lengths = [len(text) for text in texts]
    # one frequency table per (output path, data, field) triple
    for path, data, field in ((path_vocab_freq, corpus, 'vocab'),
                              (path_len_freq, lengths, 'text_len'),
                              (path_label_freq, tags, 'label')):
        count(path, data, field)
# Example no. 3
def statistic(path_train):
    """Dump word, document-length and label frequency counts for the train set."""
    docs = flat_read(path_train, 'cut_doc')
    tags = flat_read(path_train, 'label')
    # documents are pre-tokenized with spaces; flatten them into one word list
    tokens = ' '.join(docs).split()
    lengths = [len(doc.split()) for doc in docs]
    count(path_vocab_freq, tokens, 'vocab')
    count(path_len_freq, lengths, 'doc_len')
    count(path_label_freq, tags, 'label')
def vectorize(path_data, path_sent, path_label, mode):
    """Char-split flagged sentences and align them with their labels.

    'train' mode rebuilds the embedding matrix and the label index first.
    """
    raw_texts = flat_read(path_data, 'text')
    flagged = add_flag(raw_texts)
    # split each flagged sentence into a character sequence
    char_seqs = [list(sent) for sent in flagged]
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        embed(char_seqs, path_word_ind, path_word_vec, path_embed)
        label2ind(tags, path_label_ind)
    align(char_seqs, tags, path_sent, path_label)
# Example no. 5
def vectorize_triple(path_data, path_triple):
    """Pickle (anchor, positive, negative) padded sequences for triplet training."""
    columns = ('anc', 'pos', 'neg')
    # read and align each column in turn, keeping the (anc, pos, neg) order
    triples = tuple(align(flat_read(path_data, col)) for col in columns)
    with open(path_triple, 'wb') as f:
        pk.dump(triples, f)
def vectorize(path_data, path_sent, path_label, mode):
    """Pickle padded text sequences and raw labels read from path_data.

    'train' mode also rebuilds the embedding matrix first.
    """
    texts = flat_read(path_data, 'text')
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        embed(texts, path_word2ind, path_word_vec, path_embed)
    padded = align(texts)
    for path, obj in ((path_sent, padded), (path_label, tags)):
        with open(path, 'wb') as f:
            pk.dump(obj, f)
def statistic(path_train):
    """Write frequency tables and print the sentence-count / median-length ratio."""
    texts = flat_read(path_train, 'text')
    tags = flat_read(path_train, 'label')
    lengths = [len(text) for text in texts]
    count(path_vocab_freq, ''.join(texts), 'vocab')
    count(path_len_freq, lengths, 'text_len')
    count(path_label_freq, tags, 'label')
    # ratio of number of sentences to the median sentence length
    ratio = int(len(texts) / np.median(lengths))
    print('sent / word_per_sent: %d' % ratio)
# Example no. 8
def statistic(path_train):
    """Dump poet, title, vocab and text-length frequency tables."""
    poets = flat_read(path_train, 'poet')
    titles = flat_read(path_train, 'title')
    texts = flat_read(path_train, 'text')
    joined = ''.join(texts)
    lengths = [len(text) for text in texts]
    # one frequency table per (output path, data, field) triple
    for path, data, field in ((path_poet_freq, poets, 'poet'),
                              (path_title_freq, titles, 'title'),
                              (path_vocab_freq, joined, 'vocab'),
                              (path_len_freq, lengths, 'text_len')):
        count(path, data, field)
# Example no. 9
def featurize(path_data, path_sent, path_label, mode):
    """Extract merged BOW/SVD features for sentence pairs and pickle them."""
    first = flat_read(path_data, 'text1')
    second = flat_read(path_data, 'text2')
    tags = np.array(flat_read(path_data, 'label'))
    # featurize both columns jointly, then merge the stacked features back
    feats = merge(sent2feat(first + second, path_bow, path_svd, mode))
    with open(path_sent, 'wb') as f:
        pk.dump(feats, f)
    with open(path_label, 'wb') as f:
        pk.dump(tags, f)
# Example no. 10
def vectorize(path_data, path_pair, path_label, mode):
    """Pickle padded (seq1, seq2) pairs and their labels.

    'train' mode rebuilds the embedding matrix from both sentence columns.
    """
    first = flat_read(path_data, 'text1')
    second = flat_read(path_data, 'text2')
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        embed(first + second, path_word2ind, path_word_vec, path_embed)
    pair = (align(first), align(second))
    with open(path_pair, 'wb') as f:
        pk.dump(pair, f)
    with open(path_label, 'wb') as f:
        pk.dump(np.array(tags), f)
# Example no. 11
def featurize(path_data, path_sent, path_label, mode):
    """Pickle BOW/SVD sentence features and integer label indices.

    Args:
        path_data: CSV with 'text' and 'label' columns.
        path_sent: output path for the feature matrix.
        path_label: output path for the label-index array.
        mode: 'train' rebuilds the label-to-index mapping first.
    """
    sents = flat_read(path_data, 'text')
    labels = flat_read(path_data, 'label')
    sent_feats = sent2feat(sents, path_bow, path_svd, mode)
    if mode == 'train':
        label2ind(labels, path_label_ind)
    with open(path_label_ind, 'rb') as f:
        label_inds = pk.load(f)
    # map labels to indices via a comprehension instead of the manual append loop
    inds = np.array([label_inds[label] for label in labels])
    with open(path_sent, 'wb') as f:
        pk.dump(sent_feats, f)
    with open(path_label, 'wb') as f:
        pk.dump(inds, f)
# Example no. 12
def vectorize(path_data, path_sent, path_label, mode):
    """Pickle padded text sequences and integer label indices.

    Args:
        path_data: CSV with 'text' and 'label' columns.
        path_sent: output path for the padded sequence matrix.
        path_label: output path for the label-index array.
        mode: 'train' rebuilds the embedding matrix and label index first.
    """
    sents = flat_read(path_data, 'text')
    labels = flat_read(path_data, 'label')
    if mode == 'train':
        embed(sents, path_word2ind, path_word_vec, path_embed)
        label2ind(labels, path_label_ind)
    pad_seqs = align(sents)
    with open(path_label_ind, 'rb') as f:
        label_inds = pk.load(f)
    # map labels to indices via a comprehension instead of the manual append loop
    inds = np.array([label_inds[label] for label in labels])
    with open(path_sent, 'wb') as f:
        pk.dump(pad_seqs, f)
    with open(path_label, 'wb') as f:
        pk.dump(inds, f)
# Example no. 13
def vectorize(paths, mode, update):
    """Build language-model inputs: shifted (sentence, label) sequences.

    In 'train' mode the embedding matrix is rebuilt, optionally after
    retraining the word vectors when update is true.
    """
    flagged = add_flag(flat_read(paths['data'], 'text'))
    if mode == 'train':
        if update:
            word2vec(flagged, path_word_vec)
        embed(flagged, path_word2ind, path_word_vec, path_embed)
    sents, labels = shift(flagged)
    # CNN input keeps the extra padding; RNN input and labels do not
    align(sents, paths['cnn_sent'], extra=True)
    align(sents, paths['rnn_sent'], extra=False)
    align(labels, paths['label'], extra=False)
# Example no. 14
def merge(names, path_slot_dir, path_extra, path_cut_word):
    """Write the deduplicated union of entity words to a dictionary file.

    Sources: the seed names, every slot file under path_slot_dir, and the
    space-separated 'entity' column of path_extra.

    Args:
        names: seed entity words; no longer mutated by this call.
        path_slot_dir: directory of slot word files.
        path_extra: CSV with an 'entity' column of space-joined words.
        path_cut_word: output path, one entity per line.
    """
    # copy the seed list — the original did `entitys = names` and then
    # extended it, silently mutating the caller's list
    entitys = list(names)
    for file in os.listdir(path_slot_dir):
        entitys.extend(load_word(os.path.join(path_slot_dir, file)))
    for entity_str in flat_read(path_extra, 'entity'):
        entitys.extend(entity_str.split())
    with open(path_cut_word, 'w') as f:
        for entity in set(entitys):
            f.write(entity + '\n')
# Example no. 15
def featurize(path_data, path_sent, mode):
    """Pickle TF-IDF vectors of the cut documents in path_data.

    'train' mode fits and persists the dictionary and TF-IDF model;
    any other mode loads the previously saved ones.
    """
    token_lists = [doc.split() for doc in flat_read(path_data, 'cut_doc')]
    if mode == 'train':
        word2ind = Dictionary(token_lists)
        bow_docs = [word2ind.doc2bow(tokens) for tokens in token_lists]
        tfidf = Tfidf(bow_docs)
        # persist the fitted dictionary and model for non-train modes
        with open(path_word2ind, 'wb') as f:
            pk.dump(word2ind, f)
        with open(path_tfidf, 'wb') as f:
            pk.dump(tfidf, f)
    else:
        with open(path_word2ind, 'rb') as f:
            word2ind = pk.load(f)
        with open(path_tfidf, 'rb') as f:
            tfidf = pk.load(f)
        bow_docs = [word2ind.doc2bow(tokens) for tokens in token_lists]
    with open(path_sent, 'wb') as f:
        pk.dump(tfidf[bow_docs], f)
# Example no. 16
    model = Model([input1, input2, input3], output)
    return model


def load_model(name, embed_mat, seq_len):
    """Build the named architecture and restore its saved weights by layer name."""
    net = define_model(name, embed_mat, seq_len)
    net.load_weights(map_item(name, paths), by_name=True)
    return net


# maximum padded sequence length expected by the models
seq_len = 30

# test-set texts plus their pickled labels and the shared embedding matrix
path_test = 'data/test.csv'
path_label = 'feat/label_test.pkl'
path_embed = 'feat/embed.pkl'
texts = flat_read(path_test, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)

# sentence-pair texts and their pickled padded pairs / flags
# NOTE(review): the *test* pair CSV is loaded next to *_train.pkl feature
# files — confirm this train/test mismatch is intended
path_test_pair = 'data/test_pair.csv'
path_pair = 'feat/pair_train.pkl'
path_flag = 'feat/flag_train.pkl'
text1s = flat_read(path_test_pair, 'text1')
text2s = flat_read(path_test_pair, 'text2')
with open(path_pair, 'rb') as f:
    pairs = pk.load(f)
with open(path_flag, 'rb') as f:
    flags = pk.load(f)
# Example no. 17

def load_model(name, embed_mat, seq_len):
    """Instantiate the named architecture and restore its saved weights."""
    restored = define_model(name, embed_mat, seq_len)
    restored.load_weights(map_item(name, paths))
    return restored


# maximum padded sequence length expected by the models
seq_len = 30

# presumably toggles per-sample reporting during evaluation — TODO confirm
detail = False

# test-set texts plus their pickled labels and the shared embedding matrix
path_test = 'data/test.csv'
path_label = 'feat/label_test.pkl'
path_embed = 'feat/embed.pkl'
texts = flat_read(path_test, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)

# number of target classes, from the index->label mapping defined elsewhere
class_num = len(ind_labels)

# (anchor, positive, negative) triple texts and their pickled sequences
path_test_triple = 'data/test_triple.csv'
path_triple = 'feat/triple_test.pkl'
anc_texts = flat_read(path_test_triple, 'anc')
pos_texts = flat_read(path_test_triple, 'pos')
neg_texts = flat_read(path_test_triple, 'neg')
with open(path_triple, 'rb') as f:
    triples = pk.load(f)
# Example no. 18
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from match import predict

from util import flat_read, map_item


# test-set texts and their gold labels
path_test = 'data/test.csv'
texts = flat_read(path_test, 'text')
labels = flat_read(path_test, 'label')

# deterministic class ordering used for the per-class metric rows
label_set = sorted(list(set(labels)))

class_num = len(label_set)

# per-matcher output files for the metric tables
paths = {'edit': 'metric/edit.csv',
         'cos': 'metric/cos.csv'}


def test(name, texts, labels, thre):
    """Evaluate the named matcher and write per-class precision/recall to CSV.

    Args:
        name: matcher key selecting the prediction backend and output path.
        texts: test sentences.
        labels: gold labels aligned with texts.
        thre: decision threshold forwarded to predict().
    """
    # zip() keeps the original truncate-to-shortest pairing of texts/labels
    preds = [predict(text, name, thre) for text, _ in zip(texts, labels)]
    precs = precision_score(labels, preds, average=None, labels=label_set)
    recs = recall_score(labels, preds, average=None, labels=label_set)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for i in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (label_set[i], precs[i], recs[i]))
    # NOTE(review): f1 is computed but never returned or printed in the
    # visible snippet — it may be truncated; confirm intended use
    f1 = f1_score(labels, preds, average='weighted')
# Example no. 19
from build import tensorize

from classify import ind_labels, models

from util import flat_read, map_item


# run inference on CPU
device = torch.device('cpu')

# presumably toggles per-sample reporting during evaluation — TODO confirm
detail = False

# test-set texts plus their pickled sequence and label data
path_test = 'data/test.csv'
path_sent = 'feat/sent_test.pkl'
path_label = 'feat/label_test.pkl'
texts = flat_read(path_test, 'text')
with open(path_sent, 'rb') as f:
    sents = pk.load(f)
with open(path_label, 'rb') as f:
    labels = pk.load(f)

# number of target classes, from the index->label mapping defined elsewhere
class_num = len(ind_labels)

# per-model output files for the metric tables
paths = {'dnn': 'metric/dnn.csv',
         'cnn': 'metric/cnn.csv',
         'rnn': 'metric/rnn.csv'}


def test(name, sents, labels):
    """Evaluate the named classifier on tensorized test data.

    NOTE(review): the visible body ends right after fetching the model —
    this snippet appears truncated; the evaluation logic is not shown here.
    """
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
# Example no. 20
def fit(path_train):
    """Fit the link-based and frequency-based sentence matchers on the train set."""
    segmented = flat_read(path_train, 'cut_text')
    tags = flat_read(path_train, 'label')
    link_fit(segmented, tags, path_word_sent)
    freq_fit(segmented, path_bow, path_svd, path_sent_vec)
# Example no. 21
def fit(path_train):
    """Fit the rank-based and frequency-based document models on the train set."""
    segmented = flat_read(path_train, 'cut_doc')
    tags = flat_read(path_train, 'label')
    rank_fit(segmented, tags, path_rank)
    freq_fit(segmented, tags, path_freq, path_tfidf)
# Example no. 22
# maximum padded sequence length expected by the models
seq_len = 30

# lexical resources for text normalization
path_stop_word = 'dict/stop_word.txt'
path_type_dir = 'dict/word_type'
path_homo = 'dict/homonym.csv'
path_syno = 'dict/synonym.csv'
stop_word_re = load_word_re(path_stop_word)
word_type_re = load_type_re(path_type_dir)
homo_dict = load_word_pair(path_homo)
syno_dict = load_word_pair(path_syno)

# training texts plus their pickled labels, embedding matrix and word index
path_train = 'data/train.csv'
path_label = 'feat/label_train.pkl'
path_embed = 'feat/embed.pkl'
path_word2ind = 'model/word2ind.pkl'
texts = flat_read(path_train, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)
with open(path_word2ind, 'rb') as f:
    word2ind = pk.load(f)

paths = {'dnn': 'cache/dnn.pkl',
         'cnn': 'cache/cnn.pkl',
         'rnn': 'cache/rnn.pkl'}

# BUG FIX: each model now loads its own cache file; the original loaded the
# 'dnn' cache for all three entries (copy-paste error)
caches = {name: load_cache(map_item(name, paths)) for name in paths}