def test():
    init_embedding = Vocab(WORD_VEC_100).word_vectors
    model = Model(2, 5620, 50, 100, 4, init_embedding=init_embedding)
    print(model.embedding.get_shape())
    print(model.W.get_shape())
    print(model.b.get_shape())

    print(model.lstm_fw_cell)
    print(model.lstm_bw_cell)

    print(model.unary_scores.get_shape())

    print(model.loss.get_shape())
def test():
    init_embedding = Vocab(WORD_VEC_100,
                           train_word=2,
                           bi_gram=False,
                           single_task=False).word_vectors
    model = Model(2, 5620, 50, 100, 4, init_embedding=init_embedding)
    print(model.embedding.get_shape())
    print(model.W.get_shape())
    print(model.b.get_shape())

    print(model.fw_cell)
    print(model.bw_cell)

    print(model.unary_scores.get_shape())

    print(model.loss.get_shape())
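
# Hypothetical entry point for the smoke test above; not part of the original
# snippet, but it mirrors the __main__ guard used in a later example on this page:
if __name__ == '__main__':
    test()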
import tensorflow as tf
import logging
import sys

from sklearn.metrics import accuracy_score

from voc import Vocab, OOV
from config import WORD_VEC_100, TRAIN_FILE, TEST_FILE, DEV_FILE, DATA_FILE, DROP_OUT, WORD_DICT, MODEL_TYPE, ADV_STATUS

from AdvMulti_model import MultiModel
import data_helpers

# ==================================================

init_embedding = Vocab(WORD_VEC_100,
                       WORD_DICT,
                       single_task=False,
                       bi_gram=True).word_vectors
tf.flags.DEFINE_integer("vocab_size", init_embedding.shape[0], "vocab_size")

# Data parameters
tf.flags.DEFINE_integer("word_dim", 100, "word_dim")
tf.flags.DEFINE_integer("lstm_dim", 100, "lstm_dim")
tf.flags.DEFINE_integer("num_classes", 4, "num_classes")
tf.flags.DEFINE_integer("num_corpus", 9, "num_corpus")
tf.flags.DEFINE_boolean("embed_status", True, "gate_status")
tf.flags.DEFINE_boolean("gate_status", False, "gate_status")
tf.flags.DEFINE_boolean("real_status", True, "real_status")
tf.flags.DEFINE_boolean("train", True, "train_status")

# Model Hyperparameters
tf.flags.DEFINE_float("lr", 0.01, "learning rate (default: 0.01)")
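
# A minimal sketch of how these flag definitions are typically consumed; the
# consuming code is cut off in this snippet, and the optimizer line below is
# only an illustration, not necessarily what this repo does:
FLAGS = tf.flags.FLAGS
print(FLAGS.lr, FLAGS.lstm_dim)               # parsed flag values
optimizer = tf.train.AdamOptimizer(FLAGS.lr)  # e.g. feed the learning rate in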
Example No. 4
                        for other_word in other_words
                    ])

                    if (len(batch_x) >= batch_size):
                        yield (torch.tensor(batch_x, device=device),
                               torch.tensor(batch_context, device=device),
                               torch.tensor(batch_other, device=device))
                        batch_x.clear()
                        batch_context.clear()
                        batch_other.clear()


import torch
from torch import optim

device = "cuda:2" if torch.cuda.is_available() else "cpu"

# vocab = Vocab.load_vocab("vocab_test")
vocab = Vocab.load_vocab("vocab_ori")
print(len(vocab))
skip_gram = SkipGram(len(vocab)).to(device)
adam = optim.Adam(skip_gram.parameters())
data_generator = DataGenerator(vocab)

epoch = 5
m = 2
k = 5
batch_size = 1024

for epoch_cnt in range(epoch):

    print("epoch", epoch_cnt)

    words_number = 0
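
# The top of the batch generator above is truncated. Below is a self-contained
# sketch of the same batching pattern for skip-gram with negative sampling,
# with hypothetical names (m = window size, k = negatives per pair); a real
# implementation would sample negatives from the unigram^0.75 distribution:
import random

def generate_batches(token_ids, vocab_size, m=2, k=5, batch_size=1024, device="cpu"):
    batch_x, batch_context, batch_other = [], [], []
    for i, center in enumerate(token_ids):
        lo, hi = max(0, i - m), min(len(token_ids), i + m + 1)
        for j in range(lo, hi):
            if j == i:
                continue  # skip the center word itself
            other_words = [random.randrange(vocab_size) for _ in range(k)]
            batch_x.append(center)               # center word id
            batch_context.append(token_ids[j])   # positive context id
            batch_other.append(other_words)      # negative sample ids
            if len(batch_x) >= batch_size:
                yield (torch.tensor(batch_x, device=device),
                       torch.tensor(batch_context, device=device),
                       torch.tensor(batch_other, device=device))
                batch_x.clear()
                batch_context.clear()
                batch_other.clear()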
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix

from voc import Vocab, OOV, Tag
from config import WORD_VEC_100, DROP_SINGLE, BI_DIRECTION, BI_GRAM, TASK_NAME, STACK_STATUS, LSTM_NET, WORD_SINGLE, TRAIN_PATH

from Baseline_model import Model
import data_helpers
import logging
from prepare_data_index import Data_index

# ==================================================
print('Generating the words and characters to be trained')
VOCABS = Vocab(WORD_VEC_100,
               WORD_SINGLE,
               single_task=True,
               bi_gram=BI_GRAM,
               frequency=5)
TAGS = Tag()
init_embedding = VOCABS.word_vectors
da_idx = Data_index(VOCABS, TAGS)
da_idx.process_all_data(BI_GRAM, multitask=False)

tf.flags.DEFINE_integer("vocab_size", init_embedding.shape[0], "vocab_size")

# Data parameters
tf.flags.DEFINE_integer("word_dim", 100, "word_dim")
tf.flags.DEFINE_integer("lstm_dim", 100, "lstm_dim")
tf.flags.DEFINE_integer("num_classes", 4, "num_classes")

# model names
                data.append(data_sentence)
                label.append(label_sentence)
                src_data_sentence = []
                data_sentence = []
                label_sentence = []
                continue
            # First comes the original token
            src_word = line_t[0]
            # Then the segmented character (word)
            word = line_t[1]
            src_data_sentence.append(src_word)
            data_sentence.append(word)
            # Finally the tags
            label_sentence += [line_t[2].split('_')[0]]

        return src_data, data, label
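
# The top of the reader above is truncated. Here is a self-contained sketch
# under the same assumptions (one "src_word word tag_suffix" triple per
# whitespace-split line, blank lines separating sentences); the function name
# is illustrative, not the repo's actual helper:
def read_corpus_sketch(path):
    src_data, data, label = [], [], []
    src_sent, data_sent, label_sent = [], [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line_t = line.strip().split()
            if not line_t:  # blank line closes the current sentence
                if data_sent:
                    src_data.append(src_sent)
                    data.append(data_sent)
                    label.append(label_sent)
                    src_sent, data_sent, label_sent = [], [], []
                continue
            src_sent.append(line_t[0])                  # original token
            data_sent.append(line_t[1])                 # segmented unit
            label_sent.append(line_t[2].split('_')[0])  # tag prefix
    return src_data, data, label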


if __name__ == '__main__':
    VOCABS = Vocab('/Users/liangs/Codes/insurance_data/insurance_wordvec.wv',
                   '../data/20_data_v3/vocab.txt',
                   single_task=False,
                   bi_gram=True,
                   frequency=0)
    TAGS = Tag()  # tag2idx
    init_embedding = VOCABS.word_vectors  # word/char embedding
    da_idx = Data_index(VOCABS, TAGS)
    for i in range(1, 21):
        path = "../data/20_data_v3/" + str(i)
        da_idx.process_all_data(path, path, True, multitask=False)
Example No. 7
from sklearn.metrics import accuracy_score
import argparse
import pathlib
from voc import Vocab, Tag
from config import MODEL_TYPE, ADV_STATUS

from AdvMulti_model import MultiModel
from utils import get_begin_end, get_match_size

import data_helpers

# ==================================================
# Character and word vectors as numpy arrays.
# Character vectors use pretrained embeddings; word vectors are simple
# combinations of the character vectors.
init_embedding = Vocab('../data/insurance_wordvec.wv',
                       '../data/20_data_v1/vocab.txt',
                       single_task=False,
                       bi_gram=True).word_vectors
parser = argparse.ArgumentParser()
parser.add_argument('--vocab_size', default=init_embedding.shape[0], type=int)

# Data parameters
parser.add_argument('--word_dim', default=100, type=int)
parser.add_argument('--lstm_dim', default=100, type=int)
parser.add_argument('--num_classes', default=4, type=int)
parser.add_argument('--num_corpus', default=20, type=int)
parser.add_argument('--embed_status', default=True, type=bool)
parser.add_argument('--gate_status', default=False, type=bool)
parser.add_argument('--embedding_trainable', default=True, type=bool)

# predict or train?
parser.add_argument('--predict', default=False, type=bool)
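
# Note: argparse's type=bool is a known pitfall -- any non-empty string is
# truthy, so "--predict False" on the command line would still yield True.
# A hedged alternative (not what this snippet does) is an explicit converter:
def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected')

# parser.add_argument('--predict', default=False, type=str2bool)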
Example No. 8
import tensorflow as tf
from voc import Vocab, OOV, Tag
from model import Model
import data_helpers
import logging
from prepare_data_index import Data_index
from collections import defaultdict

from config import TASK_NAME, LOG_PATH, MODEL_DIR, Y_PRED, Y_TRUE, Y_SCORE
from config import TRAIN_DATA_UNI, DEV_DATA_UNI, TEST_DATA_UNI, TEST_WHO
from config import TRAIN_DATA_BI, DEV_DATA_BI, TEST_DATA_BI, MEMORY
from config import WINDOW_SIZE, BIGRAM, UNIGRAM_DIM, UNIGRAM

# ==================================================
print('Generating the words and characters to be trained')
VOCABS = Vocab()
TAGS = Tag()
uni_embedding = VOCABS.uni_vectors
bi_embedding = VOCABS.bi_vectors
da_idx = Data_index(VOCABS, TAGS)
da_idx.process_all_data()

# model names
tf.flags.DEFINE_string("model_name", "cws_"+TASK_NAME, "model name")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()  # private pre-TF-1.5 API; later 1.x versions expose FLAGS.flag_values_dict()

# Load data
print("Loading data...")
test_file = TEST_DATA_BI