def test():
    init_embedding = Vocab(WORD_VEC_100).word_vectors
    model = Model(2, 5620, 50, 100, 4, init_embedding=init_embedding)
    print(model.embedding.get_shape())
    print(model.W.get_shape())
    print(model.b.get_shape())
    print(model.lstm_fw_cell)
    print(model.lstm_bw_cell)
    print(model.unary_scores.get_shape())
    print(model.loss.get_shape())
def test():
    init_embedding = Vocab(WORD_VEC_100, train_word=2, bi_gram=False,
                           single_task=False).word_vectors
    model = Model(2, 5620, 50, 100, 4, init_embedding=init_embedding)
    print(model.embedding.get_shape())
    print(model.W.get_shape())
    print(model.b.get_shape())
    print(model.fw_cell)
    print(model.bw_cell)
    print(model.unary_scores.get_shape())
    print(model.loss.get_shape())
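# A minimal way to run the smoke test above when the file is executed
# directly; the __main__ guard itself is not shown in the source and is
# added here only as a usage sketch.
if __name__ == '__main__':
    test()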
import tensorflow as tf
import logging
import sys
from sklearn.metrics import accuracy_score
from voc import Vocab, OOV
from config import WORD_VEC_100, TRAIN_FILE, TEST_FILE, DEV_FILE, DATA_FILE, \
    DROP_OUT, WORD_DICT, MODEL_TYPE, ADV_STATUS
from AdvMulti_model import MultiModel
import data_helpers

# ==================================================

init_embedding = Vocab(WORD_VEC_100, WORD_DICT, single_task=False,
                       bi_gram=True).word_vectors

tf.flags.DEFINE_integer("vocab_size", init_embedding.shape[0], "vocab_size")

# Data parameters
tf.flags.DEFINE_integer("word_dim", 100, "word_dim")
tf.flags.DEFINE_integer("lstm_dim", 100, "lstm_dim")
tf.flags.DEFINE_integer("num_classes", 4, "num_classes")
tf.flags.DEFINE_integer("num_corpus", 9, "num_corpus")
tf.flags.DEFINE_boolean("embed_status", True, "embed_status")
tf.flags.DEFINE_boolean("gate_status", False, "gate_status")
tf.flags.DEFINE_boolean("real_status", True, "real_status")
tf.flags.DEFINE_boolean("train", True, "train_status")

# Model Hyperparameters
tf.flags.DEFINE_float("lr", 0.01, "learning rate (default: 0.01)")
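# The flags above follow the usual TF-1.x pattern: values are registered on
# tf.flags and read back as attributes of tf.flags.FLAGS once parsed. A
# minimal usage sketch -- the print statement is illustrative only and not
# part of the source script:
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("vocab_size=%d  lstm_dim=%d  lr=%g"
      % (FLAGS.vocab_size, FLAGS.lstm_dim, FLAGS.lr))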
                for other_word in other_words
            ])
            if len(batch_x) >= batch_size:
                yield (torch.tensor(batch_x, device=device),
                       torch.tensor(batch_context, device=device),
                       torch.tensor(batch_other, device=device))
                batch_x.clear()
                batch_context.clear()
                batch_other.clear()


device = "cuda:2" if torch.cuda.is_available() else "cpu"

# vocab = Vocab.load_vocab("vocab_test")
vocab = Vocab.load_vocab("vocab_ori")
print(len(vocab))

skip_gram = SkipGram(len(vocab)).to(device)
adam = optim.Adam(skip_gram.parameters())
data_generator = DataGenerator(vocab)

epoch = 5
m = 2
k = 5
batch_size = 1024

for epoch_cnt in range(epoch):
    print("epoch", epoch_cnt)
    words_number = 0
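    # The per-batch training step is not shown in this fragment. The lines
    # below are a hedged sketch of a typical skip-gram step with negative
    # sampling; the generator method name `batches`, its (m, k, batch_size)
    # arguments, and the SkipGram forward signature returning a scalar loss
    # are assumptions, not taken from the source.
    for centers, contexts, negatives in data_generator.batches(m, k, batch_size):
        adam.zero_grad()
        loss = skip_gram(centers, contexts, negatives)  # assumed: scalar NEG loss
        loss.backward()
        adam.step()
        words_number += centers.size(0)
    print("epoch", epoch_cnt, "trained on", words_number, "center words")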
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix
from voc import Vocab, OOV, Tag
from config import WORD_VEC_100, DROP_SINGLE, BI_DIRECTION, BI_GRAM, TASK_NAME, \
    STACK_STATUS, LSTM_NET, WORD_SINGLE, TRAIN_PATH
from Baseline_model import Model
import data_helpers
import logging
from prepare_data_index import Data_index

# ==================================================

print('Generate the words and characters that need to be trained')
VOCABS = Vocab(WORD_VEC_100, WORD_SINGLE, single_task=True, bi_gram=BI_GRAM,
               frequency=5)
TAGS = Tag()
init_embedding = VOCABS.word_vectors
da_idx = Data_index(VOCABS, TAGS)
da_idx.process_all_data(BI_GRAM, multitask=False)

tf.flags.DEFINE_integer("vocab_size", init_embedding.shape[0], "vocab_size")

# Data parameters
tf.flags.DEFINE_integer("word_dim", 100, "word_dim")
tf.flags.DEFINE_integer("lstm_dim", 100, "lstm_dim")
tf.flags.DEFINE_integer("num_classes", 4, "num_classes")

# model names
            data.append(data_sentence)
            label.append(label_sentence)
            src_data_sentence = []
            data_sentence = []
            label_sentence = []
            continue
        # first field: the token from the original sentence
        src_word = line_t[0]
        # second field: the single character (word) from the ','-split form
        word = line_t[1]
        src_data_sentence.append(src_word)
        data_sentence.append(word)
        # last field: the tag (only the part before '_' is kept)
        label_sentence += [line_t[2].split('_')[0]]
    return src_data, data, label


if __name__ == '__main__':
    VOCABS = Vocab('/Users/liangs/Codes/insurance_data/insurance_wordvec.wv',
                   '../data/20_data_v3/vocab.txt',
                   single_task=False, bi_gram=True, frequency=0)
    TAGS = Tag()                          # tag2idx
    init_embedding = VOCABS.word_vectors  # word/char embedding
    da_idx = Data_index(VOCABS, TAGS)
    for i in range(1, 21):
        path = "../data/20_data_v3/" + str(i)
        da_idx.process_all_data(path, path, True, multitask=False)
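# The reader above assumes each non-empty data line splits into at least three
# fields: the token from the original sentence, the segmented single
# character/word, and a tag whose prefix before '_' is kept. A made-up
# illustration of the third field's handling (not a real sample from the data):
#
#     line_t = [src_word, word, "B_TAG"]
#     line_t[2].split('_')[0]   # -> "B"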
from sklearn.metrics import accuracy_score
import argparse
import pathlib
from voc import Vocab, Tag
from config import MODEL_TYPE, ADV_STATUS
from AdvMulti_model import MultiModel
from utils import get_begin_end, get_match_size
import data_helpers

# ==================================================

# Character and word vectors as numpy arrays: the character vectors come from
# pretrained embeddings, and the word vectors are a simple combination of the
# character vectors.
init_embedding = Vocab('../data/insurance_wordvec.wv',
                       '../data/20_data_v1/vocab.txt',
                       single_task=False, bi_gram=True).word_vectors

parser = argparse.ArgumentParser()
parser.add_argument('--vocab_size', default=init_embedding.shape[0], type=int)

# Data parameters
parser.add_argument('--word_dim', default=100, type=int)
parser.add_argument('--lstm_dim', default=100, type=int)
parser.add_argument('--num_classes', default=4, type=int)
parser.add_argument('--num_corpus', default=20, type=int)
parser.add_argument('--embed_status', default=True, type=bool)
parser.add_argument('--gate_status', default=False, type=bool)
parser.add_argument('--embedding_trainable', default=True, type=bool)

# predict or train?
parser.add_argument('--predict', default=False, type=bool)
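# Note on the boolean arguments above: argparse's `type=bool` turns any
# non-empty command-line string (including "False") into True, so the defaults
# behave as written but command-line overrides would not. A common workaround,
# shown only as a hedged sketch (str2bool is not part of the source), is an
# explicit converter:
#
#     def str2bool(v):
#         return str(v).lower() in ('yes', 'true', 't', '1')
#
#     parser.add_argument('--gate_status', default=False, type=str2bool)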
import tensorflow as tf
from voc import Vocab, OOV, Tag
from model import Model
import data_helpers
import logging
from prepare_data_index import Data_index
from collections import defaultdict
from config import TASK_NAME, LOG_PATH, MODEL_DIR, Y_PRED, Y_TRUE, Y_SCORE
from config import TRAIN_DATA_UNI, DEV_DATA_UNI, TEST_DATA_UNI, TEST_WHO
from config import TRAIN_DATA_BI, DEV_DATA_BI, TEST_DATA_BI, MEMORY
from config import WINDOW_SIZE, BIGRAM, UNIGRAM_DIM, UNIGRAM

# ==================================================

print('Generate the words and characters that need to be trained')
VOCABS = Vocab()
TAGS = Tag()
uni_embedding = VOCABS.uni_vectors
bi_embedding = VOCABS.bi_vectors
da_idx = Data_index(VOCABS, TAGS)
da_idx.process_all_data()

# model names
tf.flags.DEFINE_string("model_name", "cws_" + TASK_NAME, "model name")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# Load data
print("Loading data...")
test_file = TEST_DATA_BI