def infer():
    tar_id2vocab, BOS_ID, EOS_ID = get_vocab(args.dataset, args.batch_size)
    vocab_size = len(tar_id2vocab)
    print(args)
    net = VAESeq2SeqInferModel(args.embed_dim, args.hidden_size,
                               args.latent_size, vocab_size)
    model = paddle.Model(net)
    model.prepare()
    model.load(args.init_from_ckpt)

    # Seed decoding with a batch of BOS tokens.
    infer_output = paddle.ones((args.batch_size, 1), dtype='int64') * BOS_ID
    space_token = ' '
    line_token = '\n'
    with io.open(args.infer_output_file, 'w', encoding='utf-8') as out_file:
        predict_lines = model.predict_batch(infer_output)[0]
        for line in predict_lines:
            # Cut each prediction at the first EOS token before writing it out.
            end_id = -1
            if EOS_ID in line:
                end_id = np.where(line == EOS_ID)[0][0]
            new_line = [tar_id2vocab[e[0]] for e in line[:end_id]]
            out_file.write(space_token.join(new_line))
            out_file.write(line_token)
def build_text():
    iw, vocab, _ = get_vocab()
    with open(text_path, 'w', encoding='utf-8') as f:
        data = load_data()
        for post, resp in data:
            post = " ".join(tokenize(post[0], vocab=vocab))
            resp = " ".join(tokenize(resp[0], vocab=vocab))
            f.write(post + "\t" + resp + "\n")
def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        # Reuse the cached embedding matrix if it was built before.
        return np.load(open(emb_path, 'rb'))
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(list(vocab.keys()))
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            # Skip the reserved slots (indices 0 and 1) and words missing from word2vec.
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb
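# Minimal usage sketch (not from the original file): fetch the cached/derived matrix
# and look up one word's vector through the vocab index. The example token and the
# fallback index 1 (assumed to be the unknown-word slot, matching the indices skipped
# above) are illustrative only.
_, demo_vocab, _ = get_vocab()
demo_emb = get_embedding()
demo_idx = demo_vocab.get("北京", 1)
print(demo_emb.shape, demo_emb[demo_idx][:5])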
def __init__(self, args):
    super(DCTTS, self).__init__()
    self.args = args
    self.embed = nn.Embedding(len(data.get_vocab(args.lang)), args.Ce, padding_idx=0)
    self.TextEnc = TextEncoder(d_in=args.Ce, d_out=args.Cx * 2, d_hidden=args.Cx * 2)
    self.AudioEnc = AudioEncoder(d_in=args.n_mels, d_out=args.Cx, d_hidden=args.Cx)
    self.Attention = DotProductAttention(d_hidden=args.Cx)
    self.AudioDec = AudioDecoder(d_in=args.Cx * 2, d_out=args.n_mels, d_hidden=args.Cy)
    self.PostNet = PostNet(d_in=args.n_mels, d_out=args.n_mels, d_hidden=args.Cx)
    self.F0Enc = nn.Sequential(
        nn.Linear(1, 32),
        nn.ReLU(),
        nn.Linear(32, args.Cx * 2),
        nn.Tanh(),
    )
import tensorflow as tf
import numpy as np
from data import get_vocab, load_train_data_pipe, data_iter_combine
from model_combine import COMBINE

sen_len = 40
label_num = 5
sparse_len = 140
crf_num = 10720
learning_rate = 0.02
batch_size = 10
num_epoch = 10
dropout = True

print('read vocab ...')
vocab_size, embedding_size, embedding, vocab_w2i = get_vocab()
num_hidden = embedding_size

# BIO-style tag set: label name -> id, plus the reverse mapping.
label_onehot = {
    'o': 0,
    'b-person': 1,
    'i-person': 2,
    'b-organization': 3,
    'i-organization': 4,
}
label_hotone = {v: k for k, v in label_onehot.items()}
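# Illustrative sketch (not from the original file): round-trip a toy tag sequence
# through the two label maps defined above.
demo_tags = ['o', 'b-person', 'i-person', 'o']
demo_ids = [label_onehot[t] for t in demo_tags]
print([label_hotone[i] for i in demo_ids])  # ['o', 'b-person', 'i-person', 'o']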
# coding:utf-8
import tensorflow as tf
from data import get_train_data, get_vocab, split_data, response_len, post_len, padding
import random
import os
from pprint import pprint
import numpy as np
import time

id2w, w2id, freq = get_vocab()

from emo_cls.classification import Classification
from seq2seq_attention_9emo import Seq2SeqAttentionMinDis, Seq2SeqAttentionMaxDis, Seq2SeqAttentionEmoContent
from seq2seq_attention_9emo import Seq2SeqAttentionHappy, Seq2SeqAttentionSad, Seq2SeqAttentionAnger, Seq2SeqAttentionDisgust
from seq2seq_attention_9emo import Seq2SeqAttentionLike  # ,Seq2SeqAttentionSurprise,Seq2SeqAttentionFear

train_datas, val_datas, test_datas = split_data()
keys = ['posts', 'postLen', 'resps', 'respLen', 'resp_tfidf']
train_datas = [train_datas[k] for k in keys]
val_datas = [val_datas[k] for k in keys]
print("train num:%s" % len(train_datas[0]))

seq_len = 20
batch_size = 128
D_step = 5
G_step = 1
is_debug = True
# Emotion Classifier
import random
import numpy as np
import data  # provides load_data, get_vocab, get_sense_ass, fill_with_gloves, convert_to_numeric

train_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_train.txt'
trial_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_trial.txt'
test_data_file = 'D:\\nlp\\我的实验\\句子相似度\\sick\\SICK_test_annotated.txt'

train_data = data.load_data(train_data_file)
trial_data = data.load_data(trial_data_file)
test_data = data.load_data(test_data_file)
train_data = train_data + trial_data
print('train data size: ', len(train_data))
print('test data size: ', len(test_data))

vocab, vocab_size, word_to_id, id_to_word, word_to_count = data.get_vocab(
    train_data + test_data)
print('vocab size: ', vocab_size)

word_to_senses_path = 'D:\\nlp\\我的实验\\多义词\\trained_40w\\word_to_sense.txt'
# word_to_vector_path = 'D:\\nlp\\我的实验\\多义词\\trained_40w\\word_vectors.txt'
sense_to_vector_path = 'D:\\nlp\\我的实验\\多义词\\trained_40w\\sense_vectors.txt'
ass_vector = data.get_sense_ass(word_to_senses_path, sense_to_vector_path)
init_emb = data.fill_with_gloves(word_to_id, ass_vector)
# print(init_emb)
print('Embedding Size: %d' % init_emb.shape[1])

train_ndata = data.convert_to_numeric(train_data, word_to_id)
test_ndata = data.convert_to_numeric(test_data, word_to_id)
# model
import jieba
import numpy as np

# tokenizer: thulac
import thulac
thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging
text = thu1.cut("我爱北京天安门", text=True)  # segment a single sentence
print(text)

# tokenizer: pynlpir
import pynlpir
pynlpir.open()
s = '欢迎科研人员、技术工程师、企事业单位与个人参与NLPIR平台的建设工作。'
pynlpir.segment(s, pos_tagging=False)

_, vocab, _ = get_vocab()


def sent2ids(sent):
    # sent = " ".join(jieba.lcut(sent))
    # sent = " ".join(thu1.cut(sent, text=True))
    sent = " ".join(pynlpir.segment(sent, pos_tagging=False))
    words = tokenize(sent, vocab)
    print(words)
    ids = [vocab.get(w, 1) for w in words]
    print(ids)
    l = len(ids)
    return padding([ids], max_len=20), np.array([l]), np.array([20])


if __name__ == "__main__":
loss = train(model=model,
             optimizer=optimizer,
             input_variable=input_variable,
             target_variable=target_variable,
             criterion=criterion)
print_loss_total += loss

if iter % print_every == 0:
    print_loss_avg = print_loss_total / print_every
    print_loss_total = 0
    print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_epochs),
                                 iter, iter / n_epochs * 100, print_loss_avg))

source_vocab, target_vocab = data.get_vocab()

# Actually training the model
model = model_transformer.Transformer(queries_dim=64,
                                      keys_dim=64,
                                      values_dim=64,
                                      model_dim=512,
                                      num_encoder_layers=6,
                                      num_decoder_layers=6,
                                      n_source_vocab=source_vocab,
                                      n_target_vocab=target_vocab,
                                      num_encoder_heads=8,
                                      num_decoder_heads=8)
if use_cuda:
    model = model.cuda()
import numpy as np
from pprint import pprint
from matplotlib import cm
import matplotlib.pyplot as plt
import data
from data import get_vocab, get_padded_train_data, get_predicates, get_questions
from word2vec import get_embedding
import re
import random
from keras import backend as K  # assumed import: K is used as the Keras backend below

base_weight_path = "./weights/"
base_encoded_path = "./datasets/predict/encoded_data"
question_len = data.question_len
predicate_len = data.predicate_len

id2w, vocab = get_vocab()
size = len(vocab)
embedding = get_embedding()
embedding = embedding / np.sqrt(
    (np.sum(np.square(embedding), axis=-1, keepdims=True) + 1e-8))  # L2-normalizing works better here


# cosine similarity
def cosine(x1, x2):
    return K.sum(x1 * x2, axis=-1) / (
        K.sqrt(K.sum(x1 * x1, axis=-1) * K.sum(x2 * x2, axis=-1) + 1e-12))


# cos
def neg_log_loss(x):
    cos1 = x[0]
group.add_argument("--meta_dev", type=str, default="metaphor_data/validation.csv") group.add_argument("--meta_test", type=str, default="metaphor_data/test.csv") group.add_argument("--output", type=str, default="output.tsv") args = vars(parser.parse_args()) logging.info(args) (meta_train, meta_dev, meta_test) = \ (args["meta_train"], args["meta_dev"], args["meta_test"]) # Set seed to combat random effects set_seed(args["seed"]) vocab, sentences = get_vocab(meta_train, meta_dev, meta_test) bert = args["model"] == "bert" # Metaphor data filenames if args["dev"]: (meta_test, meta_dev) = (meta_dev, meta_test) meta_train = get_metaphor_data(meta_train, args["batch_size"], args["k"], bert, train=True) meta_dev = get_metaphor_data(meta_dev, 8, args["k"], bert) meta_test = get_metaphor_data(meta_test, 8, args["k"], bert) # Initialise an empty model and train it.
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix

result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

if __name__ == '__main__':
    build_word_index(args.word_embedding_file, args.src_vocab_file,
                     args.tgt_file, args.tgt_vocab_file)
    src_vocab = get_vocab(args.src_vocab_file)
    src_vocab_size = len(src_vocab)
    src_unknown = src_vocab_size
    src_padding = src_vocab_size + 1
    # print(len(src_vocab))
    # print(vocab_size)
    tgt_vocab = get_vocab(args.tgt_vocab_file)
    tgt_vocab_size = len(tgt_vocab)
    tgt_unknown = tgt_vocab_size
    tgt_padding = tgt_vocab_size + 1
    # print(tgt_vocab)
    embedding = load_word2vec_embedding(args.word_embedding_file,
                                        args.embedding_dim, src_vocab_size)
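    # Illustrative sketch (toy vocab, not from the original file): out-of-vocabulary
    # tokens map to the "unknown" id (== vocab size) and padding uses vocab size + 1,
    # mirroring the src/tgt id scheme set up above.
    toy_vocab = {"hello": 0, "world": 1}
    toy_unknown, toy_padding = len(toy_vocab), len(toy_vocab) + 1
    toy_ids = [toy_vocab.get(t, toy_unknown) for t in ["hello", "there"]] + [toy_padding] * 2
    print(toy_ids)  # [0, 2, 3, 3]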