def generate_data(path, tokenizer, char_vcb, word_vcb, is_training=False):
    '''
    Load question/passage pairs, tokenize them and grow the vocabularies.

    Each pair returned by ``data.load_from_file`` is passed to
    ``data.tokenize``; afterwards the pair's ``'question_tokens'`` and
    ``'passage_tokens'`` entries are read. For every token its ``'word'`` is
    added to ``word_vcb`` and every character of that word to ``char_vcb``.
    The largest question and passage token counts are stored on
    ``cfg.max_query_length`` and ``cfg.max_passage_length``.

    Args:
        path: Forwarded to ``data.load_from_file``.
        tokenizer: Forwarded to ``data.tokenize``.
        char_vcb: Object with an ``add`` method; receives single characters.
        word_vcb: Object with an ``add`` method; receives whole words.
        is_training: Forwarded to both the loader and the tokenizer.

    Returns:
        The loaded pairs (the same object ``data.load_from_file`` returned).
    '''
    qp_pairs = data.load_from_file(path=path, is_training=is_training)

    for qp_pair in qp_pairs:
        data.tokenize(qp_pair, tokenizer, is_training)
        # Questions and passages feed the vocabularies in exactly the same way.
        for field in ('question_tokens', 'passage_tokens'):
            for token in qp_pair[field]:
                word = token['word']
                word_vcb.add(word)
                for char in word:
                    char_vcb.add(char)

    # default=0 keeps an empty dataset from raising ValueError in max().
    cfg.max_query_length = max(
        (len(pair['question_tokens']) for pair in qp_pairs), default=0)
    cfg.max_passage_length = max(
        (len(pair['passage_tokens']) for pair in qp_pairs), default=0)

    return qp_pairs
def predictAnswer(datas, model, output="results.txt"):
    """Extract answers for every query, write them to a file and log accuracy.

    For each item in ``datas`` the model extracts one candidate answer per
    passage. A query counts as right when the reference answer is among the
    non-empty candidates. One tab-separated line per query is written to
    ``output``.

    Args:
        datas: Sized iterable of dicts with the keys ``'query'``,
            ``'passages'`` (each a dict with ``'passage_text'``) and
            ``'answer'``.
        model: Object exposing ``extractAnswer(query, passage_text)`` that
            returns an ``(answer, score)`` pair.
        output: Path of the UTF-8 result file; it is overwritten.
    """
    log('predicting answer.')
    right = 0
    with open(output, "w", encoding='utf-8') as f:
        for line in datas:
            query = line['query']
            passages = line['passages']
            true_answer = line['answer']
            log("predict answer: question ~ " + query)
            log("answer ~ " + true_answer)

            my_answers = []
            for passage in passages:
                passage_text = passage['passage_text']
                my_answer, _score = model.extractAnswer(query, passage_text)
                bm25_score = bm25_model.compute_score(
                    tokenize(query), tokenize(passage_text))
                # Truthiness test instead of `is ''`: identity comparison
                # against a literal is unreliable.
                if not my_answer:
                    continue
                # The original appended to an undefined `answers`, so
                # `my_answers` stayed empty and nothing was ever counted right.
                my_answers.append(my_answer)
                print(my_answer, bm25_score, true_answer in passage_text)

            if true_answer in my_answers:
                right += 1

            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the written line is deterministic (a set is not).
            unique_answers = list(dict.fromkeys(my_answers))
            f.write(query + "\tTrue Answer:" + true_answer
                    + "\tPredict Answer:" + "\t".join(unique_answers) + "\n")

    total = len(datas)
    # Guard against ZeroDivisionError when there is nothing to evaluate.
    accuracy = right / total if total else 0.0
    log("right:%s/%s, acc:%s" % (right, total, accuracy))
def build_text():
    """Write the tokenized post/response pairs to ``text_path``.

    Each output line holds the space-joined tokens of the post, a tab, the
    space-joined tokens of the response, and a newline. Only the first
    element of each post and response is tokenized.
    """
    # get_vocab() returns three values; only the middle one is needed here.
    _, vocab, _ = get_vocab()

    def as_text(item):
        # Tokenize the first element and join the tokens with single spaces.
        return " ".join(tokenize(item[0], vocab=vocab))

    with open(text_path, 'w', encoding='utf-8') as out:
        for post, resp in load_data():
            post_text = as_text(post)
            resp_text = as_text(resp)
            out.write("\t".join((post_text, resp_text)) + "\n")
def main():
    """Generate text from a trained model for one or more seed texts.

    Loads the corpus dictionary and the model named by ``args``, builds a
    ``Sampler`` and, for every seed text (from ``args.seed_text`` and/or
    ``args.seed_file``; a single empty seed if neither gives one), either

    * prints ``args.num_words`` generated words, 20 per line, or
    * prints every sentence produced by ``sampler.sentences`` when
      ``args.num_words`` is falsy.
    """
    dictionary = data.Corpus(args.data, cuda=args.cuda, yield_sentences=True,
                             rng=None).dictionary

    mdl = model.load(args.model)
    mdl.softmax = nn.Softmax()
    mdl = mdl.cuda() if args.cuda else mdl.cpu()
    mdl.eval()

    sampler = Sampler(dictionary, mdl)

    seed_texts = []
    if args.seed_text != '':
        seed_texts.append(args.seed_text)
    if args.seed_file:
        with codecs.open(args.seed_file, 'r', 'utf-8') as f:
            seed_texts.extend(line.strip() for line in f)
    if not seed_texts:
        # Always run at least once, with an empty seed.
        seed_texts.append('')

    def tokenizer_fn(s):
        # Turn raw text into dictionary ids without BOS/EOS markers.
        return dictionary.words_to_ids(
            data.tokenize(s, add_bos=False, add_eos=False), cuda=args.cuda)

    for seed_text in seed_texts:
        if args.print_seed_text:
            print(seed_text)
        if not args.seed_without_eos:
            seed_text = '<eos> ' + seed_text + ' <eos>'

        # NOTE(security): eval() executes arbitrary Python taken from the
        # command line. Acceptable only while the argument is trusted.
        # Evaluated once per seed, as in the original; presumably so each
        # seed gets fresh constraint objects - confirm before hoisting.
        constraints = eval(args.constraint_list)
        for c in constraints:
            if isinstance(c, SeedTextDictConstraint):
                c.set_seed_text(seed_text, tokenizer_fn, dictionary)

        if args.num_words:
            # Generate N words, 20 per output line.
            out_file = sys.stdout
            out_string = sampler.string(seed_text, args.prefix_text,
                                        args.num_words,
                                        constraints=constraints)
            words_written = 0
            for i, word in enumerate(out_string):
                out_file.write(word + ('\n' if i % 20 == 19 else ' '))
                words_written = i + 1
            # Finish a partially filled last line. Counting the words avoids
            # reading the loop variable after the loop, which raised
            # UnboundLocalError when nothing was generated.
            if words_written % 20 != 0:
                print('')
        else:
            # Beam search on sentences.
            for batch in sampler.sentences(seed_text, args.prefix_text,
                                           constraints=constraints):
                for sent_tokens in batch:
                    print(' '.join(sent_tokens))
def sent2ids(sent):
    """Convert a raw sentence into padded vocabulary ids plus length arrays.

    The sentence is segmented with pynlpir, re-joined with spaces, tokenized
    against the module-level ``vocab`` and mapped to ids (id 1 for words
    missing from ``vocab``). The words and the ids are printed.

    Returns:
        A 3-tuple: ``padding([ids], max_len=20)``, a one-element array holding
        the number of ids, and a one-element array holding 20.
    """
    # pynlpir is the active segmenter; pos_tagging=False is passed through.
    segments = pynlpir.segment(sent, pos_tagging=False)
    spaced_sentence = " ".join(segments)

    words = tokenize(spaced_sentence, vocab)
    print(words)

    ids = []
    for token in words:
        ids.append(vocab.get(token, 1))
    print(ids)

    # Capture the length before padding() sees the list.
    id_count = len(ids)
    padded = padding([ids], max_len=20)
    return padded, np.array([id_count]), np.array([20])
def reply(self, query):
    '''
    Produce a reply to `query`, picking the argmax token at every step.

    A non-list query is lower-cased, tokenized, cut to MAX_NUM_TOKENS,
    wrapped and padded to the encoder length; a list is used as given.
    Decoding stops at END_UTTERANCE or once the reply holds
    `self.max_decoder_seq_length` words. Returns the words joined by spaces.
    '''
    if not isinstance(query, list):
        raw_tokens = tokenize(query.lower())[:MAX_NUM_TOKENS]
        query = pad_tokens(wrap_utterance(raw_tokens),
                           self.max_encoder_seq_length)

    # One-hot encode the query for the encoder.
    encoder_shape = (1, self.max_encoder_seq_length, self.num_encoder_tokens)
    encoder_input = np.zeros(encoder_shape, dtype='float32')
    for position, token in enumerate(query):
        encoder_input[0, position, self.target_mapper.tok2num[token]] = 1

    # The encoder state seeds the decoder state.
    enc_out, dec_state = self.encoder_model.predict(encoder_input)

    # The decoder starts from the one-hot start-of-utterance token.
    target_sequence = np.zeros((1, 1, self.num_decoder_tokens))
    target_sequence[0, 0, self.target_mapper.tok2num[START_UTTERANCE]] = 1

    never_emitted = (START_UTTERANCE, END_UTTERANCE, PAD_TOKEN)
    words = []
    while True:
        output_tokens, _attention, dec_state = self.decoder_model.predict(
            [enc_out, dec_state, target_sequence])

        sample = np.argmax(output_tokens[0, -1, :])
        word = self.target_mapper.num2tok[sample]

        finished = word == END_UTTERANCE
        full = len(words) >= self.max_decoder_seq_length
        if finished or full:
            break

        if word not in never_emitted:
            words.append(word)

        # Feed the chosen token back in as the next decoder input.
        target_sequence = np.zeros((1, 1, self.num_decoder_tokens))
        target_sequence[0, 0, sample] = 1

    return ' '.join(words)
def track(self, tracking_log_file_name=None, output_len_accuracy=False):
    """Run the tracker over every dialog in ``self.data``.

    For each turn, the scored messages are broken into 1-, 2- and 3-grams
    whose weights accumulate ``exp(score)``; the five heaviest n-grams are
    passed to ``self._update`` together with the running dialog state. The
    exported state is recorded after each user turn.

    ``tracking_log_file_name`` and ``output_len_accuracy`` are accepted but
    not used in this method.

    Returns a list with one dict per dialog, holding ``'session-id'`` and
    ``'turns'`` (the exported states recorded for that dialog).

    NOTE(review): this is Python 2 code (print statements, ``iteritems``,
    list-returning ``zip``). The original indentation was lost; the nesting
    below is a reconstruction - verify the placement of the print statements.
    """
    result = []
    for dialog in self.data:
        print '>>>'
        # A fresh state per dialog.
        state = DialogState()
        output = []
        d_data = zip(dialog.messages, dialog.actors, dialog.states)
        for msgs, actor, true_state in d_data:
            word_scores = collections.defaultdict(float)
            if actor == Dialog.ACTOR_USER:
                # Drops the first entry of a user turn - reason not visible
                # here; TODO confirm against the data format.
                msgs = msgs[1:]
            print msgs[0]
            for msg, score in msgs:
                tokens = list(data.tokenize(msg))
                # Unigrams (plain tokens) plus bigram and trigram tuples;
                # concatenating with zip() relies on Python 2 returning lists.
                for n_gram in tokens + zip(tokens, tokens[1:]) + zip(
                        tokens, tokens[1:], tokens[2:]):
                    word_scores[ngramstr(n_gram)] += np.exp(score)
            # Keep only the five highest-weighted n-grams.
            x = []
            for ngram, score in sorted(word_scores.iteritems(),
                                       key=lambda x: x[1],
                                       reverse=True)[:5]:
                x.append((ngram, score))
            self._update(state, x)
            if actor == Dialog.ACTOR_USER:
                output.append(state.export())
                print ' S:', state.export()
                print ' TS:', true_state
        result.append({
            'session-id': dialog.session_id,
            'turns': output
        })
    return result
import sys sys.path.append('/data/test_rc/') import data import os test_rc_path = '/data/test_rc' for filename in os.listdir(test_rc_path): file = open(os.path.join(test_rc_path, filename)) while True: line = file.readline() token = data.tokenize(line) with open('tok_' + file, 'w') as w: w.write( for file in files open(file): tokens = data.tokenize(file) print(tokens) files = [] for i in os.listdir(path_to_folder): if i.endswith('.txt'): files.append(open(i)) # do what you want with all these open files
output, hidden = model(input, hidden) output = output.squeeze() output = softmax(output, dim=0) p = output[current_idx].data # 概率 total_p += math.log(p) #e为底 return math.exp(-total_p * (1 / sentence_len)) def evaluate(model, test_dataset, dict): ppl = 0 for sentence in test_dataset: ppl += evaluate_iter(model, sentence, dict) ppl = ppl / len(test_dataset) print("evaluation ppl:", ppl) return ppl if __name__ == '__main__': dataset = data.get_dataset(file_path) dict = data.build_dict(dataset) config.vocab_size = len(dict) train_dataset, test_dataset = data.split_data( dataset, train_proportion=config.train_proportion) train_tokens = data.tokenize(train_dataset, dict) model = RNNModel(config) train_batch_source = data.batchify(train_tokens, config.batch_size) #传入batchify好的数据直接训练 train(model, batch_source=train_batch_source) #test evaluate(model, test_dataset, dict)
noteStateMatrixToMidi(composition[song_idx], 'output/sample_' + str(song_idx)) for song_idx in range(real_comp.shape[0]): noteStateMatrixToMidi(real_comp[song_idx], 'output/real_sample_' + str(song_idx)) if __name__ == '__main__': inputs = tf.placeholder(tf.int32, shape=[None, hp.MAX_LEN]) labels = tf.placeholder(tf.int32, shape=[None, hp.MAX_LEN]) mask = tf.placeholder(tf.float32, shape=[None, hp.MAX_LEN]) dropout = tf.placeholder(tf.float32, shape=()) pieces, seqlens = load_pieces("data/roll/jsb8.pkl") token2idx, idx2token = build_vocab(pieces) pieces = tokenize(pieces, token2idx, idx2token) m = model.Model(inputs=inputs, labels=labels, mask=mask, dropout=dropout, token2idx=token2idx, idx2token=idx2token) # train(model=m, # pieces=pieces, # token2idx=token2idx, # epochs=500000, # save_name="model/jsb8_30/model_", # load_name="model/jsb8/model_0.0013442965-210500")\ # train(model=m, # pieces=pieces, # token2idx=token2idx,