def predict_text(model, enc_embedding):
    """Decode a batch of predicted embedding sequences into words.

    For every predicted vector the vocabulary word whose embedding has the
    smallest MSE (see calculate_mse) is chosen; distance slot 0 belongs to
    the all-zero padding vector and maps to __VOCAB__[0].
    """
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = tuple(dec_vec_model.wv.vocab.keys())

    prediction = model.predict_on_batch(enc_embedding)
    prediction_words_list = []
    for batch_item in prediction:
        decoded = []
        for vec in batch_item:
            # slot 0: distance to the zero (padding) vector, then one slot per word
            distances = [calculate_mse(vec, np.zeros(data_process.dec_embedding_length))]
            distances.extend(calculate_mse(vec, dec_vec_model.wv[w])
                             for w in dec_useful_words)
            best = int(np.argmin(distances))
            if best == 0:
                decoded.append(data_process.__VOCAB__[0])
            else:
                decoded.append(dec_useful_words[best - 1])
        prediction_words_list.append(decoded)
    return prediction_words_list
def soccer_data_obj():
    """Build a DataProcess configured for the soccer data source.

    Keys on column 1, spreads over columns 6 and 8, and accepts only
    rows with exactly 10 fields.
    """
    validation = lambda line: len(line) == 10
    return DataProcess(SOCCER_URL,
                       key_column_index=1,
                       spread_columns_indexes=(6, 8),
                       validation_func=validation)
def predict_one_text(model, enc_embedding):
    """Decode a single encoder embedding into a list of predicted words.

    Nearest-neighbour search by Euclidean distance over the decoder
    word2vec vocabulary; distance slot 0 is the zero/padding vector and
    maps to __VOCAB__[0].
    """
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())

    prediction = model.predict(enc_embedding, verbose=0)
    decoded_words = []
    for vec in prediction[0]:
        zero_vec = np.zeros(data_process.dec_embedding_length)
        distances = [np.sqrt(np.sum(np.square(zero_vec - vec)))]
        for word in dec_useful_words:
            distances.append(np.sqrt(np.sum(np.square(dec_vec_model.wv[word] - vec))))
        best = int(np.argmin(distances))
        if best == 0:
            decoded_words.append(data_process.__VOCAB__[0])
        else:
            decoded_words.append(dec_useful_words[best - 1])
    return decoded_words
def get_vocabs(filename, type="train"):
    """Collect the lower-cased token vocabulary of a dialog file.

    Iterates every dialog record and accumulates all tokens seen in the
    context and in the candidate utterances into one set.
    """
    data_process = DataProcess(filename, type)
    dialog_iter = data_process.dialog_iter
    vocab = set()
    processed = 0
    while True:
        # data -> (example_id, speaker, context, utterances, target_id, candidates_id)
        data = next(dialog_iter, None)
        if data is None:
            break
        _, _, context, utterances, _, _ = data
        tokenized_context, _ = data_process.tokenize(context)
        tokenized_utterances, _ = data_process.tokenize(utterances)
        for sentence in tokenized_context:
            vocab.update(word.lower() for word in sentence)
        for sentence in tokenized_utterances:
            vocab.update(word.lower() for word in sentence)
        processed += 1
        if processed % 100 == 0:
            print(processed, len(vocab))
    return vocab
def data_to_padding_ids(text_list):
    """Convert raw texts into reversed, left-padded encoder id arrays.

    Each text is word-segmented, mapped to vocabulary ids (unknown words
    fall back to __UNK__), truncated to enc_input_length, reversed, and
    padded on the left with zeros.
    """
    data_process = DataProcess(use_word2cut=True)
    enc_vocab = data_process.read_vocabulary(data_process.enc_vocab_file)
    padded_rows = []
    for text in text_list:
        segmented = data_process.text_cut_object.cut([text.strip()])
        tokens = segmented[0].strip().split()
        ids = [enc_vocab.get(tok, data_process.__UNK__) for tok in tokens]
        ids = ids[:data_process.enc_input_length]
        # zeros first, then the ids in reverse order
        pad_count = data_process.enc_input_length - len(ids)
        row = [0] * pad_count + [int(v) for v in reversed(ids)]
        padded_rows.append(np.array(row))
    return np.array(padded_rows)
def run1():
    """Train a TF-IDF + NN classifier, save it, and report test metrics."""
    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()
    # build model
    tfidf_nn = TFIDFNNClassifier(X)
    tfidf_nn.fit(X_train, Y_train, batch_size=128, epochs=10)
    tfidf_nn.save_model("../model/tfidf_nn_model_epoch10.h5")
    # convert character to numeric
    X_test = tfidf_nn.convert(X_test)
    # predict and evaluate
    prob = tfidf_nn.predict_prob(X_test)
    auc = tfidf_nn.auc(Y_test, prob)
    pred = tfidf_nn.predict(X_test)
    acc = tfidf_nn.accuracy(Y_test, pred)
    f1 = tfidf_nn.f1(Y_test, pred)
    cm = tfidf_nn.confusion_matrix(Y_test, pred)
    # BUG FIX: Python 2 print statements are syntax errors on Python 3
    # (the rest of this file uses print() calls); single-argument print()
    # behaves identically on both interpreters.
    print("the accuracy is : " + str(acc))
    print("the auc is : " + str(auc))
    print("the f1 score is : " + str(f1))
    print("the confusion matrix is : \n")
    print(cm)
def print_score(model, enc_embedding):
    """Print, per output position, the normalised distance distribution.

    For each predicted vector the Euclidean distances to the zero vector
    and to every decoder word embedding are collected and divided by
    their sum, then the whole list of distributions is printed.
    """
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    prediction = model.predict(enc_embedding, verbose=0)
    score_words = []
    for vec in prediction[0]:
        dec_sum = 0
        dec_dis_list = []
        # slot 0: distance to the zero (padding) vector
        dec_dis = np.sqrt(
            np.sum(np.square(np.zeros(data_process.dec_embedding_length) - vec)))
        dec_dis_list.append(dec_dis)
        dec_sum += dec_dis
        for dec_word in dec_useful_words:
            dec_dis = np.sqrt(
                np.sum(np.square(dec_vec_model.wv[dec_word] - vec)))
            dec_dis_list.append(dec_dis)
            dec_sum += dec_dis
        # BUG FIX: a plain Python list cannot be divided by a scalar
        # (TypeError); convert to an ndarray before normalising.
        score_words.append(np.asarray(dec_dis_list) / dec_sum)
    print(score_words)
def run():
    """Train the seq2seq model with generator-fed batches and save weights.

    Uses an adaptive learning rate (ReduceLROnPlateau) and early stopping
    on the validation loss.
    """
    batch_size = 63
    epochs = 5000
    data_process = DataProcess(use_word2cut=False)
    model = build_model()
    documents_length = data_process.get_documents_size(
        data_process.enc_ids_file, data_process.dec_ids_file)
    if batch_size > documents_length:
        # not enough corpus data to fill even one batch
        print("ERROR--->" + u"语料数据量过少,请再添加一些")
        return None
    # Adaptive learning rate: shrink LR by factor 0.2 (down to 1e-6) when
    # the validation loss stops decreasing for 20 epochs.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=20,
                                  min_lr=1e-6,
                                  mode='min')
    # Stop training once validation loss has not improved for 50 epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
    steps = int(documents_length / batch_size) + 5
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=steps,
                        validation_data=generate_batch(batch_size=batch_size),
                        validation_steps=steps,
                        epochs=epochs,
                        verbose=1,
                        workers=2,
                        use_multiprocessing=True,
                        callbacks=[reduce_lr, early_stopping])
    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
def run1():
    """Train an LSTM classifier, save it, and report test metrics."""
    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()
    # build model
    lstm_classifier = LSTMClassifier(X)
    lstm_classifier.fit(X_train, Y_train, batch_size=128, epochs=1)
    lstm_classifier.save_model("../model/lstm_model_epoch1.h5")
    # convert character to numeric
    X_test = lstm_classifier.convert(X_test)
    # predict and evaluate
    prob = lstm_classifier.predict_prob(X_test)
    auc = lstm_classifier.auc(Y_test, prob)
    pred = lstm_classifier.predict(X_test)
    acc = lstm_classifier.accuracy(Y_test, pred)
    f1 = lstm_classifier.f1(Y_test, pred)
    cm = lstm_classifier.confusion_matrix(Y_test, pred)
    # BUG FIX: Python 2 print statements are syntax errors on Python 3
    # (the rest of this file uses print() calls); single-argument print()
    # behaves identically on both interpreters.
    print("the accuracy is : " + str(acc))
    print("the auc is : " + str(auc))
    print("the f1 score is : " + str(f1))
    print("the confusion matrix is : \n")
    print(cm)
def run():
    """Load the trained attention seq2seq model and decode a sample text.

    Rebuilds the model from DataProcess dimensions, restores its weights,
    renders its structure diagram, then prints the decoded words and the
    per-position score distribution for one hard-coded input.
    """
    data_process = DataProcess(use_word2cut=False)
    input_length = data_process.enc_input_length
    output_length = data_process.dec_output_length
    enc_embedding_length = data_process.enc_embedding_length
    dec_embedding_length = data_process.dec_embedding_length

    model = AttentionSeq2Seq(output_dim=dec_embedding_length,
                             hidden_dim=data_process.hidden_dim,
                             output_length=output_length,
                             input_shape=(input_length, enc_embedding_length),
                             batch_size=1,
                             depth=data_process.layer_shape)
    model.compile(loss='mse', optimizer='rmsprop')
    model.load_weights("model/seq2seq_model_weights.h5")
    plot_model(model,
               to_file='model/seq2seq_model_structure.png',
               show_shapes=True,
               show_layer_names=True)

    # alternative sample inputs: u"这围巾要火!", u"你愿意嫁给我吗?"
    text = u"碧水照嫩柳,桃花映春色"
    enc_padding_ids = data_to_padding_ids(text)
    enc_embedding = data_to_embedding(enc_padding_ids)
    prediction_words = predict_one_text(model, enc_embedding)
    print(prediction_words)
    print_score(model, enc_embedding)
def save_pickle_data(filename, pickle_filename):
    """Tokenize every dialogue in *filename* and pickle the token lists.

    Each record appended to *pickle_filename* is a 2-item list:
    [tokenized_context, tokenized_utterances].
    """
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)
    count = 0
    with open(pickle_filename, 'wb') as out_handle:
        while True:
            # data -> (context, utterances, target_id)
            data = next(dialog_iter, None)
            if data is None:
                break
            context, utterances, _ = data
            tokenized_context, _ = data_process.tokenize(context)
            tokenized_utterances, _ = data_process.tokenize(utterances)
            pickle.dump([tokenized_context, tokenized_utterances], out_handle)
            count += 1
            if count % 100 == 0:
                print(count)
    print("%s data save complete!" % count)
def process_partition(iterator):
    """Yield every space-separated token of each line in the partition.

    DataProcess is initialised from the config file; per the original
    note it mainly sets up the hbase client. The get_default call is
    issued once per line before splitting.
    """
    conf = config.Config("process.conf")
    dp = DataProcess(conf)
    for line in iterator:
        dp.get_default("aaaaaaaa")
        for token in line.split(' '):
            yield token
def new_table(cls, data_table, columns):
    """Create a new table from *data_table* via DataProcess.new_table."""
    processor = DataProcess()
    return processor.new_table(data_table, columns)
def merge(cls, df1, df2, on, how="left"):
    """Merge two tables into one on the given key column(s)."""
    processor = DataProcess()
    return processor.merge(df1, df2, on=on, how=how)
def classify_data(cls, data, _type=""):
    """Classify *data* by delegating to DataProcess.classification_data."""
    processor = DataProcess()
    return processor.classification_data(data=data, _type=_type)
def get_student_data(cls, clean_data):
    """Group the cleaned data by student and return the student list."""
    processor = DataProcess()
    return processor.get_student_data(clean_data)
def populateButtonClick(self):
    """Button handler: run DataProcess over the current form entries.

    Reads the invalid-data set and the file paths from the GUI entries,
    executes the processing, and returns True.
    """
    invalid = self.GetInvalidData()
    processor = DataProcess(logf=self.logEntry.get(),
                            csvf=self.csvEntry.get(),
                            ontof=self.ontoEntry.get(),
                            periodf=self.periodEntry.get(),
                            invalid=invalid,
                            sender=self)
    processor.Execute()
    return True
def main(train_src, train_tgt, val_src, val_tgt, test_src, test_tgt):
    """End-to-end driver: prepare data, build, train/validate, then test."""
    # -------- prepare data --------------------------------------------
    data_obj = DataProcess(train_src, train_tgt, val_src, val_tgt,
                           test_src, test_tgt)
    src_tgt, src_lang, tgt_lang = data_obj.get_src_tgt_data()
    # index-encoded sequence pairs for each split
    *_, src_tgt_seq_train = data_obj.word_2_index('train', src_lang, tgt_lang)
    *_, src_tgt_seq_val = data_obj.word_2_index('val', src_lang, tgt_lang)
    *_, src_tgt_seq_test = data_obj.word_2_index('test', src_lang, tgt_lang)
    # batch loaders (test loader sized for batched inference)
    train_data_loader = DataLoader(src_tgt_seq_train, param.batch_size,
                                   True, drop_last=False)
    val_data_loader = DataLoader(src_tgt_seq_val, param.batch_size,
                                 False, drop_last=False)
    test_data_loader = DataLoader(src_tgt_seq_test, param.infer_batch,
                                  True, drop_last=False)
    # -------- build the model -----------------------------------------
    transformer = Transformer(
        input_vocab_num=src_lang.n_words,
        target_vocab_num=tgt_lang.n_words,
        src_max_len=data_obj.src_max_len,
        tgt_max_len=data_obj.tgt_max_len).to(param.device)
    # warmup-scheduled Adam, as in the original Transformer recipe
    optimizer = SpecialOptimizer(
        optimizer=torch.optim.Adam(
            filter(lambda p: p.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        warmup_steps=param.warmup_step,
        d_model=param.d_model)
    criterion = Criterion()
    seq2seq = Sequence2Sequence(transformer=transformer,
                                optimizer=optimizer,
                                criterion=criterion)
    # -------- train / validate ----------------------------------------
    seq2seq.train_val(train_data_loader, val_data_loader)
    # -------- inference on the test split -----------------------------
    seq2seq.inference(test_data_loader, src_lang, tgt_lang,
                      data_obj.tgt_max_len)
def run_evaluate(self, sess, type, data_path, test_case=1):
    """Evaluate Recall@k and MRR over the dataset at *data_path*.

    Streams batches from DataProcess, runs the prediction/logit ops in
    the given TF session, and accumulates Recall@k counts and the mean
    reciprocal rank. Returns (k_list, recall percentages, average MRR).
    """
    data_process = DataProcess(self.hparams,
                               data_path,
                               type,
                               word2id=self.word2id,
                               test_case=test_case)
    k_list = self.hparams.recall_k_list
    total_examples = 0
    total_correct = np.zeros([len(k_list)], dtype=np.int32)
    total_mrr = 0
    index = 0
    while True:
        batch_data = data_process.get_batch_data(
            self.hparams.dev_batch_size, 100)
        if batch_data is None:
            break
        (context, _), (utterances, _), _, _, _, example_id, candidates_id = batch_data
        # keep_prob 1.0 -> dropout disabled during evaluation
        pred_val, _ = sess.run([self.predictions, self.logits],
                               feed_dict=self.make_feed_dict(
                                   batch_data, 1.0))
        pred_val = np.asarray(pred_val)
        # batch_data[2] carries the ground-truth targets for this batch
        num_correct, num_examples = evaluate_recall(
            pred_val, batch_data[2], k_list)
        total_mrr += mean_reciprocal_rank(pred_val, batch_data[2])
        total_examples += num_examples
        total_correct = np.add(total_correct, num_correct)
        # log batches that miss at recall position 5
        # NOTE(review): assumes k_list has at least 6 entries — confirm
        if num_correct[5] != self.hparams.dev_batch_size:
            print(example_id, ":", index, num_correct[5])
        index += 1
        if index % 500 == 0:
            # running accuracy snapshot every 500 batches
            accumulated_accuracy = (total_correct / total_examples) * 100
            print("index : ", index, " | ", accumulated_accuracy)
    # average MRR over all scored examples (batches are assumed full here)
    avg_mrr = total_mrr / (self.hparams.dev_batch_size * index)
    recall_result = ""
    for i in range(len(k_list)):
        recall_result += "Recall@%s : " % k_list[i] + "%.2f%% | " % (
            (total_correct[i] / total_examples) * 100)
    self._logger.info(recall_result)
    self._logger.info("MRR: %.4f" % avg_mrr)
    return k_list, (total_correct / total_examples) * 100, avg_mrr
def get_student_list(self):
    """Load the cleaned CSV and return the per-student data list."""
    loader = Loader()
    processor = DataProcess()
    course = Course_Initialization()
    clean_data = loader.load_csv(
        course.config.get_clean_data_csv_path())
    return processor.get_student_data(clean_data)
def make_valid_data(filename, write_file_name):
    """Render each dialogue into a human-readable validation file.

    For every dialogue, writes each non-empty context turn as
    "len|speaker : sentence", then the answer utterance and per-dialogue
    statistics; finishes with file-level averages.
    """
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)
    input_sum_turn = 0
    input_sum_sentence_len = 0
    with open(write_file_name, "w", encoding='utf-8') as f_handle:
        index = 0
        while True:
            # data -> (speakers, context, utterances, target_id)
            data = next(dialog_iter, None)
            if data is None:
                break
            # BUG FIX: count only real records. The original incremented
            # index before the None check, so the terminating iteration
            # inflated the denominator of the final averages by one.
            index += 1
            speakers, context, utterances, target_id = data
            context_sentence = context[0].split(" __eot__ ")
            f_handle.write("[%d]" % index + "\n")
            sum_sentence_len = 0
            tot_turn = 0
            for i, sentence in enumerate(context_sentence):
                # skip empty turns (and avoid tokenizing them needlessly)
                if len(sentence) == 0:
                    continue
                sentence_len = len(nltk.word_tokenize(sentence))
                sentence_string = speakers[i] + " : " + sentence
                sentence_string = str(sentence_len) + "|" + sentence_string
                f_handle.write(sentence_string + "\n")
                sum_sentence_len += sentence_len
                tot_turn += 1
            # BUG FIX: guard against dialogues whose turns are all empty
            # (the original raised ZeroDivisionError).
            avg_sentence_len = sum_sentence_len / tot_turn if tot_turn else 0.0
            sentence_answer = "Answer : " + utterances[target_id[0]] + "\n"
            f_handle.write(sentence_answer)
            f_handle.write("average sentence length : %.3f" % avg_sentence_len + "\n")
            f_handle.write("total turn number : %d" % tot_turn + '\n')
            f_handle.write("-" * 200 + "\n")
            if index % 500 == 0:
                print(index, ":",
                      "avg_sentence_len - %.3f" % avg_sentence_len,
                      "tot_turn - %d" % tot_turn)
            input_sum_turn += tot_turn
            input_sum_sentence_len += avg_sentence_len
        # BUG FIX: skip the summary on an empty input file instead of
        # dividing by zero.
        if index:
            f_handle.write("average sentence length %.3f" %
                           (input_sum_sentence_len / index))
            f_handle.write("average turn length %.3f" %
                           (input_sum_turn / index))
def rename_columns(cls, data, re_columns, inplase=True):
    """Rename the columns of *data* using the *re_columns* mapping.

    Note: the `inplase` spelling is kept for caller compatibility.
    """
    processor = DataProcess()
    return processor.rename_columns(data=data,
                                    re_columns=re_columns,
                                    inplase=inplase)
def remove_columns(cls, data_table, columns, axis=1):
    """Remove *columns* from *data_table* along the given axis."""
    processor = DataProcess()
    return processor.remove_columns(data_table=data_table,
                                    columns=columns,
                                    axis=axis)
def generate_real_embedding(text_list):
    """Build decoder-side target embeddings for a list of raw texts.

    Each text is segmented, mapped to vocabulary ids, framed with the
    __GO__/__EOS__ markers, right-padded with zeros to dec_output_length,
    and finally converted to word2vec embeddings: known words get their
    vector, the padding token a zero vector, and any other token an
    all-ones vector.
    """
    data_process = DataProcess(use_word2cut=True)
    dec_vocab = data_process.read_vocabulary(data_process.dec_vocab_file)

    dec_padding_ids_list = []
    for text in text_list:
        segmented = data_process.text_cut_object.cut([text.strip()])
        tokens = segmented[0].strip().split()
        ids = [dec_vocab.get(tok, data_process.__UNK__) for tok in tokens]
        # reserve two slots for the GO and EOS markers
        if len(ids) + 2 > data_process.dec_output_length:
            ids = ids[:data_process.dec_output_length - 2]
        row = [data_process.__GO__]
        row.extend(int(v) for v in ids)
        row.append(data_process.__EOS__)
        row.extend([0] * (data_process.dec_output_length - len(ids) - 2))
        dec_padding_ids_list.append(np.array(row))
    padding_ids = np.array(dec_padding_ids_list)

    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    dec_reverse_vec = data_process.read_reverse_vocabulary(
        data_process.dec_vocab_file)

    all_dec_embedding = []
    for one_padding_ids in padding_ids:
        dec_embedding = []
        for token_id in one_padding_ids:
            word = dec_reverse_vec[token_id]
            if word in dec_useful_words:
                word_embedding = dec_vec_model.wv[word]
            elif word == data_process.__VOCAB__[0]:
                # padding token -> zero vector
                word_embedding = np.zeros(data_process.dec_embedding_length)
            else:
                # anything else -> all-ones vector
                word_embedding = np.array(
                    [1.0] * data_process.dec_embedding_length)
            dec_embedding.append(word_embedding)
        all_dec_embedding.append(dec_embedding)
    return np.array(all_dec_embedding)
def monitor(mail_user, mail_pass, mail_to, mail_host):
    """Check tracked goods for price drops and mail a summary.

    Syncs the goods table with in.csv, scrapes the current price of each
    tracked URL, updates the database whenever the price dropped (or was
    never recorded), and sends one notification mail if anything changed.
    Returns False on a CSV sync failure.
    """
    before = time.time()
    print("price monitor start.")
    dataprocess = DataProcess('test.db', 'goods')
    send_message = SendMessage()
    ret = dataprocess.sync_with_csv('in.csv')
    if ret is False:
        print("sync with csv file error.")
        return False
    # dataprocess.add_from_csv('in.csv')
    mail_content = ""
    html_parse = HtmlParse()
    ret, goods_data = dataprocess.get_goods()
    if ret is False:
        html_parse.driver.quit()
        return
    for x in goods_data:
        url = x['url']
        price = x['price']
        ret, data = html_parse.get_goods_data(url)
        if ret is False:
            continue
        price_now, goods_name = data
        print("price_now:" + str(price_now))
        # BUG FIX: check the None / non-positive cases before comparing.
        # The original evaluated `price_now < price` first, which raises
        # TypeError on Python 3 when the stored price is None.
        if price is None or price <= 0 or price_now < price:
            ret = dataprocess.update_good(
                url, price_now, goods_name,
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            if ret is False:
                continue
            mail_content = mail_content + goods_name \
                + "\n降价为:" + str(price_now) + "\n"
    if len(mail_content) != 0:
        mail_sub = "降价通知"
        mailto_list = [mail_to]
        mail_port = 465
        send_message.mail_init(mail_host, mail_user, mail_pass, mail_port)
        send_message.send_mail(mailto_list, mail_sub, mail_content)
    else:
        print("There were no good which price has reduced.")
    html_parse.quit()
    # dataprocess.export_csv("out.csv")
    after = time.time()
    print('Thread ended, asumed time : %.2f' % (after - before))
def calculate_mse(src_vec, des_vec):
    """Sum of squared errors between src_vec and a normalised des_vec.

    des_vec is z-score normalised unless its standard deviation falls
    below data_process.epsilon, in which case a zero vector of length
    dec_embedding_length is compared against instead.
    """
    data_process = DataProcess(use_word2cut=False)
    std_number = np.std(des_vec)
    if (std_number - data_process.epsilon) < 0:
        # (near-)constant vector: fall back to the zero vector
        normalised = np.zeros(data_process.dec_embedding_length)
    else:
        normalised = (des_vec - np.mean(des_vec)) / std_number
    return np.sum(np.square(src_vec - normalised))
def filter_data(cls, origin_log_data, field, field_value=''):
    """Filter a column's values via DataProcess.filter_data."""
    processor = DataProcess()
    return processor.filter_data(origin_log_data=origin_log_data,
                                 field=field,
                                 field_value=field_value)
def __init__(self, mode=0):
    """Select encoder (mode == 0) or decoder (otherwise) configuration.

    Copies the matching embedding length and data file from DataProcess,
    along with the text-segmentation object and the GO/EOS marker tokens.
    """
    self.mode = mode
    data_process = DataProcess()
    self.text_cut_object = data_process.text_cut_object
    if self.mode == 0:
        # encoder side
        self.embedding_length = data_process.enc_embedding_length
        self.file = data_process.enc_file
    else:
        # decoder side
        self.embedding_length = data_process.dec_embedding_length
        self.file = data_process.dec_file
    # sequence start / end marker tokens
    self.__GO__ = data_process.__VOCAB__[1]
    self.__EOS__ = data_process.__VOCAB__[2]
def test_get_min_spread_from_file_error_on_parse(
        self, mock_get_data, mock_log, obj_args):
    """A table that fails to parse must log the explanatory error once."""
    data_obj = DataProcess(*obj_args)
    # fixture: header plus one row whose spread column cannot be parsed
    fixture_table = (
        ' Team P W L D F A Pts\n'
        ' 1. Arsenal 38 26 9 3 79 - 36 87\n'
    )
    mock_get_data.return_value = fixture_table

    data_obj.get_min_spread_from_file()

    mock_get_data.assert_called_once()
    mock_log.assert_called_once_with(
        'Error on parsing the data - please check '
        'properties of DataProcess creation.',
        exc_info=True
    )
def __init__(self, conf, model_dir=".cache/model"):
    """Connect to the database and initialise one model per big class.

    Reads the (business_system_code, rule_type_code) pairs returned by
    DataProcess.get_big_class and calls self.init for each of them.
    """
    self.conf = conf
    self.data_processor = DataProcess()
    self.models = {}
    self.model_dir = model_dir
    connection = self.data_processor.connect_db(conf.db_host,
                                                conf.db_database,
                                                conf.db_user,
                                                conf.db_pass)
    classes = self.data_processor.get_big_class(connection)
    print(classes)
    for _, row in classes.iterrows():
        self.init(row['business_system_code'], row['rule_type_code'])