Example #1
def predict_text(model, enc_embedding):

    data_process = DataProcess(use_word2cut=False)

    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = tuple(dec_vec_model.wv.vocab.keys())

    prediction = model.predict_on_batch(enc_embedding)

    prediction_words_list = []
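    # Nearest-neighbour decoding: index 0 is the distance to the all-zero
    # (padding) vector, the remaining indexes map to decoder vocabulary words.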
    for elem in prediction:
        prediction_words = []
        for vec in elem:
            dec_dis_list = []
            mse = calculate_mse(vec,
                                np.zeros(data_process.dec_embedding_length))
            dec_dis_list.append(mse)
            for dec_word in dec_useful_words:
                mse = calculate_mse(vec, dec_vec_model.wv[dec_word])
                dec_dis_list.append(mse)
            index = np.argmin(dec_dis_list)
            if index == 0:
                word = data_process.__VOCAB__[0]
            else:
                word = dec_useful_words[index - 1]
            prediction_words.append(word)
        prediction_words_list.append(prediction_words)

    return prediction_words_list
Example #2
def soccer_data_obj():
    return DataProcess(
        SOCCER_URL,
        key_column_index=1,
        spread_columns_indexes=(6, 8),
        validation_func=lambda line: len(line) == 10
    )
Example #3
def predict_one_text(model, enc_embedding):

    data_process = DataProcess(use_word2cut=False)

    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())

    prediction = model.predict(enc_embedding, verbose=0)

    prediction_words = []
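    # For each predicted embedding, pick the closest decoder word by Euclidean
    # distance; index 0 corresponds to the all-zero (padding) vector.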
    for vec in prediction[0]:
        dec_dis_list = []
        dec_dis = np.sqrt(
            np.sum(
                np.square(np.zeros(data_process.dec_embedding_length) - vec)))
        dec_dis_list.append(dec_dis)
        for dec_word in dec_useful_words:
            dec_dis = np.sqrt(
                np.sum(np.square(dec_vec_model.wv[dec_word] - vec)))
            dec_dis_list.append(dec_dis)
        index = np.argmin(dec_dis_list)
        if index == 0:
            word = data_process.__VOCAB__[0]
        else:
            word = dec_useful_words[index - 1]
        prediction_words.append(word)

    return prediction_words
Example #4
def get_vocabs(filename, type="train"):
    data_process = DataProcess(filename, type)
    dialog_iter = data_process.dialog_iter

    vocab = set()
    index = 0
    while True:
        # data -> (example_id, speaker, context, utterances, target_id, candidates_id)
        data = next(dialog_iter, None)

        if data is None:
            break

        example_id, speaker, context, utterances, target_id, candidates_id = data

        tokenized_context, _ = data_process.tokenize(context)
        tokenized_utterances, _ = data_process.tokenize(utterances)

        for context_sentence in tokenized_context:
            for i, c_word in enumerate(context_sentence):
                context_sentence[i] = c_word.lower()
            vocab.update(context_sentence)

        for utterances_sentence in tokenized_utterances:
            for i, u_word in enumerate(utterances_sentence):
                utterances_sentence[i] = u_word.lower()
            vocab.update(utterances_sentence)

        index += 1
        if index % 100 == 0:
            print(index, len(vocab))

    return vocab
Example #5
def data_to_padding_ids(text_list):

    data_process = DataProcess(use_word2cut=True)
    enc_vocab = data_process.read_vocabulary(data_process.enc_vocab_file)

    enc_padding_ids_list = []

    for text in text_list:

        words = data_process.text_cut_object.cut([text.strip()])
        words_list = words[0].strip().split()

        enc_ids = [
            enc_vocab.get(word, data_process.__UNK__) for word in words_list
        ]

        if len(enc_ids) > data_process.enc_input_length:
            enc_ids = enc_ids[:data_process.enc_input_length]

        enc_length = len(enc_ids)

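        # Left-pad with zeros, then append the ids in reversed order.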
        enc_padding_ids = []
        enc_padding_ids.extend([0] *
                               (data_process.enc_input_length - enc_length))
        enc_padding_ids.extend(
            [int(enc_ids[enc_length - l - 1]) for l in range(enc_length)])

        enc_padding_ids_list.append(np.array(enc_padding_ids))

    return np.array(enc_padding_ids_list)
Example #6
def run1():

    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()

    # build model
    tfidf_nn = TFIDFNNClassifier(X)
    tfidf_nn.fit(X_train, Y_train, batch_size=128, epochs=10)
    tfidf_nn.save_model("../model/tfidf_nn_model_epoch10.h5")

    # convert character to numeric
    X_test = tfidf_nn.convert(X_test)

    # predict and evaluate
    prob = tfidf_nn.predict_prob(X_test)
    auc = tfidf_nn.auc(Y_test, prob)

    pred = tfidf_nn.predict(X_test)
    acc = tfidf_nn.accuracy(Y_test, pred)
    f1 = tfidf_nn.f1(Y_test, pred)
    cm = tfidf_nn.confusion_matrix(Y_test, pred)

    print "the accuracy is : " + str(acc)
    print "the auc is : " + str(auc)
    print "the f1 score is : " + str(f1)
    print "the confusion matrix is : \n"
    print cm
Example #7
def print_score(model, enc_embedding):
    data_process = DataProcess(use_word2cut=False)

    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    prediction = model.predict(enc_embedding, verbose=0)

    score_words = []

    for vec in prediction[0]:
        dec_sum = 0
        dec_dis_list = []
        dec_dis = np.sqrt(
            np.sum(
                np.square(np.zeros(data_process.dec_embedding_length) - vec)))
        dec_dis_list.append(dec_dis)
        dec_sum += dec_dis
        for dec_word in dec_useful_words:
            dec_dis = np.sqrt(
                np.sum(np.square(dec_vec_model.wv[dec_word] - vec)))
            dec_dis_list.append(dec_dis)
            dec_sum += dec_dis
        # convert to an array so the element-wise division by the total works
        score_words.append(np.array(dec_dis_list) / dec_sum)

    print(score_words)
Example #8
def run():
    batch_size = 63
    epochs = 5000
    
    data_process = DataProcess(use_word2cut=False)

    model = build_model()
  
    documents_length = data_process.get_documents_size(data_process.enc_ids_file, data_process.dec_ids_file)
    
    if batch_size > documents_length:
        print("ERROR--->" + u"语料数据量过少,请再添加一些")
        return None
    # adaptive learning rate
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, min_lr=1e-6, mode='min')
    '''monitor: the quantity to monitor, e.g. val_loss or val_acc
    patience: once early stopping is triggered (e.g. the loss has not decreased compared with the previous epoch), training stops after this many epochs
    verbose: verbosity mode
    mode: one of 'auto', 'min', 'max'; in 'min' mode training stops when the monitored value stops decreasing, in 'max' mode when it stops increasing.'''
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size)+5, \
                        validation_data=generate_batch(batch_size=batch_size), \
                        validation_steps=int(documents_length / batch_size)+5,\
                        epochs=epochs, verbose=1, workers=2, use_multiprocessing=True,
                        callbacks=[reduce_lr,early_stopping])

    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
Example #9
def run1():

    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()

    # build model
    lstm_classifier = LSTMClassifier(X)
    lstm_classifier.fit(X_train, Y_train, batch_size=128, epochs=1)
    lstm_classifier.save_model("../model/lstm_model_epoch1.h5")

    # convert character to numeric
    X_test = lstm_classifier.convert(X_test)

    # predict and evaluate
    prob = lstm_classifier.predict_prob(X_test)
    auc = lstm_classifier.auc(Y_test, prob)

    pred = lstm_classifier.predict(X_test)
    acc = lstm_classifier.accuracy(Y_test, pred)
    f1 = lstm_classifier.f1(Y_test, pred)
    cm = lstm_classifier.confusion_matrix(Y_test, pred)

    print "the accuracy is : " + str(acc)
    print "the auc is : " + str(auc)
    print "the f1 score is : " + str(f1)
    print "the confusion matrix is : \n"
    print cm
Example #10
def run():

    data_process = DataProcess(use_word2cut=False)

    input_length = data_process.enc_input_length
    output_length = data_process.dec_output_length
    enc_embedding_length = data_process.enc_embedding_length
    dec_embedding_length = data_process.dec_embedding_length

    model = AttentionSeq2Seq(output_dim=dec_embedding_length, hidden_dim=data_process.hidden_dim, output_length=output_length, \
                             input_shape=(input_length, enc_embedding_length), \
                             batch_size=1, \
                             depth=data_process.layer_shape)

    model.compile(loss='mse', optimizer='rmsprop')

    model.load_weights("model/seq2seq_model_weights.h5")

    plot_model(model,
               to_file='model/seq2seq_model_structure.png',
               show_shapes=True,
               show_layer_names=True)

    text = u"碧水照嫩柳,桃花映春色"  #u"这围巾要火!"#u"你愿意嫁给我吗?"

    enc_padding_ids = data_to_padding_ids(text)
    enc_embedding = data_to_embedding(enc_padding_ids)

    prediction_words = predict_one_text(model, enc_embedding)

    print(prediction_words)

    print_score(model, enc_embedding)
Example #11
def save_pickle_data(filename, pickle_filename):
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)

    index = 0
    with open(pickle_filename, 'wb') as f_handle:
        while True:
            # data -> (context, utterances, target_id)
            data = next(dialog_iter, None)

            if data is None:
                break

            context, utterances, target_id = data

            tokenized_context, _ = data_process.tokenize(context)
            tokenized_utterances, _ = data_process.tokenize(utterances)

            save_data = [tokenized_context, tokenized_utterances]
            pickle.dump(save_data, f_handle)

            index += 1

            if index % 100 == 0:
                print(index)

    print("%s data save complete!" % index)
Example #12
def process_partition(iterator):
    # Initialize DataProcess from the config file; DataProcess mainly sets up the hbase client
    conf = config.Config("process.conf")
    dp = DataProcess(conf)
    for line in iterator:
        dp.get_default("aaaaaaaa")
        result_lines = line.split(' ')
        for item in result_lines:
            yield item
Example #13
    def new_table(cls, data_table, columns):
        """
            Get a new table from another table.
        """

        data_process = DataProcess()

        data_table = data_process.new_table(data_table, columns)

        return data_table
Example #14
    def merge(cls, df1, df2, on, how="left"):
        """
            Merge two tables into one table.
        """

        data_process = DataProcess()

        data = data_process.merge(df1, df2, on=on, how=how)

        return data
Example #15
    def classify_data(cls, data, _type=""):
        """
            Classify the data.
        """

        data_process = DataProcess()

        data = data_process.classification_data(data=data, _type=_type)

        return data
Example #16
    def get_student_data(cls, clean_data):
        """
            Group the data by student.
        """

        data_process = DataProcess()

        student_list = data_process.get_student_data(clean_data)

        return student_list
Example #17
    def populateButtonClick(self):
        invalid = self.GetInvalidData()
        process = DataProcess(logf=self.logEntry.get(),
                              csvf=self.csvEntry.get(),
                              ontof=self.ontoEntry.get(),
                              periodf=self.periodEntry.get(),
                              invalid=invalid,
                              sender=self)
        process.Execute()
        return True
Example #18
def main(train_src, train_tgt, val_src, val_tgt, test_src, test_tgt):
    # ======================== prepare the data ============================ #
    data_obj = DataProcess(train_src, train_tgt, val_src, val_tgt, test_src,
                           test_tgt)  # training data object
    src_tgt, src_lang, tgt_lang = data_obj.get_src_tgt_data()
    *_, src_tgt_seq_train = data_obj.word_2_index('train', src_lang,
                                                  tgt_lang)  # training data
    *_, src_tgt_seq_val = data_obj.word_2_index('val', src_lang,
                                                tgt_lang)  # validation data
    *_, src_tgt_seq_test = data_obj.word_2_index('test', src_lang,
                                                 tgt_lang)  # test data

    # batch the data with DataLoaders
    train_data_loader = DataLoader(src_tgt_seq_train,
                                   param.batch_size,
                                   True,
                                   drop_last=False)
    val_data_loader = DataLoader(src_tgt_seq_val,
                                 param.batch_size,
                                 False,
                                 drop_last=False)
    test_data_loader = DataLoader(src_tgt_seq_test,
                                  param.infer_batch,
                                  True,
                                  drop_last=False)  # batched inference on the test set

    # ======================== define the model ============================ #
    transformer = Transformer(  # define the transformer model
        input_vocab_num=src_lang.n_words,
        target_vocab_num=tgt_lang.n_words,
        src_max_len=data_obj.src_max_len,
        tgt_max_len=data_obj.tgt_max_len).to(param.device)

    optimizer = SpecialOptimizer(  # define the optimizer; returns an optimizer object
        optimizer=torch.optim.Adam(filter(lambda x: x.requires_grad,
                                          transformer.parameters()),
                                   betas=(0.9, 0.98),
                                   eps=1e-09),
        warmup_steps=param.warmup_step,
        d_model=param.d_model)

    criterion = Criterion()  # define the loss function

    # Define the overall model, then run training (validation) and inference.
    seq2seq = Sequence2Sequence(transformer=transformer,
                                optimizer=optimizer,
                                criterion=criterion)

    # ======================== train (validate) the model ===================== #
    seq2seq.train_val(train_data_loader, val_data_loader)

    # ======================== model inference test =========================== #
    seq2seq.inference(test_data_loader, src_lang, tgt_lang,
                      data_obj.tgt_max_len)
Example #19
    def run_evaluate(self, sess, type, data_path, test_case=1):
        data_process = DataProcess(self.hparams,
                                   data_path,
                                   type,
                                   word2id=self.word2id,
                                   test_case=test_case)

        k_list = self.hparams.recall_k_list
        total_examples = 0
        total_correct = np.zeros([len(k_list)], dtype=np.int32)
        total_mrr = 0

        index = 0

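        # Iterate over evaluation batches, accumulating Recall@k counts and MRR.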
        while True:
            batch_data = data_process.get_batch_data(
                self.hparams.dev_batch_size, 100)

            if batch_data is None:
                break

            (context, _), (utterances,
                           _), _, _, _, example_id, candidates_id = batch_data

            pred_val, _ = sess.run([self.predictions, self.logits],
                                   feed_dict=self.make_feed_dict(
                                       batch_data, 1.0))

            pred_val = np.asarray(pred_val)
            num_correct, num_examples = evaluate_recall(
                pred_val, batch_data[2], k_list)
            total_mrr += mean_reciprocal_rank(pred_val, batch_data[2])

            total_examples += num_examples
            total_correct = np.add(total_correct, num_correct)

            if num_correct[5] != self.hparams.dev_batch_size:
                print(example_id, ":", index, num_correct[5])

            index += 1
            if index % 500 == 0:
                accumulated_accuracy = (total_correct / total_examples) * 100
                print("index : ", index, " | ", accumulated_accuracy)

        avg_mrr = total_mrr / (self.hparams.dev_batch_size * index)
        recall_result = ""

        for i in range(len(k_list)):
            recall_result += "Recall@%s : " % k_list[i] + "%.2f%% | " % (
                (total_correct[i] / total_examples) * 100)
        self._logger.info(recall_result)
        self._logger.info("MRR: %.4f" % avg_mrr)

        return k_list, (total_correct / total_examples) * 100, avg_mrr
Example #20
    def get_student_list(self):

        ld = Loader()
        data_processing = DataProcess()
        course_initialization = Course_Initialization()

        clean_data = ld.load_csv(
            course_initialization.config.get_clean_data_csv_path())
        student_list = data_processing.get_student_data(clean_data)

        return student_list
Example #21
def make_valid_data(filename, write_file_name):

    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)

    input_sum_turn = 0
    input_sum_sentence_len = 0
    with open(write_file_name, "w", encoding='utf-8') as f_handle:
        index = 0

        while True:
            # data -> (speakers, context, utterances, target_id)
            data = next(dialog_iter, None)

            if data is None:
                break
            index += 1  # count only dialogues actually processed
            speakers, context, utterances, target_id = data
            context_sentence = context[0].split(" __eot__ ")

            f_handle.write("[%d]" % index + "\n")
            sum_sentence_len = 0
            tot_turn = 0

            for i, sentence in enumerate(context_sentence):
                sentence_len = len(nltk.word_tokenize(sentence))
                if len(sentence) == 0:
                    continue
                sentence_string = speakers[i] + " : " + sentence
                sentence_string = str(sentence_len) + "|" + sentence_string
                f_handle.write(sentence_string + "\n")

                sum_sentence_len += sentence_len
                tot_turn += 1

            avg_sentence_len = sum_sentence_len / tot_turn
            sentence_answer = "Answer : " + utterances[target_id[0]] + "\n"
            f_handle.write(sentence_answer)
            f_handle.write("average sentence length : %.3f" %
                           avg_sentence_len + "\n")
            f_handle.write("total turn number : %d" % tot_turn + '\n')

            f_handle.write("-" * 200 + "\n")
            if index % 500 == 0:
                print(index, ":", "avg_sentence_len - %.3f" % avg_sentence_len,
                      "tot_turn - %d" % tot_turn)
            input_sum_turn += tot_turn
            input_sum_sentence_len += avg_sentence_len

        f_handle.write("average sentence length %.3f" %
                       (input_sum_sentence_len / index))
        f_handle.write("average turn length %.3f" % (input_sum_turn / index))
Example #22
    def rename_columns(cls, data, re_columns, inplase=True):
        """
            Rename the given columns.
        """

        data_process = DataProcess()

        data = data_process.rename_columns(data=data,
                                           re_columns=re_columns,
                                           inplase=inplase)

        return data
Example #23
    def remove_columns(cls, data_table, columns, axis=1):
        """
            Remove the given columns from a table.
        """

        data_process = DataProcess()

        data_table = data_process.remove_columns(data_table=data_table,
                                                 columns=columns,
                                                 axis=axis)

        return data_table
Example #24
def generate_real_embedding(text_list):
    data_process = DataProcess(use_word2cut=True)
    dec_vocab = data_process.read_vocabulary(data_process.dec_vocab_file)

    dec_padding_ids_list = []

    for text in text_list:

        words = data_process.text_cut_object.cut([text.strip()])
        words_list = words[0].strip().split()

        dec_ids = [
            dec_vocab.get(word, data_process.__UNK__) for word in words_list
        ]

        if len(dec_ids) + 2 > data_process.dec_output_length:
            dec_ids = dec_ids[:data_process.dec_output_length - 2]

        dec_length = len(dec_ids)

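        # Build the target sequence as __GO__ + ids + __EOS__, then right-pad
        # with zeros up to dec_output_length.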
        dec_padding_ids = []
        dec_padding_ids.extend([data_process.__GO__])
        dec_padding_ids.extend([int(dec_ids[l]) for l in range(dec_length)])
        dec_padding_ids.extend([data_process.__EOS__])
        dec_padding_ids.extend(
            [0] * (data_process.dec_output_length - dec_length - 2))

        dec_padding_ids_list.append(np.array(dec_padding_ids))

    padding_ids = np.array(dec_padding_ids_list)

    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    dec_reverse_vec = data_process.read_reverse_vocabulary(
        data_process.dec_vocab_file)

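    # Map each padded id back to its word and look up the embedding: the padding
    # token (__VOCAB__[0]) maps to the zero vector, out-of-vocabulary words to an
    # all-ones vector.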
    all_dec_embedding = []
    for one_padding_ids in padding_ids:

        dec_embedding = []
        for data in one_padding_ids:
            word = dec_reverse_vec[data]
            if word in dec_useful_words:
                word_embedding = dec_vec_model.wv[word]
            elif word == data_process.__VOCAB__[0]:
                word_embedding = np.zeros(data_process.dec_embedding_length)
            else:
                word_embedding = np.array([1.0] *
                                          data_process.dec_embedding_length)
            dec_embedding.append(word_embedding)
        all_dec_embedding.append(dec_embedding)

    return np.array(all_dec_embedding)
Example #25
def monitor(mail_user, mail_pass, mail_to, mail_host):
    before = time.time()
    print("price monitor start.")

    dataprocess = DataProcess('test.db', 'goods')
    send_message = SendMessage()

    ret = dataprocess.sync_with_csv('in.csv')
    if ret is False:
        print("sync with csv file error.")
        return False

    # dataprocess.add_from_csv('in.csv')

    mail_content = ""
    html_parse = HtmlParse()

    ret, goods_data = dataprocess.get_goods()
    if ret is False:
        html_parse.driver.quit()
        return
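    # Check the current price of every tracked item and record any price drop.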
    for x in goods_data:
        url = x['url']
        price = x['price']
        ret, data = html_parse.get_goods_data(url)
        if ret is False:
            continue
        price_now, goods_name = data
        print("price_now:" + str(price_now))
        # check for a missing/invalid stored price first to avoid comparing with None
        if price is None or price <= 0 or price_now < price:
            ret = dataprocess.update_good(
                url, price_now, goods_name,
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            if ret is False:
                continue
            mail_content = mail_content + goods_name\
                + "\nprice dropped to: " + str(price_now) + "\n"
    if len(mail_content) != 0:
        mail_sub = "price drop notification"
        mailto_list = [mail_to]
        mail_port = 465

        send_message.mail_init(mail_host, mail_user, mail_pass, mail_port)
        send_message.send_mail(mailto_list, mail_sub, mail_content)
    else:
        print("There were no good which price has reduced.")

    html_parse.quit()

    # dataprocess.export_csv("out.csv")
    after = time.time()
    print('Thread ended, elapsed time : %.2f' % (after - before))
Example #26
def calculate_mse(src_vec, des_vec):
    data_process = DataProcess(use_word2cut=False)

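    # Z-normalise the target vector; fall back to the zero vector when its
    # standard deviation is (numerically) zero.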
    std_number = np.std(des_vec)
    if (std_number - data_process.epsilon) < 0:
        norm_des_vec = np.zeros(data_process.dec_embedding_length)
    else:
        norm_des_vec = (des_vec - np.mean(des_vec)) / std_number

    err = np.square(src_vec - norm_des_vec)
    mse = np.sum(err)

    return mse
Example #27
    def filter_data(cls, origin_log_data, field, field_value=''):
        """
            Filter the data by a column value.
        """

        data_process = DataProcess()

        filter_data_done = data_process.filter_data(
            origin_log_data=origin_log_data,
            field=field,
            field_value=field_value)

        return filter_data_done
Example #28
    def __init__(self, mode=0):
        self.mode = mode
        data_process = DataProcess()
        self.text_cut_object = data_process.text_cut_object

        if self.mode == 0:
            self.embedding_length = data_process.enc_embedding_length
            self.file = data_process.enc_file
        else:
            self.embedding_length = data_process.dec_embedding_length
            self.file = data_process.dec_file
            self.__GO__ = data_process.__VOCAB__[1]
            self.__EOS__ = data_process.__VOCAB__[2]
Example #29
    def test_get_min_spread_from_file_error_on_parse(
            self, mock_get_data, mock_log, obj_args):
        data_obj = DataProcess(*obj_args)
        mock_get_data.return_value = (
            '       Team            P     W    L   D    F      A     Pts\n'
            '    1. Arsenal         38    26   9   3    79  -  36    87\n'
        )
        data_obj.get_min_spread_from_file()
        mock_get_data.assert_called_once()
        mock_log.assert_called_once_with(
            'Error on parsing the data - please check '
            'properties of DataProcess creation.', exc_info=True
        )
Example #30
    def __init__(self, conf, model_dir=".cache/model"):
        self.conf = conf
        self.data_processor = DataProcess()
        self.models = {}
        self.model_dir = model_dir

        con = self.data_processor.connect_db(conf.db_host, conf.db_database,
                                             conf.db_user, conf.db_pass)
        classes = self.data_processor.get_big_class(con)
        print(classes)
        for index, cls in classes.iterrows():
            system = cls['business_system_code']
            subclass = cls['rule_type_code']
            self.init(system, subclass)