    def analyse_doc(self, testFile):
        # Derive the file name for the cleaned and word-segmented document, then run cleaning/segmentation.
        targetfilename1 = testFile.split('.')[0] + '_cut_split.txt'
        preprocess.preprocess_doc(testFile, targetfilename1)

        # Derive the file name for the stop-word-filtered document, then remove stop words.
        targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
        stopwords.stopWord_doc(targetfilename1, targetfilename2)

        # Collect the preprocessed sentences and run prediction.
        sentences = []
        with open(targetfilename2, "r") as f:
            for line in f:
                sentences.append(line)
        ans = self.model.predict(sentences)
        return ans
    def test_doc(self, testFile, category=None, process=True):
        '''
        :param testFile: document to evaluate
        :param category: category the document belongs to, chosen from cate_dict
        :param process: whether the input document needs preprocessing
        :return: test results
        '''

        if process:
            # Derive the file name for the cleaned and word-segmented document, then run cleaning/segmentation.
            targetfilename1 = testFile.split('.')[0] + '_cut_split.txt'
            preprocess.preprocess_doc(testFile, targetfilename1)

            # Derive the file name for the stop-word-filtered document, then remove stop words.
            targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
            stopwords.stopWord_doc(targetfilename1, targetfilename2)

            # Attach the category label to the preprocessed document.
            testFile = targetfilename1.split('.')[0] + '_labeled.txt'
            prepare_data.prepare_data(targetfilename2, testFile, category)

        # Evaluate the model on the test file.
        ans = self.model.test(testFile)
        return ans
    def train(self, trainFile, category, process=True):
        '''
        :param trainFile: document to train on
        :param category: category the document belongs to, chosen from cate_dict
        :param process: whether the input document needs preprocessing
        :return:
        '''
        if process:
            # Derive the file name for the cleaned and word-segmented document, then run cleaning/segmentation.
            targetfilename1 = trainFile.split('.')[0] + '_cut_split.txt'
            preprocess.preprocess_doc(trainFile, targetfilename1)

            # Derive the file name for the stop-word-filtered document, then remove stop words.
            targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
            stopwords.stopWord_doc(targetfilename1, targetfilename2)

            # Attach the category label to the preprocessed document.
            trainFile = targetfilename1.split('.')[0] + '_labeled.txt'
            prepare_data.prepare_data(targetfilename2, trainFile, category)

        # Train a fastText supervised classifier and keep it so predict/test can use it later.
        self.model = fasttext.supervised(trainFile,
                                         'model/classifier2.model',
                                         label_prefix='__label__')
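        # Note: fastText's supervised mode expects every line of the labeled training file
        # to begin with the label prefix, e.g. "__label__<category> <segmented text>";
        # prepare_data above is assumed to write the file in that format.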
    def categorize(self, raw_doc):
        # preprocess and transform
        doc = preprocess_doc(raw_doc[0], self.stop_words)
        doc_input = embedding_lookup([doc], self.vectorizer.w2v_embeddings,
                                     self.seq_length, self.embed_size)

        with tf.Session(graph=self.graph) as session:
            #session.run(tf.global_variables_initializer())
            #self.saver = tf.train.import_meta_graph(
            #    paths.checkpoint + "/" + self.model.name + "-" + str(self.num_epochs) + ".meta", clear_devices=True)
            self.saver.restore(
                session, paths.checkpoint + "/" + self.model.name + "-" +
                str(self.num_epochs))

            # run the network on the document input
            #predicted_labels = list()
            #for doc_input in doc_inputs:
            #    [predicted_label] = session.run([self.model.prediction], {self.model.inputs: doc_input})
            #    predicted_labels.append(predicted_label[0])
            [predicted_label] = session.run([self.model.prediction],
                                            {self.model.inputs: doc_input})

            # return the predicted class
            return classes[predicted_label[0]]
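
# A minimal sketch (not taken from the source) of what an embedding_lookup helper like the
# one used in categorize() typically does: pad or truncate each tokenised document to
# seq_length and map known words to their word2vec vectors. Names here are illustrative.
import numpy as np

def embedding_lookup_sketch(docs, w2v_embeddings, seq_length, embed_size):
    batch = np.zeros((len(docs), seq_length, embed_size), dtype=np.float32)
    for i, words in enumerate(docs):
        for j, word in enumerate(words[:seq_length]):
            if word in w2v_embeddings:
                # known words get their pretrained vector; unknown words and padding stay zero
                batch[i, j] = w2v_embeddings[word]
    return batch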
Example 5
    user_input = input("Make your query:\n")

    rm_sw = input(
        "Were stopwords removed during training? [Yes/No]: \n").lower() != 'no'

    stem = input(
        "Was stemming applied during training? [Yes/No]: \n").lower() != 'no'

    if rm_sw:
        filter_funcs.append(pp.remove_stopwords)
    if stem:
        filter_funcs.append(pp.stem_text)

    query = preprocess_doc(string_to_dict(user_input),
                           filter_funcs=filter_funcs)['words']

    print(query)

    alphas_user = input(
        "Try different starting learning rates (comma separated):\n")

    alphas = [float(a) for a in alphas_user.split(',')]

    steps_user = input(
        "Try different numbers of steps for inference (comma separated): \n")

    steps = [int(s) for s in steps_user.split(',')]

    top_k = int(input("How many recommendations? : \n"))
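
# A minimal sketch (assumptions, not from the source) of how the collected alphas, steps
# and top_k could drive similarity-based recommendations with a gensim Doc2Vec model
# (gensim < 4.0 API; the model path, function name and variable names are hypothetical).
from gensim.models.doc2vec import Doc2Vec

def recommend(model_path, query_words, alphas, steps, top_k):
    model = Doc2Vec.load(model_path)
    results = {}
    for alpha in alphas:
        for n_steps in steps:
            # infer a vector for the preprocessed query with this alpha/steps setting
            vec = model.infer_vector(query_words, alpha=alpha, steps=n_steps)
            # rank training documents by cosine similarity to the inferred query vector
            results[(alpha, n_steps)] = model.docvecs.most_similar([vec], topn=top_k)
    return results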