Example #1
def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs,
                      featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(
                    os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
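For reference, a minimal usage sketch of the loader above, assuming dirFolder is a folder-per-class corpus of plain-text documents; the path and split ratio here are illustrative, not from the original code:

# Hypothetical layout: ./corpus/<class_name>/<document files>
(X_train, Y_train), (X_test, Y_test) = load_data("./corpus", testRatio=0.2)
print(X_train.shape)  # (num_train_docs, vocabulary_size)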
Example #2
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    # TODO: wait for models
    self.cluster_model = joblib.load(self.cluster_md)
    self.vec_model = Doc2Vec.load(self.vec_md)
    jieba.initialize()
Example #3
def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    def get_vector(word):
        # Return the 300-d embedding, or a zero vector for
        # out-of-vocabulary words (bare except narrowed to KeyError).
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(
                os.path.join(dirFolder, dclass, doc))
            words = set(processedFile)
            # Embed each vocabulary word present in the document; absent
            # words map to zero vectors. (The original looped over the
            # vocabulary alone, so every document got identical features.)
            features = [get_vector(w) if w in words else np.zeros((300,))
                        for w in vocabulary]
            X.append(features)
    return np.stack(X)
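Note that the fix above still yields a (len(vocabulary), 300) block per document. A common alternative is a single averaged vector per document; the sketch below assumes model is the gensim KeyedVectors instance loaded above, and mean_doc_vector is a hypothetical helper, not part of the original code:

import numpy as np

def mean_doc_vector(words, model, dim=300):
    # Average the embeddings of in-vocabulary words; an all-OOV
    # document falls back to the zero vector.
    vecs = [model[w] for w in words if w in model]
    return np.mean(vecs, axis=0) if vecs else np.zeros((dim,))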
Example #4
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    self.cluster_model = joblib.load(self.cluster_md)
    # self.vec_model = Doc2Vec.load(self.vec_md)
    # self.vec_model = BertClient()
    self.load_stop_words(self.config)
    jieba.initialize()
Example #5
def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(
                os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
Example #6
import datetime
from inverted_index import InvertedIndex
from tf_idf import TfIdf
import pandas as pd

custom_separator = "###"

if __name__ == '__main__':
    df = pd.read_csv('../data/stories.csv', sep=',', header=None)
    df_head = pd.read_csv('../data/db_books.csv', sep=',')
    
    array_content = df.values
    # array_segment = array_content[0:3]
    
    # INVERTED INDEX
    print("Init INVERTED INDEX")
    begin_time = datetime.datetime.now()
    invindex = InvertedIndex(array_content)
    invindex.process()
    print(datetime.datetime.now() - begin_time)   
    invindex.saveIndexInDisc()
    
    # TF IDF
    print("Init TF IDF")
    begin_time = datetime.datetime.now()
    tf_idf = TfIdf(array_content)
    tf_idf.process()
    print(datetime.datetime.now() - begin_time)   
    tf_idf.saveIndexInDisc()
Example #7
 
parser = PreProcess()
parsed_trainning_documents = {}
print('processing...')
for k, v in reader.train.items():
  parsed_trainning_documents[k] = parser.process(v)

# Input for tf-idf: we must annotate the documents with their classes.
# It takes as input an array of tuples: ([tokens], class)
parsed_trainning_documents_with_classes = []
for k in parsed_trainning_documents.keys():
  parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

# Run tf-idf
print('generating tf.idf...')
tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
tf_idf_calculator.run()

# Test the knn parameters: distance metric and value of K
for metric in ['cosine', 'euclid']:
  for k in range(5, 11, 2):
    knn = KNN(tf_idf_calculator.results, k, metric)

    # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
    topics = ['baseball', 'christian', 'guns']
    confusion_matrix = {topic: {t: 0 for t in topics} for topic in topics}

    print_log = False
    i = 0
    ytrue = []
    ypred = []
Example #8
class Agent:
    good_qualified_corpus = set()

    def __init__(self):
        self.config = config
        self.stop_words = ''
        self.punctuation_str = ''.join(self.config.punctuation_ls)
        self.frequency_domain_dict = frequency_domain.frequency_dict
        self.cluster_md = self.config.cluster_model
        # self.vec_md = self.config.doc_vector_model
        self.init_all_states()
        self.fuzzy_weight = 0.2
        self.tf_idf_weight = 0.8
        self.good_corpus_threshold = 200
        self.good_corpus_score = 0.95

    def init_all_states(self):
        self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
        self.tf_idf = TfIdf(self.config)
        self.cluster_model = joblib.load(self.cluster_md)
        # self.vec_model = Doc2Vec.load(self.vec_md)
        # self.vec_model = BertClient()
        self.load_stop_words(self.config)
        jieba.initialize()

    def get_utterance_type(self, utterance):
        # tmp_vector = self.vec_model.infer_vector(utterance)

        # tmp_vector = self.vec_model.encode([utterance])
        # label = self.cluster_model.predict(tmp_vector)
        # print(label)
        # return self.config.cluster_file[label[0]]
        return self.config.cluster_file[0]

    def record_good_conversations(self, utterance, score_ls, context_ls):
        def write_conversations():
            localtime = (time.asctime(time.localtime(time.time()))).replace(
                ' ', '_').replace(':', '-')
            with open(self.config.path_of_good_conversation + localtime,
                      'wb') as wfp:
                pickle.dump(Agent.good_qualified_corpus, wfp)
            Agent.good_qualified_corpus.clear()
            # print(Agent.good_qualified_corpus)

        for index in range(len(score_ls)):
            if score_ls[index] > self.good_corpus_score:
                if context_ls[index][0] and context_ls[index][1]:
                    # print((utterance, context_ls[index][1]))
                    Agent.good_qualified_corpus.add(
                        (utterance, context_ls[index][1]))
        # print(len(Agent.good_qualified_corpus))
        if len(Agent.good_qualified_corpus) > self.good_corpus_threshold:
            record_thread = threading.Thread(target=write_conversations)
            record_thread.start()

    def random_chose_index(self, score_ls, max_score):
        max_score_indexes = []
        for i in range(len(score_ls)):
            if score_ls[i] == max_score:
                max_score_indexes.append(i)
        return choice(max_score_indexes)

    def load_stop_words(self, config):
        with open(config.stop_words, 'rb') as fpr:
            self.stop_words = pickle.load(fpr)

    def remove_special_words(self, stop_words_ls, input_sentence):
        sentence = input_sentence
        for special_word in self.config.special_modal_words:
            if special_word in sentence:
                sentence = sentence.replace(special_word, '')
        return sentence

    def response_answer(self, reply_msg, max_score):
        # Unwrap scores that arrive as 1x1 numpy arrays.
        if isinstance(max_score, np.ndarray):
            final_max_score = max_score[0][0]
        else:
            final_max_score = max_score
        return reply_msg, final_max_score

    def get_answer(self, utterance, file_name=None):
        try:
            utterance = utterance.rstrip(self.punctuation_str)
            file_name = self.get_utterance_type(utterance)

            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(
                utterance, self.stop_words)
            if not context_ls:
                return "", 0
            utterance_no_stop = self.remove_special_words(
                self.stop_words, utterance)
            new_context_ls = []
            for each_context in context_ls:
                ques = self.remove_special_words(self.stop_words,
                                                 each_context[0])
                ans = self.remove_special_words(self.stop_words,
                                                each_context[1])
                if not ques or not ans:
                    new_context_ls.append((0, 0))
                    continue
                new_context_ls.append((ques, ans))
            # print("control!!!!!!!!!!!!!!!!!: {},{}".format(utterance, new_context_ls))
            # print(len(new_context_ls))
            fuzzy_ratio_ls = fuzzy_matching(utterance_no_stop, new_context_ls)

            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance_no_stop, new_context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()

            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(
                    fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))

            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tfidf_best_index = self.random_chose_index(
                    tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tfidf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))

            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(
                self.punctuation_str)
            tfidf_best_content = context_ls[tfidf_best_index][0].rstrip(
                self.punctuation_str)
            if fuzzy_best_content == utterance or utterance.strip(''.join(
                    config.special_modal_words)) in fuzzy_best_content:
                best_index = fuzzy_best_index
                # return context_ls[best_index][1], max(fuzzy_ratio_ls)
                return self.response_answer(context_ls[best_index][1],
                                            max(fuzzy_ratio_ls))

            if tfidf_best_content == utterance or utterance.strip(''.join(
                    config.special_modal_words)) in tfidf_best_content:
                best_index = tfidf_best_index
                # return context_ls[best_index][1], max(tf_idf_score_ls)
                return self.response_answer(context_ls[best_index][1],
                                            max(tf_idf_score_ls))

            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight +
                               tf_idf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_idf_score in zip(
                                  fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight
            self.record_good_conversations(utterance, final_score_ls,
                                           context_ls)
            max_score = max(final_score_ls)
            if final_score_ls.count(max_score) > 1:
                best_index = self.random_chose_index(final_score_ls, max_score)
            else:
                best_index = final_score_ls.index(max_score)
            # print("final result:{}".format(context_ls[best_index]))
            # print(type(max_score))
            return self.response_answer(context_ls[best_index][1], max_score)
        except Exception:
            # Fail closed: any error yields an empty answer with score 0.
            return "", 0

    def test(self, utterance):
        answer = self.get_answer(utterance)
        return answer

    def start_cmd(self):
        while True:
            utterance = input(">>>")
            if utterance.strip() == "exit1":
                break
            answer, score = self.get_answer(utterance)
            print("<<<{}:{}".format(answer, score))

    def api(self, utterance):
        answer, score = self.get_answer(utterance)
        return [answer, score]

    def socket_get(self, utterance):
        answer, score = self.get_answer(utterance)
        # print(answer + '---' + str(score[0][0]))
        return answer + '---' + str(score)
Example #9
import pandas as pd
from preprocessing import InputFrame
from tf_idf import TfIdf
from NMF import LatentFeatures
import cPickle as pk

filename = '../../StateNames.csv'
dataframe = pd.read_csv(filename)

# Run the male preprocessing and model
male_instance = InputFrame(dataframe, 'M')
male_instance.clean_data_from_frame()
male_frame = male_instance.gender_frame

male_tfidf = TfIdf(male_frame)
male_vectors = male_tfidf.tfidf_matrix()

male_nmf = LatentFeatures(male_vectors)
male_matrices = male_nmf.fit_model()

# male_nmf.print_W_H_features(male_tfidf.index, male_tfidf.features, 20)
male_features_dict = male_nmf.latent_features_dict(male_tfidf.index,
                                                   male_tfidf.features, 20)

# Store the male model and latent-features dictionary
# (pickle files must be opened in binary mode)
with open('../data/male_nmf.pkl', 'wb') as f:
    pk.dump(male_nmf, f)

with open('../data/male_latent_features.pkl', 'wb') as f:
    pk.dump(male_features_dict, f)
Example #10
def calc_doc_len(doc: Document, tf_idf: TfIdf) -> float:
    # Euclidean norm of the document's tf-idf vector:
    # sqrt of the sum of squared weights over the document's words.
    result = 0
    for word in doc.words:
        result += math.pow(tf_idf.get(word, {}).get(doc, 0), 2)
    return math.sqrt(result)
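As a sanity check, here is a minimal sketch of the same norm, assuming tf_idf behaves like a nested mapping word -> {doc: weight}; the Doc stub is hypothetical, standing in for Document:

import math
from dataclasses import dataclass

@dataclass(frozen=True)
class Doc:
    words: tuple  # hypothetical stand-in for Document

def doc_len(doc, tf_idf):
    # Same Euclidean norm as calc_doc_len above.
    return math.sqrt(sum(tf_idf.get(w, {}).get(doc, 0) ** 2 for w in doc.words))

d = Doc(words=("cat", "dog"))
weights = {"cat": {d: 3.0}, "dog": {d: 4.0}}
print(doc_len(d, weights))  # 5.0 (the 3-4-5 triangle)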
Example #11
class Agent:
    def __init__(self):
        self.config = config
        self.punctuation_str = ''.join(self.config.punctuation_ls)
        self.frequency_domain_dict = frequency_domain.frequency_dict
        self.cluster_md = self.config.cluster_model
        self.vec_md = self.config.doc_vector_model
        self.init_all_states()
        self.fuzzy_weight = 0.7
        self.tf_idf_weight = 0.3
        # self.record_chat_ls = []

    def init_all_states(self):
        self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
        self.tf_idf = TfIdf(self.config)
        # TODO: wait for models
        self.cluster_model = joblib.load(self.cluster_md)
        self.vec_model = Doc2Vec.load(self.vec_md)
        jieba.initialize()

    def select_domain(self, utterance):
        utterance_words = list(jieba.cut(utterance, cut_all=False))
        # print(utterance_words)
        for each_word in utterance_words:
            if each_word in self.frequency_domain_dict and len(each_word) > 1:
                # print(each_word)
                return "domains"
        return "xiaohuangji"

    def get_utterance_type(self, utterance):
        # TODO: wait for models
        tmp_vector = self.vec_model.infer_vector(utterance)
        label = self.cluster_model.predict(tmp_vector.reshape(1, -1))
        # print(label)
        return self.config.cluster_file[label[0]]

    def record_good_chat(self):
        # TODO: run a background thread that periodically records
        # conversations scoring above 0.95; that way we can collect a
        # large set of good conversations.
        pass

    def random_chose_index(self, score_ls, max_score):
        max_score_indexes = []
        for i in range(len(score_ls)):
            if score_ls[i] == max_score:
                max_score_indexes.append(i)
        return choice(max_score_indexes)

    def get_answer(self, utterance, file_name=None):
        try:
            # file_name = self.get_utterance_type(utterance)
            utterance = utterance.rstrip(self.punctuation_str)
            if not file_name:
                file_name = self.select_domain(utterance)
            # print(file_name)

            # file_name = self.get_utterance_type(utterance)
            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(utterance)
            # print(context_ls)
            if not context_ls and file_name != "domains":
                return "对不起亲,没听懂你说啥,你再重新组织一下语言吧。"
            if not context_ls and file_name == "domains":
                answer = self.get_answer(utterance, "weibo")
                return answer

            if file_name == "domains":
                fuzzy_ratio_ls = fuzzy_for_domains(utterance, context_ls)
            else:
                fuzzy_ratio_ls = fuzzy_matching(utterance, context_ls)
            # print(fuzzy_ratio_ls)

            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance, context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()

            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))

            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tfidf_best_index = self.random_chose_index(tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tfidf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))

            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(self.punctuation_str)
            tfidf_best_content = context_ls[tfidf_best_index][0].rstrip(self.punctuation_str)
            if fuzzy_best_content == utterance or utterance in fuzzy_best_content:
                best_index = fuzzy_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]

            if tfidf_best_content == utterance or utterance in tfidf_best_content:
                best_index = tfidf_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]

            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight + tf_idf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_idf_score in zip(fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight
            # print(final_score_ls)
            if max(final_score_ls) < 0.85 and file_name not in ("weibo", "domains"):  # TODO: ugly code
                # print(max(final_score_ls))
                answer = self.get_answer(utterance, "weibo")
                return answer
            else:
                # print(max(final_score_ls))
                max_score = max(final_score_ls)
                if final_score_ls.count(max_score) > 1:
                    best_index = self.random_chose_index(final_score_ls, max_score)
                else:
                    best_index = final_score_ls.index(max_score)
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]
        except Exception:
            return "对不起亲,这个问题实在不晓得呀!"  # "Sorry dear, I really don't know the answer to that!"

    def get_answer2(self, utterance, file_name=None):
        try:
            utterance = utterance.rstrip(self.punctuation_str)
            file_name = self.get_utterance_type(utterance)
            # print(file_name)

            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(utterance)
            # print(context_ls)
            if not context_ls:
                return "", 0

            fuzzy_ratio_ls = fuzzy_matching(utterance, context_ls)

            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance, context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()

            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))

            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tfidf_best_index = self.random_chose_index(tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tfidf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))

            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(self.punctuation_str)
            tfidf_best_content = context_ls[tfidf_best_index][0].rstrip(self.punctuation_str)
            if fuzzy_best_content == utterance or utterance in fuzzy_best_content:
                best_index = fuzzy_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1], max(fuzzy_ratio_ls)

            if tfidf_best_content == utterance or utterance in tfidf_best_content:
                best_index = tfidf_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1], max(tf_idf_score_ls)

            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight + tf_idf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_idf_score in zip(fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight

            max_score = max(final_score_ls)
            if final_score_ls.count(max_score) > 1:
                best_index = self.random_chose_index(final_score_ls, max_score)
            else:
                best_index = final_score_ls.index(max_score)
            # print(context_ls[best_index][0])
            return context_ls[best_index][1], max_score
        except Exception:
            return "", 0

    def test(self, utterance):
        answer = self.get_answer2(utterance)
        return answer

    def start(self):
        while True:
            utterance = input(">>>")
            if utterance.strip() == "exit1":
                break
            answer, score = self.get_answer2(utterance)
            print("<<<{}:{}".format(answer, score))

    def api(self, utterance):
        answer, score = self.get_answer2(utterance)
        return [answer, score]