import sys
import unittest

# Project-level imports (names follow the repository layout):
# APP_ROOT, InputFileCython, ClassSummaryCython, ClassSummaryCosineSimilarityCython


class Test_ClassSummary(unittest.TestCase):
    """Test the class summary."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: set the split_module instance
        """
        wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)

    def test_summary_class(self):
        """
        Test making the summary dict.
        """
        self.input_module.input_fast_large_file()
        wiki_vector = self.input_module.get_vector()
        wn_summary_list = APP_ROOT + '/../../Data/wn_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        file_list = self.input_module.get_file_data()
        count = 0
        class_word_vector = {}
        class_average_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../../Data/wn_summary/" + file.strip())
            self.input_module.input_special_format_file()
            if count == 0:
                # First file: start with empty class dictionaries
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(), wiki_vector)
            else:
                # Later files: carry over the dictionaries built so far
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(), wiki_vector,
                    class_word_vector, class_average_vector)
            count += 1  # the original never incremented count, so the first branch always ran
            class_word_vector, class_average_vector = class_summary.summary_class()
            # Dump the class average vector to a test file
            fo = open(APP_ROOT + "/../../Data/test/" + file.strip() + "_vector.txt", 'w')
            sys.stdout = fo
            print(class_average_vector[file.strip()])
            fo.close()
            sys.stdout = sys.__stdout__
        class_summary_cosine_similarity_cython = ClassSummaryCosineSimilarityCython(
            class_average_vector)
        class_summary_cosine_similarity_cython.summary_class_use_cosine_similarity()
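The test above swaps `sys.stdout` to capture `print` output into a file, which is fragile if an exception fires before the restore. A minimal sketch of the same dump using `print`'s `file=` argument instead of global state; the path and dict literal here are illustrative, not the repository's data:

# Sketch: write a class's average vector to a file without
# redirecting sys.stdout. The vector value is a stand-in.
class_average_vector = {"13996061-n.txt": [0.12, -0.03, 0.57]}

file_name = "13996061-n.txt"
with open("/tmp/" + file_name + "_vector.txt", "w") as fo:
    print(class_average_vector[file_name], file=fo)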
class Test_ClassSummary(unittest.TestCase):
    """Test the class summary (single-file, non-Cython variant)."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: set the split_module instance
        """
        wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)

    def test_summary_class(self):
        """
        Test making the summary dict.
        """
        self.input_module.input_fast_large_file()
        wiki_vector = self.input_module.get_vector()
        read_file = '13996061-n.txt'
        class_summary = ClassSummary(read_file, wiki_vector)
        class_summary.summary_class()
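`ClassSummary.summary_class` itself is not shown in this section. Judging from the `class_average_vector` dict and the later `get_average_vector` call, it plausibly averages the Wikipedia word vectors of the words in a WordNet class file. A minimal sketch of that averaging step under this assumption; the function name and toy vectors are illustrative, not the repository's API:

# Hypothetical sketch: a class average vector as the element-wise mean
# of the word2vec vectors of all in-vocabulary words.
def average_vector(words, wiki_vector):
    vectors = [wiki_vector[w] for w in words if w in wiki_vector]
    if not vectors:
        return []
    dim = len(vectors[0])
    return [sum(v[i] for v in vectors) / len(vectors) for i in range(dim)]

wiki_vector = {"犬": [1.0, 0.0], "猫": [0.0, 1.0]}  # toy vectors
print(average_vector(["犬", "猫", "unknown"], wiki_vector))  # [0.5, 0.5]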
import operator
import os
import sqlite3
from collections import namedtuple

import MeCab
import yaml

# Project-level imports (names follow the repository layout):
# APP_ROOT, InputFileCython, ClassSummary, ClassCosineSimilarity


class SqliteTwitterSummary(object):
    """
    Save Twitter data to SQLite
    """

    def __init__(self, class_word_vector):
        """
        Initial setting: read the MeCab dictionary path from the yaml config
        """
        Twitter = namedtuple("Twitter", ["mecab"])
        config_file = "enviroment_twitter.yml"
        with open(config_file, encoding="utf-8") as cf:
            e = yaml.safe_load(cf)  # safe_load avoids executing arbitrary YAML tags
            twitter = Twitter(e["twitter"]["mecab"])
        self.tagger = MeCab.Tagger("-Owakati -d %s" % twitter.mecab)
        conn = sqlite3.connect('./twitter_data.db')
        self.cur = conn.cursor()
        self.class_word_vector = class_word_vector
        self.class_average_vector = {}
        self.class_word_dict = self.make_class_word_dict()
        # self.__initial_setting_vector()

    def __initial_setting_vector(self):
        # Wiki vector dict
        wiki_vector_file_name = APP_ROOT + '/../Data/jawiki_vector/jawiki_vector_delete_first.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)
        self.input_module.input_fast_large_file()
        self.wiki_vector = self.input_module.get_vector()
        # Make the average vector dict
        wiki_average_vector_file_name_list = APP_ROOT + '/../Data/wn_summary_multi_vector_list.txt'
        self.input_module = InputFileCython(wiki_average_vector_file_name_list)
        self.input_module.input_special_format_file()
        summary_vector_file_list = self.input_module.get_file_data()
        for file in summary_vector_file_list:
            read_file = APP_ROOT + "/../Data/wn_summary_multi_vector/" + file
            self.input_module = InputFileCython(read_file)
            self.input_module.input_file_str_list()
            # The original reused the name summary_vector_file_list here,
            # clobbering the list being iterated; a separate name is safer.
            summary_vector = self.input_module.get_file_data()
            class_name = file.replace("_summary.txt_vector.txt", "")
            if class_name not in self.class_average_vector:
                self.class_average_vector.update({class_name: summary_vector})
        self.class_summary = ClassSummary("", self.wiki_vector, "")
        self.cosine_similarity = ClassCosineSimilarity("", "")

    def make_class_word_dict(self):
        """
        Remake the data format: class name -> {word: 1} lookup dict
        """
        word_class_dict = {}
        for class_name, word_list in self.class_word_vector.items():
            word_dict = {}
            for word in word_list:
                if word not in word_dict:
                    word_dict.update({word: 1})
            if class_name not in word_class_dict:
                word_class_dict.update({class_name: word_dict})
        return word_class_dict

    def call_sql(self):
        """
        Query SQLite and append each tweet pair to its class file
        """
        self.cur.execute("""SELECT source_txt, replay_txt FROM ms_rinna;""")
        file_list = os.listdir("./data_latest/")
        # for file in file_list:
        #     os.remove("./data/" + file)
        for source_txt, replay_txt in self.cur.fetchall():
            class_name = self.judge_class(source_txt, replay_txt)
            # class_name = self.judge_class_wiki_vector(source_txt, replay_txt)
            print(class_name)
            print(source_txt)
            print(replay_txt)
            source_file = open(
                "./data_latest/" + class_name + '_source_twitter_data.txt', 'a')
            replay_file = open(
                "./data_latest/" + class_name + '_replay_twitter_data.txt', 'a')
            # Note: as in the original, the source text is written to the
            # replay file and the reply text to the source file.
            replay_file.write(self.tagger.parse(source_txt).replace("\n", "") + '\n')
            source_file.write(self.tagger.parse(replay_txt).replace('\n', '') + '\n')
            source_file.close()
            replay_file.close()

    def judge_class(self, source_txt, replay_txt=""):
        """
        Judge the word class by keyword overlap
        :param source_txt: twitter source text
        :param replay_txt: twitter reply text
        :return: best matching class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        for class_name in self.class_word_vector.keys():
            word_match_count = self.__match_word_count(total_text, class_name)
            if class_name not in class_match_rate:
                class_match_rate.update({
                    class_name:
                    1.0 * word_match_count / len(self.class_word_dict[class_name])
                })
        if max(class_match_rate.values()) == 0.0:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def judge_class_wiki_vector(self, source_txt, replay_txt=""):
        """
        Judge the word class by wiki vector similarity
        :param source_txt: twitter source text
        :param replay_txt: twitter reply text
        :return: best matching class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        self.class_summary.summary_vector_word_list(total_text)
        summary_vector = self.class_summary.get_average_vector()
        for class_name, average_vector in self.class_average_vector.items():
            class_match_rate.update({
                class_name:
                self.cosine_similarity.cosine_similarity(summary_vector, average_vector)
            })
        print(class_match_rate)
        if max(class_match_rate.values()) <= 0.1:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def __mecab_method(self, text):
        """
        Run the MeCab split and keep only nouns
        :param text: input text
        :return: list of noun base forms
        """
        res = self.tagger.parseToNode("".join(text))
        split_noun = []
        while res:
            feature = res.feature.split(",")
            if feature[0] == u"名詞":
                split_noun.append(feature[6])
            res = res.next
        return split_noun

    def __match_word_count(self, total_text, class_name):
        """
        Count the words matching the word class
        :param total_text: source text and reply text
        :param class_name: chosen class name
        :return: matching count
        """
        word_match_count = 0
        for word in total_text:
            if word in self.class_word_dict[class_name]:
                word_match_count += 1
        return word_match_count
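`ClassCosineSimilarity.cosine_similarity` is called above but not shown in this section. Assuming it implements the standard definition (dot product divided by the product of the norms), a minimal self-contained sketch:

import math

# Sketch of the standard cosine-similarity formula; illustrative only,
# not the repository's ClassCosineSimilarity implementation.
def cosine_similarity(vec_a, vec_b):
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0  # treat empty/zero vectors as "no similarity"
    return dot / (norm_a * norm_b)

print(cosine_similarity([1.0, 0.0], [1.0, 1.0]))  # ~0.707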
"""
This script runs the class summary as a parallel multi-thread program
"""
import argparse
import threading

# Project-level imports (names follow the repository layout):
# APP_ROOT, InputFileCython, ProducerConsumerClassSummary

if __name__ == '__main__':
    """
    args
        -r: set the WordNet link list
            Example: '/../../Data/wn_summary_list.txt'
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_word_net_list_file', '-r', default='',
                        help='set the WordNet list file')
    args = parser.parse_args()
    # WordNet file
    wn_summary_split_list = APP_ROOT + "/../../Data/" + args.read_word_net_list_file
    input_module = InputFileCython(wn_summary_split_list)
    input_module.input_special_format_file()
    file_list = input_module.get_file_data()
    # Wiki vector
    wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
    input_module = InputFileCython(wiki_vector_file_name)
    input_module.input_fast_large_file()
    wiki_vector = input_module.get_vector()
    producer_consumer = ProducerConsumerClassSummary()
    # args must be a tuple (or list) of positional arguments for the target
    multi_thread_producer_crawl_instance = threading.Thread(
        target=producer_consumer.producer_run, args=(file_list,))
    multi_thread_consumer_crawl_instance = threading.Thread(
        target=producer_consumer.consumer_run, args=(wiki_vector,))
    multi_thread_producer_crawl_instance.start()
    multi_thread_consumer_crawl_instance.start()
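`ProducerConsumerClassSummary` itself is not shown in this section. A minimal sketch of the producer-consumer pattern it names, using `queue.Queue` as the hand-off between the two threads; the class name, sentinel convention, and toy data are illustrative, not the repository's implementation:

import queue
import threading

# Illustrative producer-consumer sketch, not the repository's actual class.
class ProducerConsumerSketch(object):
    def __init__(self):
        self.task_queue = queue.Queue()

    def producer_run(self, file_list):
        for file_name in file_list:
            self.task_queue.put(file_name.strip())
        self.task_queue.put(None)  # sentinel: no more work

    def consumer_run(self, wiki_vector):
        while True:
            file_name = self.task_queue.get()
            if file_name is None:
                break
            # The real consumer would build the class summary
            # from file_name and wiki_vector here.
            print(file_name, len(wiki_vector))

pc = ProducerConsumerSketch()
producer = threading.Thread(target=pc.producer_run, args=(["a.txt", "b.txt"],))
consumer = threading.Thread(target=pc.consumer_run, args=({"犬": [1.0, 0.0]},))
producer.start()
consumer.start()
producer.join()
consumer.join()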