def main(self):
    """Split ``self.raw_list`` into kept and discarded records.

    Only complex applications need to override this method.

    Steps, in order:

    1. ``self.get_contents()`` and ``self.get_keywords()`` are called
       (presumably they populate ``self.raw_list`` and
       ``self.keywords_list`` -- confirm against their definitions).
    2. ``self.count_list`` is reset to one zero per entry of
       ``self.keywords_list``.
    3. For every record, the column ``self.data_column_index`` is converted
       with ``to_unicode`` and stored in ``self.current_string``, then
       ``self.sources_finder()`` is called.
    4. If ``self.current_result`` is truthy, the record and its index are
       appended to ``self.cleaned_list`` / ``self.clean_index``; otherwise
       they go to ``self.trash_list`` / ``self.trash_index``.

    Classification stays best-effort: a failure while recording one record
    is logged and the loop moves on, but ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed.

    :return: None
    """
    import logging

    logger = logging.getLogger(__name__)

    self.get_contents()
    self.get_keywords()
    self.count_list = [0] * len(self.keywords_list)

    for index, row in enumerate(self.raw_list):
        self.current_string = to_unicode(row[self.data_column_index])
        # Presumably sets self.current_result for the current string.
        self.sources_finder()
        try:
            if self.current_result:
                self.clean_index.append(index)
                self.cleaned_list.append(row)
            else:
                self.trash_index.append(index)
                self.trash_list.append(row)
        except Exception:
            # Keep going with the remaining records, but leave a trace
            # instead of hiding the failure completely.
            logger.exception("Failed to classify record at index %d", index)

    # NOTE: when batch-processing several data files or several keyword
    # files with the same instance, self.raw_list and self.keywords_list
    # need to be reset (e.g. to None) between runs; this method does not
    # do that.
    return
def main(self):
    """Flag records that match any keyword and optionally print statistics.

    Only complex applications need to override this method.

    Steps, in order:

    1. ``self.get_contents()`` and ``self.get_keywords()`` are called
       (presumably they populate ``self.raw_list`` and
       ``self.keywords_list`` -- confirm against their definitions).
    2. ``self.count_list`` is reset to one zero per keyword.
    3. For every record, the column ``self.data_column_index`` is converted
       with ``to_unicode`` and stored in ``self.current_string``. Each
       keyword is then placed in ``self.current_keywords`` and
       ``self.keywords_finder()`` is called.
    4. Every truthy ``self.current_result`` appends the record to
       ``self.trash_list`` and the result to ``self.result_list`` and
       increments that keyword's counter. When ``self.one_hit_strategy`` is
       truthy, matching stops at the first hit for that record.
    5. A record with at least one hit has its index appended to
       ``self.trash_index``; otherwise the record and its index go to
       ``self.cleaned_list`` / ``self.clean_index``.
    6. If ``self.show_process`` is truthy, the overall and per-keyword hit
       counts and percentages are printed.

    Matching stays best-effort: a failure while processing one record is
    logged and the loop moves on, but ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed.

    :return: None
    """
    import logging

    logger = logging.getLogger(__name__)

    self.get_contents()
    self.get_keywords()
    self.count_list = [0] * len(self.keywords_list)

    for index, row in enumerate(self.raw_list):
        self.current_string = to_unicode(row[self.data_column_index])
        hit = False
        try:
            for keywords_index, keywords in enumerate(self.keywords_list):
                self.current_keywords = keywords
                # Presumably sets self.current_result for this pairing.
                self.keywords_finder()
                if not self.current_result:
                    continue
                # trash_list and result_list grow together, one entry per
                # hit, so a record can appear more than once when
                # one_hit_strategy is off.
                self.trash_list.append(row)
                self.result_list.append(self.current_result)
                hit = True
                self.count_list[keywords_index] += 1
                if self.one_hit_strategy:
                    break

            if hit:
                self.trash_index.append(index)
            else:
                self.clean_index.append(index)
                self.cleaned_list.append(row)
        except Exception:
            # Keep going with the remaining records, but leave a trace
            # instead of hiding the failure completely.
            logger.exception("Failed to process record at index %d", index)

    if self.show_process:
        total_length = float(len(self.raw_list))

        def percentage(count):
            # An empty data set previously raised ZeroDivisionError here.
            if not total_length:
                return 0.0
            return count / total_length * 100

        keyword_count = sum(self.count_list)
        print(u'關鍵詞標記微博數量為 ' + str(keyword_count) + u' 占'
              + str(percentage(keyword_count)) + '%')
        print(u"{0}以下是關鍵詞標記的水贴{0}".format(u"-" * 30))
        for keyword, count in zip(self.keywords_list, self.count_list):
            print(u'关键词 "' + keyword + u'" 匹配的微博数量为 '
                  + str(count) + u' 占'
                  + str(percentage(count)) + '%')

    # NOTE: when batch-processing several data files or several keyword
    # files with the same instance, self.raw_list and self.keywords_list
    # need to be reset (e.g. to None) between runs; this method does not
    # do that.
    return