def _find(self, column, threshold):
    """Collect common prefix/suffix patterns shared by cell pairs in *column*.

    Compares every pair of cells in the column, records each shared
    prefix/suffix pattern on both cells involved, and returns the patterns
    whose pairwise occurrence count exceeds *threshold*.

    Args:
        column: index of the column of self.raw_data to scan.
        threshold: minimum pairwise occurrence count (exclusive) for a
            pattern to be reported.

    Returns:
        (pre_list, end_list): lists of frequent prefix and suffix patterns.
    """
    # Find all candidate features, preparing to count how often each
    # feature appears across the column.
    column_list = self.raw_data[:, column]
    pre_dict = {}
    end_dict = {}
    # BUG FIX: time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # time.perf_counter() is the documented replacement for elapsed timing.
    start = time.perf_counter()
    for i in range(self.row_number):
        # O(n^2) pairwise scan over all cells in the column.
        for j in range(i + 1, self.row_number):
            pre_pattern = PatternHelper.find_pre_common_str(
                column_list[i], column_list[j])
            end_pattern = PatternHelper.find_end_common_str(
                column_list[i], column_list[j])
            if pre_pattern != '':
                DictHelper.increase_dic_key(pre_dict, pre_pattern)
                self.cell_pre_patterns[i][column].append(pre_pattern)
                self.cell_pre_patterns[j][column].append(pre_pattern)
            if end_pattern != '':
                DictHelper.increase_dic_key(end_dict, end_pattern)
                self.cell_end_patterns[i][column].append(end_pattern)
                self.cell_end_patterns[j][column].append(end_pattern)
    print("find1 : {0}".format(time.perf_counter() - start))
    pre_list = [
        key for key, value in pre_dict.items() if value > threshold
    ]
    end_list = [
        key for key, value in end_dict.items() if value > threshold
    ]
    return pre_list, end_list
def score_column_candidate(self, column, recover_list, small_pattern_list):
    """Score each recovery candidate for *column*.

    A candidate earns one point for every already-resolved test column
    (recover list of length 1) whose stored patterns vote for the
    candidate's small pattern.

    Returns:
        dict mapping candidate -> number of columns that voted for it.
    """
    scores = {}
    for candidate in recover_list[column]:
        small_pattern = self.train.get_small_pattern(candidate, column)
        for other in range(self.column_number_test):
            # only columns that are already uniquely resolved may vote
            if len(recover_list[other]) != 1:
                continue
            if self.train.vote_for_column(column, small_pattern, other,
                                          small_pattern_list[other]):
                DictHelper.increase_dic_key(scores, candidate)
    return scores
def merge_dict():
    """Flatten each profile's per-feature dicts into one count dict.

    Loads the converted profiles, merges every feature's keys of each
    profile into a single occurrence-count dict, and persists the merged
    list in both binary and text form.
    """
    profile_dict_list = StoreHelper.load_data(
        '../resource/convert_profile.dat', [])
    merged_list = []
    for profile_dict in profile_dict_list:
        merged = {}
        for feature_dict in profile_dict.values():
            for key in feature_dict:
                DictHelper.increase_dic_key(merged, key)
        merged_list.append(merged)
    StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
    StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
def generate_probability_dict(file_content_list):
    """Build a bigram association dict from raw text documents.

    Counts single words and adjacent word pairs line by line, then for
    each pair "a b" stores max(count(a b)/count(b), count(a b)/count(a)).

    Args:
        file_content_list: iterable of raw text documents.

    Returns:
        dict mapping "word_a word_b" -> association probability in (0, 1].
    """
    # statistics: single word and consecutive two-word counts
    single_word_dict = {}
    two_word_dict = {}
    for file_content in file_content_list:
        for line in file_content.splitlines():
            word_list = SegmentHelper.segment_text(line)
            # BUG FIX: an empty segmentation (e.g. blank line) previously
            # fell into the else branch and crashed on word_list[-1].
            if not word_list:
                continue
            if len(word_list) == 1:
                DictHelper.increase_dic_key(single_word_dict, word_list[0])
            else:
                for i in range(len(word_list) - 1):
                    DictHelper.increase_dic_key(single_word_dict, word_list[i])
                    DictHelper.increase_dic_key(
                        two_word_dict,
                        "%s %s" % (word_list[i], word_list[i + 1]))
                DictHelper.increase_dic_key(single_word_dict, word_list[-1])
    # compute two-word probability; use the loop's own `count` instead of
    # re-looking the pair up in two_word_dict twice per iteration
    prob_a_b_dict = {}
    for words, count in two_word_dict.items():
        word_a, word_b = words.split(' ')
        pro_a_b = count * 1.0 / single_word_dict[word_b]
        pro_b_a = count * 1.0 / single_word_dict[word_a]
        prob_a_b_dict[words] = max(pro_a_b, pro_b_a)
    return prob_a_b_dict
def get_combine_company_dict(store_data_file):
    """Aggregate US listed-company names from two sources into one dict.

    Reads two tabs of an Excel workbook plus a CSV, counts every name
    whose normalized form is non-empty, and stores the combined dict.
    Note: the *raw* (un-normalized) name is used as the dict key; the
    normalized form only filters out empty names.

    Args:
        store_data_file: path the combined dict is persisted to.
    """
    company_dict = {}
    # Excel source: two tabs of the same workbook
    for tab in range(2):
        header, raw_data = ExcelHelper.read_excel(
            '../resource/us_list_company2.xlsx', tab)
        for idx in range(raw_data.shape[0]):
            raw_name = raw_data[idx][0]
            if len(SegmentHelper.normalize(str(raw_name).strip())) > 0:
                DictHelper.increase_dic_key(company_dict, raw_name)
    # CSV source
    df = pd.read_csv('../resource/us_list_company_1.csv')
    name_serial = df['Name']
    for idx in range(df.shape[0]):
        if len(SegmentHelper.normalize(name_serial[idx])) > 0:
            DictHelper.increase_dic_key(company_dict, name_serial[idx])
    StoreHelper.store_data(company_dict, store_data_file)
def _add_and_remove(self, words_dict):
    """Merge observed phrase counts into self.phrase_dict.

    New phrases are added with their full count; phrases already present
    with a smaller count are topped up by the missing delta. In both
    cases the single-word counts the phrase absorbs are decremented by
    the same amount via _count_down_single_word. A stored count larger
    than the observed one is reported as an inconsistency.

    Args:
        words_dict: dict mapping phrase -> observed occurrence count.
    """
    for words, count in words_dict.items():
        if words in self.phrase_dict:
            stored = self.phrase_dict[words]
            if stored < count:
                # BUG FIX: compute the delta once, BEFORE increase_dic_key
                # mutates self.phrase_dict[words] (as its use throughout
                # this file implies it does). The original re-evaluated
                # `count - self.phrase_dict[words]` after the increase, so
                # _count_down_single_word was always called with 0.
                delta = count - stored
                DictHelper.increase_dic_key(self.phrase_dict, words, delta)
                self._count_down_single_word(words, delta)
            elif stored > count:
                print(
                    "Warning: phrase match times little than origin split: %s"
                    % words)
        else:
            DictHelper.increase_dic_key(self.phrase_dict, words, count)
            self._count_down_single_word(words, count)
def get_dict_pattern(context, _dict, convert=True):
    """Count how often each dictionary key occurs in *context*.

    Keys shaped like "a ... b" match "a" followed by up to five arbitrary
    words and then "b"; any other key is matched as a whole word. When
    *convert* is True and the key's mapped value is not an int, the value
    (rather than the key) becomes the result key.

    Returns:
        dict mapping key (or converted value) -> total match count.
    """
    match_result = {}
    for key in _dict.keys():
        parts = key.split(' ')
        if len(parts) >= 3 and parts[1] == '...':
            # gap pattern: head token, 0-5 filler words, then the tail
            pattern = (re.escape(parts[0]) + r'( \w+){0,5} ' +
                       re.escape(' '.join(parts[2:])))
            match_times = len(re.findall(pattern, context))
        else:
            key = key.strip()
            match_times = len(
                re.findall(r'\b' + re.escape(key) + r'\b', context))
        if match_times <= 0:
            continue
        if convert is True and type(_dict[key]) is not int:
            DictHelper.increase_dic_key(match_result, _dict[key], match_times)
        else:
            DictHelper.increase_dic_key(match_result, key, match_times)
    return match_result
def convert(self, skill_dict, discipline_dict, education_dict,
            responsibility_dict, year_convert_file):
    """Extract category phrases and merge them into self.phrase_dict.

    Builds the phrase dict of each category, folds every one into the
    shared phrase dictionary (in the original fixed order), bumps each
    known new word, and returns the phrases grouped by category.

    Returns:
        dict of category name -> view of that category's phrases.
    """
    year_phrases = self._get_working_year_words(year_convert_file)
    skill_phrases = self._get_skill_words(skill_dict)
    discipline_phrases = self._get_discipline_words(discipline_dict)
    education_phrases = self._get_education_words(education_dict)
    responsibility_phrases = self._get_responsibility_words(
        responsibility_dict)
    # merge order matters: year, skill, discipline, education, responsibility
    for phrase_dict in (year_phrases, skill_phrases, discipline_phrases,
                        education_phrases, responsibility_phrases):
        self._add_and_remove(phrase_dict)
    for word in self.new_words_list:
        DictHelper.increase_dic_key(self.phrase_dict, word)
    return {
        "education": education_phrases.keys(),
        "major": discipline_phrases.keys(),
        "skills": skill_phrases.keys(),
        "working-year": year_phrases.keys(),
        "responsibility": responsibility_phrases.keys(),
    }
def _get_full_relation(self, column1, column2, row):
    """Record pattern co-occurrence between two columns for one row.

    For both the prefix patterns and the suffix patterns of *row*, every
    cross-column pattern pair is counted in both directions of the
    corresponding relation table, keyed "pattern1|pattern2".
    """
    # (per-cell patterns, relation table) for prefix then suffix patterns
    pattern_sources = (
        (self.cell_pre_patterns, self.pre_pattern_relation),
        (self.cell_end_patterns, self.end_pattern_relation),
    )
    for cell_patterns, relation in pattern_sources:
        patterns1 = cell_patterns[row][column1]
        patterns2 = cell_patterns[row][column2]
        for p1 in patterns1:
            for p2 in patterns2:
                DictHelper.increase_dic_key(
                    relation[column1][column2], p1 + "|" + p2)
                DictHelper.increase_dic_key(
                    relation[column2][column1], p2 + "|" + p1)
def _collect_words_dict(self):
    """Tally, for every key, how many blob dicts it appears in.

    Returns:
        dict mapping key -> number of dicts in self.blob_dict_list
        that contain it.
    """
    totals = {}
    for blob_dict in self.blob_dict_list:
        for key in blob_dict:
            DictHelper.increase_dic_key(totals, key)
    return totals