def _obtain_candidate_keywords(list_all_dialogs, candi_kw_path, min_kw_freq=1, load_file_if_exists=True): r"""Obtain and save the candidate keywords used for extracting keywords. Inputs: list_all_dialogs, candi_kw_path, load_file_if_exists # TODO - **list_all_dialogs**: - **candi_kw_path**: - **load_file_if_exists**: Outputs: candi_keywords - **candi_keywords**: a 'list' containing all the candidate keywords """ if load_file_if_exists: if os.path.isfile(candi_kw_path): with open(candi_kw_path,'r') as f: candi_keywords = [kw.strip() for kw in f.readlines()] print('Loading candidate keywords from {}'.format(candi_kw_path)) print('Total candidate keywords count: ', len(candi_keywords)) return candi_keywords if not list_all_dialogs: raise Exception('no dialogs provided for obtaining candidate keywords') candi_kw_dir = os.path.dirname(candi_kw_path) if not os.path.exists(candi_kw_dir): os.makedirs(candi_kw_dir) print('Obtaining candidate keywords...') # initialization candi_keywords = [] kw_counter = collections.Counter() kw_extractor = KeywordExtractor() # extract possible keywords for dialog in tqdm(list_all_dialogs): for utterance in dialog: cur_keywords = kw_extractor.candi_extract(utterance) kw_counter.update(cur_keywords) candi_keywords.extend(cur_keywords) # delete the keywords occurring less than specified times (indicated by 'min_kw_freq'). rare_keywords = [kw for kw, freq in kw_counter.most_common() if freq < min_kw_freq] candi_keywords = [kw for kw, freq in kw_counter.most_common() if freq >= min_kw_freq] # delete keywords containing only one single letter single_letter_keywords = [kw for kw in candi_keywords if len(kw) < 2] candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2] # print the information of candidate keywords print('rare keywords count: ', len(rare_keywords)) print('single letter keywords count: ', len(single_letter_keywords)) print('total candidate keywords count(before cleaning): ', len(kw_counter.items())) print('total candidate keywords count(after cleaning): ', len(candi_keywords)) print('Saving candi_keywords into {}...'.format(candi_kw_path)) with open(candi_kw_path,'w') as f: for keyword in candi_keywords: f.write(keyword + '\n') return candi_keywords
def _obtain_candidate_keywords(self, load_file_if_exists=True): r"""Obtains and saves the candidate keywords used for extracting keywords. Args: load_file_if_exists: A 'bool' indicating whether load candi_keywords file if it exists. Returns: candi_keywords: A 'list' containing all the candidate keywords. """ if load_file_if_exists: candi_keywords_name = '../data/{}/candi_keywords.txt'.format(self.output_data_dir) if os.path.isfile(candi_keywords_name): with open(candi_keywords_name,'r') as f: candi_keywords = [kw.strip() for kw in f.readlines()] print('Loading candidate keywords from {}'.format(candi_keywords_name)) print('Total candidate keywords count: ', len(candi_keywords)) return candi_keywords print('Obtaining candidate keywords...') # Initialization candi_keywords = [] kw_counter = collections.Counter() kw_extractor = KeywordExtractor() # Extracts possible keywords. for dialog in tqdm(self.list_all_dialogs): for utterance in dialog: cur_keywords = kw_extractor.candi_extract(utterance) kw_counter.update(cur_keywords) candi_keywords.extend(cur_keywords) # Deletes the keywords occurring less than specified times rare_keywords = [kw for kw, freq in kw_counter.most_common() if freq < self.min_kw_freq] candi_keywords = [kw for kw, freq in kw_counter.most_common() if freq >= self.min_kw_freq] # Deletes keywords containing only one single letter single_letter_keywords = [kw for kw in candi_keywords if len(kw) < 2] candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2] # Writes candidate keywords into file candidate_keywords_output_path = '../data/{}/candi_keywords.txt'.format( self.output_data_dir) with open(candidate_keywords_output_path,'w') as f: for keyword in candi_keywords: f.write(keyword + '\n') return candi_keywords