def save_corpus_one_extracted_feature_by_name(
         self, feature_name, left=0, right=0):
     """Run the by-name feature saver over every file of the corpus.

     Each name returned by ``tp.get_filenames_from_dir(self._file_dir)`` is
     appended to ``self._file_dir`` and handed to
     ``save_extracted_feature_by_name``.

     Args:
         feature_name: Forwarded unchanged to the per-file saver.
         left: Forwarded unchanged to the per-file saver (presumably the
             amount of left context to keep -- confirm against
             ``save_extracted_feature_by_name``).
         right: Forwarded unchanged to the per-file saver (presumably the
             amount of right context to keep -- confirm as above).
     """
     for corpus_file in tp.get_filenames_from_dir(self._file_dir):
         file_path = self._file_dir + corpus_file
         save_extracted_feature_by_name(file_path, feature_name, left, right)
 def save_corpus_one_extracted_feature_by_regex(
         self, regex, left=0, right=0, feature_name=None):
     """Run the by-regex feature saver over every file of the corpus.

     Each name returned by ``tp.get_filenames_from_dir(self._file_dir)`` is
     appended to ``self._file_dir`` and handed to
     ``save_extracted_feature_by_regex``.

     Args:
         regex: Forwarded unchanged to the per-file saver.
         left: Forwarded unchanged to the per-file saver (presumably the
             amount of left context to keep -- confirm against
             ``save_extracted_feature_by_regex``).
         right: Forwarded unchanged to the per-file saver (presumably the
             amount of right context to keep -- confirm as above).
         feature_name: Forwarded unchanged to the per-file saver; defaults
             to ``None``.
     """
     for corpus_file in tp.get_filenames_from_dir(self._file_dir):
         file_path = self._file_dir + corpus_file
         save_extracted_feature_by_regex(
             file_path, regex, left, right, feature_name)
 def save_corpus_extracted_features(self, left=0, right=0):
     """Extract and save every feature in ``fs.FEATURE_DICT`` for each file.

     For each file name returned by
     ``tp.get_filenames_from_dir(self._file_dir)`` the file is read, tagged
     with ``tp.get_modified_pos_tagged_text`` and then searched once per
     pattern with ``fs.feature_finder``.  Every result set is passed to
     ``save_extracted_feature_by_res`` together with the name obtained from
     ``get_feature_name_by_regex``.  A confirmation message is printed once
     all files have been processed.

     Args:
         left: Forwarded unchanged to ``save_extracted_feature_by_res``
             (presumably the amount of left context to keep -- confirm).
         right: Forwarded unchanged to ``save_extracted_feature_by_res``
             (presumably the amount of right context to keep -- confirm).
     """
     other_feature_patterns = list(fs.FEATURE_DICT.values())
     filenames = tp.get_filenames_from_dir(self._file_dir)
     for filename in filenames:
         # The path does not depend on the pattern, so build it once per file.
         file_p = self._file_dir + filename
         # Use a context manager so the handle is closed as soon as the file
         # has been read instead of being left for the garbage collector.
         with open(file_p, 'rb') as corpus_file:
             # NOTE(review): on Python 3, str() of a bytes object yields its
             # repr ("b'...'") rather than decoded text.  Kept unchanged so
             # the tagger input stays the same -- confirm whether an
             # explicit decode was intended.
             raw_text = str(corpus_file.read())
         tagged_text = tp.get_modified_pos_tagged_text(raw_text)
         for pattern in other_feature_patterns:
             results = fs.feature_finder(pattern, tagged_text)
             feature_name = get_feature_name_by_regex(pattern)
             save_extracted_feature_by_res(file_p, results, feature_name,
                                           tagged_text, left, right)
     print('The extracted features of the corpus saved!')
 def corpus_feature_fre_extraction(
         self,
         normalized_rate=100,
         save_tagged_corpus=True,
         save_extracted_features=True,
         left=0,
         right=0):
     """Collect one row of feature data per corpus file and export the table.

     The first row is a header: four fixed column titles followed by the
     keys of ``fs.FEATURE_DICT``.  One further row per file is obtained from
     ``get_single_file_feature_fre``.  The complete list (header included as
     its first row) is wrapped in a ``pandas.DataFrame`` and written to
     ``'Feature_Fre_Extracted.xlsx'`` (a relative path).

     Args:
         normalized_rate: Forwarded unchanged to
             ``get_single_file_feature_fre``.
         save_tagged_corpus: Forwarded unchanged to
             ``get_single_file_feature_fre``.
         save_extracted_features: Forwarded unchanged to
             ``get_single_file_feature_fre``.
         left: Forwarded unchanged to ``get_single_file_feature_fre``.
         right: Forwarded unchanged to ``get_single_file_feature_fre``.

     Returns:
         The list of rows: the header list followed by whatever
         ``get_single_file_feature_fre`` returned for each file.
     """
     header = [
         'Filename', 'Words', 'Mean word length', 'Type-token ratio'
     ] + list(fs.FEATURE_DICT.keys())
     table = [header]
     table.extend(
         get_single_file_feature_fre(
             self._file_dir + corpus_file, normalized_rate,
             save_tagged_corpus, save_extracted_features, left, right)
         for corpus_file in tp.get_filenames_from_dir(self._file_dir))
     pd.DataFrame(table).to_excel('Feature_Fre_Extracted.xlsx')
     return table
 def save_cleaned_corpus(self):
     """Run ``save_single_cleaned_text`` on every file of the corpus.

     File names come from ``tp.get_filenames_from_dir(self._file_dir)`` and
     are appended to ``self._file_dir`` to form the path given to the
     per-file saver.  A confirmation message is printed afterwards.
     """
     for corpus_file in tp.get_filenames_from_dir(self._file_dir):
         file_path = self._file_dir + corpus_file
         save_single_cleaned_text(file_path)
     print('Cleaned corpus saved!')
 def get_filepath_list(self):
     """Return the corpus file paths as one newline-separated string.

     Each name returned by ``tp.get_filenames_from_dir(self._file_dir)`` is
     prefixed with ``self._file_dir``; the resulting paths are joined with
     ``'\\n'``.

     Returns:
         A single string with one path per line (empty when there are no
         file names).
     """
     corpus_files = tp.get_filenames_from_dir(self._file_dir)
     return '\n'.join(
         self._file_dir + corpus_file for corpus_file in corpus_files)