def _det_prior_features_filepath(self, manipulator_name, manipulations, project_settings):
    assert len(manipulations) != 0
    manipulator_map = self.manipulator_map
    manipulator_name = self.manipulator_name  # note: shadows the argument; the instance's own name is used
    m_order = manipulator_map[manipulator_name]
    tagger_module = importlib.import_module('feature.tagger')
    counter = m_order
    ord_man_map = flip_dict(manipulator_map)
    # Walk backwards through the manipulator chain, skipping Taggers, until a
    # manipulator that actually wrote out features is found.
    while counter > 0:
        cand_man_name = ord_man_map[counter - 1]
        init_man = list(filter(lambda x: list(x.keys())[0] == cand_man_name,
                               manipulations))[0][cand_man_name]['initialized_manipulator']
        if issubclass(init_man.__class__, getattr(tagger_module, 'Tagger')):
            counter = counter - 1
        else:
            prior_man_name = cand_man_name
            prior_manipulator_feature_names_filepath = self._det_output_features_filepath(prior_man_name)
            return prior_manipulator_feature_names_filepath
    # No non-Tagger manipulator precedes this one; fall back to the clean input features.
    prior_manipulator_feature_names_filepath = load_clean_input_file_filepath(project_settings, 'feature_names')
    return prior_manipulator_feature_names_filepath
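# flip_dict is used throughout these snippets but not defined here. A minimal
# sketch of what the call sites imply (a key/value inversion; assumes values
# are unique and hashable) -- an assumption, not necessarily the project's
# actual helper:
def flip_dict(d):
    """Invert a dict, e.g. {'feat_a': 0} -> {0: 'feat_a'}."""
    return {v: k for k, v in d.items()}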
def _store_indices_and_features(self, untouched_indices, touch_indices):
    self.untouched_indices = untouched_indices
    self.touch_indices = touch_indices
    feature_names_filepath = self.prior_manipulator_feature_names_filepath
    inv_column_map = load_inv_column_map(feature_names_filepath)
    column_map = flip_dict(inv_column_map)
    # Keep only the features whose column indices were left untouched.
    filtered_features = {k: v for k, v in column_map.items() if k in untouched_indices}
    self.features = filtered_features
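# load_inv_column_map is assumed to return a {feature_name: column_index}
# dict, given the way its result is flipped and filtered above. A hypothetical
# sketch, assuming the feature-names file stores one name per line in column
# order (the real loader may differ):
def load_inv_column_map(filepath):
    with open(filepath) as f:
        return {line.strip(): i for i, line in enumerate(f)}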
def gen_output(self):
    clf = self.base_algorithm
    prior_feature_names_filepath = self.prior_manipulator_feature_names_filepath
    inv_col_map = load_inv_column_map(prior_feature_names_filepath)
    col_map = flip_dict(inv_col_map)
    num_cols = len(col_map)
    column_names = [col_map[i] for i in range(num_cols)]
    # Render the fitted decision tree as a PNG in the artifact directory.
    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=column_names)
    graph = graphviz.Source(dot_data, format='png')
    artifact_dir = self.artifact_dir
    model_name = self.model_name
    graph.render(filename=os.path.join(artifact_dir, slugify(model_name) + '-tree'))
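# A self-contained sketch of the same export-and-render flow, using
# scikit-learn's iris data; requires the graphviz Python package and a system
# Graphviz install:
from sklearn import tree
from sklearn.datasets import load_iris
import graphviz

iris = load_iris()
clf = tree.DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=iris.feature_names)
graphviz.Source(dot_data, format='png').render(filename='iris-tree')  # writes iris-tree.png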
def transform(self, X_touch, dataset_name):  # pandas dependent
    # Reverse a one-hot encoding: the argmax column name keys into idx_val_map
    # to recover the original numeric value.
    max_col = X_touch.idxmax(axis=1)
    idx_val_map = self.idx_val_map
    num_column = max_col.apply(lambda x: idx_val_map[x])
    if hasattr(self, 'val_map'):
        val_map = self.val_map
        inv_val_map = flip_dict(val_map)
        for val in inv_val_map:
            if val not in num_column.values:
                # val missing from column for this dataset_name; possibly
                # tweak val_map in the as_numeric transform.
                pass
    return pd.DataFrame(num_column, index=X_touch.index)
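# A minimal pandas illustration of the idxmax-based reversal above, with a
# made-up idx_val_map; the one-hot frame's column names map back to values:
import pandas as pd

one_hot = pd.DataFrame({'color_red': [1, 0], 'color_blue': [0, 1]})
idx_val_map = {'color_red': 0, 'color_blue': 1}  # hypothetical mapping
recovered = one_hot.idxmax(axis=1).map(idx_val_map)
print(recovered.tolist())  # [0, 1]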
def _det_prior_features_filepath(self, manipulator_name, manipulations, project_settings):
    if self.manipulator_name == 'final_algorithm':
        len_manipulations = len(manipulations)
        if len_manipulations == 0:
            prior_manipulator_feature_names_filepath = load_clean_input_file_filepath(
                project_settings, 'feature_names')
        else:
            manipulator_map = self.manipulator_map
            ord_manip_lookup = flip_dict(manipulator_map)
            pm_order = len_manipulations - 1
            prior_manipulator_name = ord_manip_lookup[pm_order]
            prior_manipulator_feature_names_filepath = self._det_output_features_filepath(
                prior_manipulator_name)
        return prior_manipulator_feature_names_filepath
    else:
        return super(Wrapper, self)._det_prior_features_filepath(
            manipulator_name, manipulations, project_settings)
def transform(self, X_mat, y, dataset_name):
    # TODO: select according to final filter
    filters = self.filters
    log_prefix = dataset_name
    num_filters = len(filters)
    X_filt = X_mat
    if num_filters > 0:
        # print("\t[" + log_prefix + "] Filtering selected features")
        first_filter = self._fetch_initialized_filter(filters[0])
        final_filter = self._fetch_initialized_filter(filters[-1])
        prior_features_filepath = first_filter.prior_manipulator_feature_names_filepath
        orig_inv_column_map = load_inv_column_map(prior_features_filepath)
        final_filter_features = final_filter.features
        inv_column_map = flip_dict(final_filter_features)
        # Map the surviving feature names back to their original column indices.
        filtered_indices = [orig_inv_column_map[col] for col in inv_column_map]
        sorted_filtered_indices = sorted(filtered_indices)
        X_filt = X_mat.loc[:, sorted_filtered_indices]
        X_filt.columns = list(final_filter.features.keys())
        assert not pd.isnull(X_filt).any(axis=1).any()  # TODO: pandas dependent
    return X_filt, y
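# A small pandas sketch of the index-based selection above: original column
# positions are recovered by name, sorted, and used to subset the matrix
# (the name -> index map here is hypothetical):
import pandas as pd

X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=[0, 1, 2])
orig_inv_column_map = {'a': 0, 'b': 1, 'c': 2}
keep = sorted(orig_inv_column_map[name] for name in ('c', 'a'))
X_filt = X.loc[:, keep]
X_filt.columns = ['a', 'c']
print(X_filt)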
def load_prior_features(self):
    prior_transform_feature_names_filepath = self.prior_manipulator_feature_names_filepath
    prior_inv_col_map = load_inv_column_map(prior_transform_feature_names_filepath)
    prior_features = flip_dict(prior_inv_col_map)
    return prior_features
if __name__ == "__main__": parser = argparse.ArgumentParser( description='Processing the raw terms of services dataset and filter them ' 'based on the selected topics, on section and paragraph levels.') parser.add_argument('--paragraph', help='If set, will separate the paragraphs', action='store_true', default=True) parser.add_argument('--input_folder', help='The location of the folder containing the raw data.', default="../resources/tos-data") parser.add_argument('--output_folder', help='The location for the output folder.', default="../resources/tos-data-cleaned") args = parser.parse_args() grouped_keys = flip_dict(key_dict) input_folder = args.input_folder output_folder = args.output_folder if not os.path.exists(output_folder): os.mkdir(output_folder) cleaned_counter = 0 for f in tqdm(sorted(os.listdir(input_folder))): with open(os.path.join(input_folder, f)) as fp: tos = json.load(fp) new_json = {"level1_headings": [], "level2_headings": []} temp_title0 = None temp_title1 = None text0 = "" text1 = ""