Example #1
 def _det_prior_features_filepath(self, manipulator_name, manipulations,
                                  project_settings):
     assert len(manipulations) != 0
     manipulator_map = self.manipulator_map
     # NOTE: the manipulator_name argument is shadowed by the instance
     # attribute of the same name.
     manipulator_name = self.manipulator_name
     m_order = manipulator_map[manipulator_name]
     tagger_module = importlib.import_module('feature.tagger')
     counter = m_order
     ord_man_map = flip_dict(manipulator_map)
     # Walk backwards through the manipulator order, skipping Taggers, until
     # a manipulator that actually produced features is found.
     while counter > 0:
         cand_man_name = ord_man_map[counter - 1]
         entry = next(m for m in manipulations
                      if list(m.keys())[0] == cand_man_name)
         init_man = entry[cand_man_name]['initialized_manipulator']
         if isinstance(init_man, getattr(tagger_module, 'Tagger')):
             counter = counter - 1
         else:
             return self._det_output_features_filepath(cand_man_name)
     # No prior non-Tagger manipulator: fall back to the cleaned input
     # feature names.
     return load_clean_input_file_filepath(project_settings, 'feature_names')
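All of these snippets lean on a flip_dict helper that is never shown. A minimal sketch consistent with how it is used here (inverting a dict whose values are assumed unique and hashable) would be:

 def flip_dict(d):
     # Invert {key: value} to {value: key}; duplicate values would
     # silently collapse onto the last key encountered.
     return {v: k for k, v in d.items()}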
Example #2
 def _store_indices_and_features(self, untouched_indices, touch_indices):
     self.untouched_indices = untouched_indices
     self.touch_indices = touch_indices
     # Map the prior manipulator's feature file to {index: name} and keep
     # only the untouched columns.
     feature_names_filepath = self.prior_manipulator_feature_names_filepath
     inv_column_map = load_inv_column_map(feature_names_filepath)
     column_map = flip_dict(inv_column_map)
     filtered_features = {
         k: v
         for k, v in column_map.items() if k in untouched_indices
     }
     self.features = filtered_features
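load_inv_column_map is likewise external to these examples. From the way it is used (its flipped result is indexed by consecutive integers in Example #3 below), it appears to return a {feature_name: column_index} dict read from a feature-names file. A plausible sketch, assuming the file simply lists one feature name per line:

 def load_inv_column_map(filepath):
     # Hypothetical format: one feature name per line; the line number is
     # taken as the column index.
     with open(filepath) as f:
         return {line.strip(): i for i, line in enumerate(f)}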
Example #3
 def gen_output(self):
     clf = self.base_algorithm
     prior_feature_names_filepath = self.prior_manipulator_feature_names_filepath
     inv_col_map = load_inv_column_map(prior_feature_names_filepath)
     col_map = flip_dict(inv_col_map)
     num_cols = len(col_map)
     column_names = [col_map[i] for i in range(num_cols)]
     # Export the fitted tree to DOT source, then render it as a PNG
     # artifact named after the model.
     dot_data = tree.export_graphviz(clf,
                                     out_file=None,
                                     feature_names=column_names)
     graph = graphviz.Source(dot_data, format='png')
     artifact_dir = self.artifact_dir
     model_name = self.model_name
     graph.render(filename=artifact_dir + '/' + slugify(model_name) +
                  '-tree')
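gen_output follows the standard scikit-learn/graphviz export pattern. A self-contained version of that pattern, with a freshly fitted iris tree standing in for self.base_algorithm:

 from sklearn import tree
 from sklearn.datasets import load_iris
 import graphviz

 iris = load_iris()
 clf = tree.DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
 dot_data = tree.export_graphviz(clf, out_file=None,
                                 feature_names=iris.feature_names)
 # Writes 'iris-tree' (DOT source) and renders 'iris-tree.png'.
 graphviz.Source(dot_data, format='png').render(filename='iris-tree')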
Example #4
 def transform(self, X_touch, dataset_name):
     # Pandas dependent: the label of each row's maximum column identifies
     # the category, which idx_val_map converts back to a numeric value.
     max_col = X_touch.idxmax(axis=1)
     idx_val_map = self.idx_val_map
     num_column = max_col.apply(lambda x: idx_val_map[x])
     if hasattr(self, 'val_map'):
         val_map = self.val_map
         inv_val_map = flip_dict(val_map)
         for val in inv_val_map:
             if val not in num_column.values:
                 # val is absent from this dataset; possibly tweak val_map
                 # in the as_numeric transform.
                 pass
     return pd.DataFrame(num_column, index=X_touch.index)
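The idxmax(axis=1) call is the core of reversing a one-hot encoding: the label of each row's maximum column identifies the active category. In isolation, with a hypothetical idx_val_map:

 import pandas as pd

 X = pd.DataFrame({'col_0': [1, 0], 'col_1': [0, 1]})
 idx_val_map = {'col_0': 10, 'col_1': 20}  # hypothetical label -> value map
 num_column = X.idxmax(axis=1).apply(lambda c: idx_val_map[c])
 # The two one-hot rows collapse to the single numeric column [10, 20].
 print(pd.DataFrame(num_column, index=X.index))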
Example #5
 def _det_prior_features_filepath(self, manipulator_name, manipulations,
                                  project_settings):
     if self.manipulator_name == 'final_algorithm':
         len_manipulations = len(manipulations)
         if len_manipulations == 0:
             prior_manipulator_feature_names_filepath = load_clean_input_file_filepath(
                 project_settings, 'feature_names')
         else:
             manipulator_map = self.manipulator_map
             ord_manip_lookup = flip_dict(manipulator_map)
             pm_order = len_manipulations - 1
             prior_manipulator_name = ord_manip_lookup[pm_order]
             prior_manipulator_feature_names_filepath = self._det_output_features_filepath(
                 prior_manipulator_name)
         return prior_manipulator_feature_names_filepath
     else:
         return super(Wrapper, self)._det_prior_features_filepath(
             manipulator_name, manipulations, project_settings)
Example #6
 def transform(self, X_mat, y, dataset_name):
     # TODO: select according to final filter
     filters = self.filters
     log_prefix = dataset_name
     num_filters = len(filters)
     X_filt = X_mat
     if num_filters > 0:
         # print('\t[' + log_prefix + '] Filtering selected features')
         first_filter = self._fetch_initialized_filter(filters[0])
         final_filter = self._fetch_initialized_filter(filters[-1])
         prior_features_filepath = first_filter.prior_manipulator_feature_names_filepath
         orig_inv_column_map = load_inv_column_map(prior_features_filepath)
         final_filter_features = final_filter.features
         inv_column_map = flip_dict(final_filter_features)
         # Map the surviving feature names back to their original column
         # indices and select those columns in index order.
         filtered_indices = [orig_inv_column_map[col] for col in inv_column_map]
         sorted_filtered_indices = sorted(filtered_indices)
         X_filt = X_mat.loc[:, sorted_filtered_indices]
         X_filt.columns = list(final_filter.features.keys())
     # No row may contain a null after filtering.  TODO: pandas dependent
     assert not pd.isnull(X_filt).any(axis=1).any()
     return X_filt, y
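The selection step above reduces to indexing a DataFrame by a sorted list of surviving column labels and then relabeling the result. A standalone illustration with made-up names:

 import pandas as pd

 X_mat = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=[0, 1, 2])
 sorted_filtered_indices = [0, 2]       # columns that survived filtering
 X_filt = X_mat.loc[:, sorted_filtered_indices]
 X_filt.columns = ['feat_a', 'feat_b']  # hypothetical surviving feature names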
Example #7
 def load_prior_features(self):
     prior_transform_feature_names_filepath = self.prior_manipulator_feature_names_filepath
     prior_inv_col_map = load_inv_column_map(
         prior_transform_feature_names_filepath)
     prior_features = flip_dict(prior_inv_col_map)
     return prior_features
Example #8
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='Process the raw terms-of-service dataset and filter it '
                    'based on the selected topics, at the section and paragraph levels.')
    # NOTE: with action='store_true', default=True makes this flag
    # effectively always on; default=False is the usual pairing.
    parser.add_argument('--paragraph', help='If set, will separate the paragraphs',
                        action='store_true', default=True)
    parser.add_argument('--input_folder',
                        help='The location of the folder containing the raw data.',
                        default="../resources/tos-data")
    parser.add_argument('--output_folder', help='The location for the output folder.',
                        default="../resources/tos-data-cleaned")

    args = parser.parse_args()

    grouped_keys = flip_dict(key_dict)

    input_folder = args.input_folder
    output_folder = args.output_folder
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    cleaned_counter = 0
    for f in tqdm(sorted(os.listdir(input_folder))):
        with open(os.path.join(input_folder, f)) as fp:
            tos = json.load(fp)
            new_json = {"level1_headings": [], "level2_headings": []}
            temp_title0 = None
            temp_title1 = None
            text0 = ""
            text1 = ""