Exemplo n.º 1
0
 def __init__(self, label_file_path):
     """Set up configuration, labels, prediction storage and the ensembling model.

     :param label_file_path: path passed to ``self.path_to_numpy_array``; the
         value it returns is stored as ``self.label``
     """
     # The static config is attached first so it already exists on the
     # instance when path_to_numpy_array runs.
     self.global_config = StaticConfig()
     label_array = self.path_to_numpy_array(label_file_path)
     self.label = label_array
     # Starts empty; nothing is predicted at construction time.
     self.existing_predicts = []
     self.model = FeedforwardEnsemblingModel()
 def __init__(self):
     """Attach a new StaticConfig and a new DynamicConfig to the instance."""
     self.global_config = StaticConfig()
     self.dynamic_config = DynamicConfig()
 def __init__(self):
     """Create the instance with its own static and dynamic configuration objects."""
     # Static configuration is stored under ``global_config``.
     self.global_config = StaticConfig()
     # Dynamic configuration is stored under ``dynamic_config``.
     self.dynamic_config = DynamicConfig()
Exemplo n.º 4
0
 def __init__(self, tokenizer=None):
     """Store the static configuration and an optional tokenizer.

     :param tokenizer: object kept as ``self.tokenizer``; defaults to ``None``
         when the caller does not supply one
     """
     self.global_config = StaticConfig()
     # Stored as given; no validation or copying happens here.
     self.tokenizer = tokenizer
Exemplo n.º 5
0
 def __init__(self):
     """Initialise with no test data and no preprocessor, plus a StaticConfig."""
     # Placeholder until test data is assigned elsewhere.
     self.x_test = None
     self.global_config = StaticConfig()
     # Placeholder until a preprocessor is assigned elsewhere.
     self.preprocessor = None
Exemplo n.º 6
0
    def prepare_data_folder(self,
                            train_input_path,
                            output_folder_path,
                            train_test_factor=0.9,
                            debug_factor=1.0):
        '''
        Shuffle the raw training CSV, fit and pickle a tokenizer, and write train/test CSV files.

        Steps, in order:

        1. Create ``output_folder_path`` via ``create_folder``.
        2. Read ``train_input_path`` with pandas and shuffle all rows.
        3. Fit a ``text.Tokenizer`` (``num_words=self.global_config.max_features``) on the
           ``comment_text`` column of the whole shuffled data set (missing comments are replaced
           by the placeholder "CVxTz"), keep it as ``self.tokenizer`` and pickle it to
           ``<output_folder_path>/<self.global_config.tokenizer_save_name>``.
        4. Use the first ``int(rows * train_test_factor * debug_factor)`` shuffled rows as the
           training set, restricted to the id, comment and six label columns.
        5. Write the test set to ``<output_folder_path>/test.csv``. It is the whole shuffled data
           set when ``self.global_config.use_raw_for_test`` is truthy; otherwise the rows after
           the training slice when ``debug_factor == 1.0``, else the last 100 rows.
        6. For every name in ``StaticConfig().model_names`` create
           ``<output_folder_path>/<name>`` and write the training set to
           ``tr_train_<name>.csv`` inside it.

        NOTE(review): every per-name folder receives an identical copy of the training set; no
        per-label balancing or slicing is performed by this method.

        :param train_input_path: file path of the original training CSV
        :param output_folder_path: folder that receives the tokenizer pickle, test.csv and the
            per-name training folders
        :param train_test_factor: fraction of the rows used for training
        :param debug_factor: additional fraction applied to the training size; 1.0 means no debug
        :return: None
        '''

        print(
            "##################### preprocessor starts ########################"
        )
        # NOTE(review): model_names is read from this fresh StaticConfig while the rest of the
        # method reads self.global_config - confirm the two are meant to be interchangeable.
        global_config = StaticConfig()
        create_folder(output_folder_path)
        label_cols = [
            'id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
            'insult', 'identity_hate'
        ]
        raw_data = pd.read_csv(train_input_path)
        # frac=1 samples every row, i.e. a full shuffle.
        raw_data = raw_data.sample(frac=1)
        self.tokenizer = text.Tokenizer(
            num_words=self.global_config.max_features)
        list_sentences_raw_data = raw_data["comment_text"].fillna(
            "CVxTz").values
        self.tokenizer.fit_on_texts(list(list_sentences_raw_data))
        tokenizer_path = '{}/{}'.format(output_folder_path,
                                        self.global_config.tokenizer_save_name)
        # Context manager guarantees the pickle file is flushed and closed,
        # even if dumping the tokenizer raises.
        with open(tokenizer_path, "wb") as tokenizer_file:
            pickle.dump(self.tokenizer, tokenizer_file)

        train_data_size = int(raw_data.shape[0] * train_test_factor *
                              debug_factor)
        train = pd.DataFrame(raw_data[:train_data_size], columns=label_cols)

        if self.global_config.use_raw_for_test:
            test = pd.DataFrame(raw_data)
        else:
            test = pd.DataFrame(raw_data[train_data_size:] if debug_factor ==
                                1.0 else raw_data[-100:])
        test_name = "{}/{}".format(output_folder_path, "test.csv")
        test.to_csv(test_name)

        for label_name in global_config.model_names:
            label_output = output_folder_path + "/" + label_name
            create_folder(label_output)
            sub_train_output_file_path = '{}/tr_train_{}.csv'.format(
                label_output, label_name)
            # The DataFrame index is written as the first CSV column.
            train.to_csv(sub_train_output_file_path)
            # Two placeholders so the output path is actually reported.
            print('output train for No. {} subset to file {}'.format(
                label_name, sub_train_output_file_path))
Exemplo n.º 7
0
        train_data_size = int(raw_data.shape[0] * train_test_factor *
                              debug_factor)
        train = pd.DataFrame(raw_data[:train_data_size], columns=label_cols)

        if self.global_config.use_raw_for_test:
            test = pd.DataFrame(raw_data)
        else:
            test = pd.DataFrame(raw_data[train_data_size:] if debug_factor ==
                                1.0 else raw_data[-100:])
        test_name = "{}/{}".format(output_folder_path, "test.csv")
        test.to_csv(test_name)

        for label_name in global_config.model_names:
            label_output = output_folder_path + "/" + label_name
            create_folder(label_output)
            sub_train_output_file_path = '{}/tr_train_{}.csv'.format(
                label_output, label_name)
            train.to_csv(sub_train_output_file_path)  # , index=False)
            print('output train for No. {} subset to file '.format(
                label_name, sub_train_output_file_path))


if __name__ == "__main__":
    # Demo run: build the data folder from ./input/train.csv using the
    # train/test ratio from StaticConfig, with debug sampling disabled.
    processor = SeqProcessor()
    split_ratio = StaticConfig().train_test_factor
    processor.prepare_data_folder(
        './input/train.csv',
        './preprocessing_wrapper_demo_output',
        train_test_factor=split_ratio,
        debug_factor=1.0,
    )
Exemplo n.º 8
0
 def __init__(self, label_file_path):
     """Attach the static configuration and read the label CSV.

     :param label_file_path: path of the CSV file read with ``pd.read_csv``;
         the resulting DataFrame is stored as ``self.label``
     """
     self.global_config = StaticConfig()
     label_frame = pd.read_csv(label_file_path)
     self.label = label_frame
Exemplo n.º 9
0
    def __init__(self):
        """Start with an empty data-set list, a StaticConfig and no preprocessor."""
        # Starts empty; nothing is loaded at construction time.
        self.data_sets = []
        self.global_config = StaticConfig()
        # Placeholder until a preprocessor is assigned elsewhere.
        self.preprocessor = None
Exemplo n.º 10
0
 def __init__(self):
     """Attach a StaticConfig and a new Predictor to the instance."""
     self.global_config = StaticConfig()
     # A Predictor is constructed for every instance.
     self.predictor = Predictor()