Exemplo n.º 1
0
    # LOAD DATA
    # Start from the default corpus; it is replaced below when an external
    # file is supplied on the command line.
    # (Alternative loader: load_50_authors_preprocessed_data().)
    train_df = load_50_auth_data()
    referance_col = 'text'  # column that is exposed to the model as "text"
    ngram = 1
    if len(sys.argv) > 1:
        # Command-line overrides. Each option's value is taken as the first
        # element of the entry that command_line_args returns for it.
        arg_dict = command_line_args(argv=sys.argv)

        if "file" in arg_dict:
            input_data_path = str(arg_dict.get('file')[0])
            print("reading from external data file:" + input_data_path)
            # Bind to train_df (previously a separate, otherwise unused
            # df_train) so the external data is what gets split below.
            train_df = pd.read_csv(input_data_path)
        if "preprocess" in arg_dict:
            # Preprocess whichever frame is active. The old code read
            # df_train here, which raised NameError unless "file" was also
            # given, and its result never reached the split.
            train_df = preprocess_text(train_df)
            preprocess_mode = str(arg_dict.get('preprocess')[0])
            # Presumably preprocess_text adds these derived columns --
            # TODO confirm against its implementation.
            column_by_mode = {
                'POS': 'text_pos_tag_pairs',
                'ENT': 'text_with_entities',
                'CLN': 'text_cleaned',
            }
            # Unrecognised modes keep the current reference column.
            referance_col = column_by_mode.get(preprocess_mode, referance_col)
        if "ngram" in arg_dict:
            ngram = int(arg_dict.get('ngram')[0])

    xtrain, xtest, ytrain, ytest = train_vali_split(train_df)

    # Reduce each feature split to the single reference column and expose it
    # under the fixed name "text", whichever source column was selected.
    xtrain = pd.DataFrame(xtrain[referance_col])
    xtrain = xtrain.rename(columns={referance_col: "text"})

    xtest = pd.DataFrame(xtest[referance_col])
    xtest = xtest.rename(columns={referance_col: "text"})
Exemplo n.º 2
0
 def transform(self, df):
     """Return the preprocessed frame, using ``self.name`` as a pickle cache.

     When no file exists at ``self.name``, ``df`` is run through
     ``preprocess_text`` and the result is pickled to that path. The return
     value is always what is read back from the pickle, so when the cache
     file already exists ``df`` is not used at all.

     NOTE(review): ``pd.read_pickle`` unpickles arbitrary objects; the cache
     path should only ever point at files this code wrote itself.
     """
     cache_path = self.name
     cache_missing = not os.path.isfile(cache_path)
     if cache_missing:
         # First call for this path: build the cache.
         preprocess_text(df).to_pickle(cache_path)
     return pd.read_pickle(cache_path)