Exemplo n.º 1
0
    def __init__(self):
        """Run the cleanup pipeline: drop sparse columns, then fill nulls."""
        pipeline = preprocessing_data()
        # Each drop_col call returns the updated frame; keep the last result.
        train_data = None
        for column in ('Alley', 'PoolQC', 'Fence', 'MiscFeature'):
            train_data = pipeline.drop_col(column)
        pipeline.fill_null_values()
        print(train_data.columns)
Exemplo n.º 2
0
def test_train_Multi(X,Y):
    '''
    Split the data into train and test with an 80/20 ratio, transform each
    part with the preprocessing_data class, then separate gate and net
    features into their own dataframes.

    Input: X and Y dataframe after using data_generating class.
    X shape: (number of gate-net,49 unprocessed features)
    Y shape: (number of labels,)
    Return dataframes: gate_train,gate_test,net_train,net_test,Y_train,Y_test
    gate_train shape: (number of gate-net*0.8,17 processed features)
    gate_test shape:(number of gate-net*0.2,17 processed features)
    net_train shape:(number of gate-net*0.8,24 processed features)
    net_test shape:(number of gate-net*0.2,24 processed features)
    '''
    label_split = int(len(Y) * 0.8)
    row_split = label_split * 10  # presumably 10 gate-net rows per label — confirm
    X_train, X_test = X[:row_split], X[row_split:]
    Y_train, Y_test = Y[:label_split], Y[label_split:]
    preprocess = preprocessing_data(X, 0.8)

    def _transform(frame):
        # The same four-step chain is applied to train and test alike.
        frame = preprocess.filter_columns(frame)
        frame = preprocess.normalise(frame)
        frame = preprocess.categorical(frame)
        return preprocess.location_transform(frame)

    X_train = _transform(X_train)
    X_test = _transform(X_test)
    # Reindex the test partitions: features start at 1, labels at 0.
    X_test.index = list(range(1, len(X_test) + 1))
    Y_test.index = list(range(len(Y_test)))
    Y_train = generating_tensor_Y(Y_train)
    Y_test = generating_tensor_Y(Y_test)
    # Gate features occupy the first 8 and last 9 columns; net features the middle.
    gate_train = pd.concat([X_train.iloc[:, :8], X_train.iloc[:, -9:]], axis=1)
    net_train = X_train.iloc[:, 8:-9]
    gate_test = pd.concat([X_test.iloc[:, :8], X_test.iloc[:, -9:]], axis=1)
    net_test = X_test.iloc[:, 8:-9]
    return gate_train, gate_test, net_train, net_test, Y_train, Y_test
Exemplo n.º 3
0
def test_train(X,Y):
    '''
    Split the data 80/20 into train/test, transform both parts with the
    preprocessing_data class, and convert them to tensors.

    Input: X and Y dataframe after using data_generating class.
    X shape: (number of gate-net,49 unprocessed features)
    Y shape: (number of labels,)
    Return X_train,Y_train,X_test,Y_test: The splitted train/test tensors
    X_train shape (number of gate-net*0.8,41 processed features)
    Y_train shape (number of labels*0.8,)
    X_test shape  (number of gate-net*0.2,41 processed features)
    Y_test shape (number of labels*0.2,)
    '''
    n_labels_train = int(len(Y) * 0.8)
    n_rows_train = n_labels_train * 10  # presumably 10 gate-net rows per label — confirm
    X_train, X_test = X[:n_rows_train], X[n_rows_train:]
    Y_train, Y_test = Y[:n_labels_train], Y[n_labels_train:]
    preprocess = preprocessing_data(X, 0.8)

    def _prep(frame):
        # Identical transformation chain for both partitions.
        frame = preprocess.filter_columns(frame)
        frame = preprocess.normalise(frame)
        frame = preprocess.categorical(frame)
        return preprocess.location_transform(frame)

    X_train = _prep(X_train)
    X_test = _prep(X_test)
    # Reindex the test partitions: features start at 1, labels at 0.
    X_test.index = list(range(1, len(X_test) + 1))
    Y_test.index = list(range(len(Y_test)))
    X_train = generating_tensor_X(X_train)
    Y_train = generating_tensor_Y(Y_train)
    X_test = generating_tensor_X_test(X_test)
    Y_test = generating_tensor_Y(Y_test)
    return X_train, Y_train, X_test, Y_test
                        "2 - exit \n"

        # Read the user's menu choice; raises ValueError on non-numeric input.
        option = int(input(menu_message))
        # data should be preprocessed and saved to disk
        # with this enabled, reading from disk will take time
        load_from_disk = False

        # NOTE(review): 'or' makes this condition true for every integer;
        # the intent is almost certainly 'option >= 0 and option <= 2'.
        if option >= 0 or option <= 2:
            load_target_column()
            if option == 0:
                print('Classifying started!')
                if load_from_disk:
                    # TODO: fix this if someone wants to read data from disk
                    preprocessed_data = load_preprocessed_data_from_disk()
                else:
                    preprocessed_data = preprocessing_data(False) # return one bag of words
                initialize_data(preprocessed_data)
                classifying()
            elif option == 1:
                print('Ranking started!')
                preprocessed_data = preprocessing_data(True) # return two bag of words

                '''
                    set index 0, for testing purposing of each method increase this index and comment other preprocessing methods
                    because of we don't want to load all data in RAM
                '''
                # Rank using only the first preprocessed pair to limit RAM use.
                start_ranking(preprocessed_data[0][0], preprocessed_data[0][1])

            correct_input = True
        else:
            print('Incorrect input')
Exemplo n.º 5
0
 # X.index = [i for i in range(len(X))]
 # X = X.drop(X.index[0])
 # X.to_pickle("Data.pkl")
 # Y = pd.DataFrame(Y_init)
 # Y.to_pickle("label724")
 ##### save the initial data ##############
 # Reload the feature matrix and labels pickled by the commented-out code above.
 X = pd.read_pickle('Data.pkl')
 Y = pd.read_pickle("label724")
 ## train test split
 # 80/20 split; X's split index is 10x Y's — presumably each label
 # corresponds to 10 gate-net rows in X. TODO confirm against data_generating.
 split_Y = int(len(Y) * 0.8)
 split = split_Y * 10
 X_train = X[:split]
 X_test = X[split:]
 Y_train = Y[:split_Y]
 Y_test = Y[split_Y:]
 # Build the preprocessor from the full X, then apply the same
 # four-step chain to the train and test partitions.
 preprocess = preprocessing_data(X, 0.8)
 X_train = preprocess.filter_columns(X_train)
 X_train = preprocess.normalise(X_train)
 X_train = preprocess.categorical(X_train)
 X_train = preprocess.location_transform(X_train)
 X_test = preprocess.filter_columns(X_test)
 X_test = preprocess.normalise(X_test)
 X_test = preprocess.categorical(X_test)
 X_test = preprocess.location_transform(X_test)
 # Reindex the test partitions: features start at 1, labels at 0.
 X_test.index = [i for i in range(1, len(X_test) + 1)]
 Y_test.index = [i for i in range(len(Y_test))]
 # Convert to the tensor formats consumed by the SVM training code.
 SVM_X_Train = generating_tensor_X_SVM(X_train)
 SVM_X_Test = generating_tensor_X_SVM(X_test)
 Y_train = generating_tensor_Y(Y_train)
 Y_test = generating_tensor_Y(Y_test)
 parameters = {
Exemplo n.º 6
0
                        default="softmax",
                        help="activate function")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="display each epoch on training")
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    try:
        df = pd.read_csv(args.dataset, header=None)
    except Exception as e:
        # NOTE(review): print() returns None, so sys.exit(None) exits with
        # status 0 even on failure; pass the formatted message to sys.exit
        # directly to signal an error exit code.
        sys.exit(print("{}: {}".format(type(e).__name__, e)))
    # Split the raw dataframe into train/test features and labels.
    X_train, X_test, Y_train, Y_test = preprocessing_data(df, args.activate)
    if args.model == "train":
        num_iterations = 56000
        learning_rate = 0.007
        # Final layer width depends on the activation: 1 output unit for
        # sigmoid, 2 otherwise (presumably softmax — see the argparse default).
        layers_dims = [X_train.shape[0], 40, 20, 10, 5, 1
                       ] if args.activate == "sigmoid" else [
                           X_train.shape[0], 40, 20, 10, 5, 2
                       ]
        parameters = L_layer_model(X_train,
                                   Y_train,
                                   X_test,
                                   Y_test,
                                   layers_dims,
                                   learning_rate,
                                   num_iterations,
                                   args.activate,
Exemplo n.º 7
0
def preprocessing():
    """Preprocess the scraped news data.

    Runs the preprocessing_data pipeline for its side effects and returns a
    (JSON body, HTTP status) pair for the web framework.
    """
    preprocessing_data()
    # Fixed typo in the response message ("succefuly" -> "successfully").
    return jsonify("news preprocessed successfully"), 200
Exemplo n.º 8
0
import extraction
import preprocessing
import string_similarity
"""
The following code executes the contracting chain script and returns 
a CSV with the contracting chain for all interadministrative contracts of INVIAS
"""

#Extraction
# Pull the raw entity table and municipality-name table from the source data.
df_entity_raw, df_names_raw = extraction.extracting_data()
# Preprocessing
df_entity_clean, names_mun_clean = preprocessing.preprocessing_data(
    df_entity_raw, df_names_raw)

# Small sample of municipality names used to exercise the chain builder;
# swap in names_mun_clean (commented below) to run over the full set.
test_names = [
    "HUILA - ALCALDÍA MUNICIPIO DE NEIVA",
    "SANTANDER - ALCALDÍA MUNICIPIO DE BUCARAMANGA",
    "VALLE DEL CAUCA - ALCALDÍA MUNICIPIO DE PALMIRA"
]
# test_names = names_mun_clean
# Chain construction
# NOTE(review): the meaning of the literal 3 is not visible here —
# presumably a match count or similarity threshold; confirm in string_similarity.
chain = string_similarity.contracting_chain(test_names, 3, df_entity_clean)

# Printing csv
chain.to_csv('contracting_chain.csv')
def preprocessing():
    """Run the news preprocessing pipeline and return a JSON confirmation."""
    preprocessing_data()
    response_body = jsonify("Les nouvelles traitées !!")
    return response_body, 200