Example #1
import os
import logging

logger = logging.getLogger(__name__)

# loadData, get_similarity_matrix, get_cluster_kmeans, pca_reduction and
# scatter_clusters are project-local helpers (not shown in this snippet).


def main(num_clusters=6, dfFileName="../data/EFDataFrame_sample=0.01.pk"):
    logger.info('Start')

    currentFileDir = os.path.abspath(os.path.dirname(__file__))
    dfFilePath = os.path.join(currentFileDir, dfFileName)
    label = 'group'

    #Load the dataframe stored at the given file path
    logger.info('load data')
    efdata = loadData(dfFilePath)
    logger.info('Number of elements: %d', efdata.shape[0])

    #Extract Features
    logger.info('extract features')
    (similarity_matrix, tfidf_matrix) = get_similarity_matrix(efdata['text'])
    #_____________________________________________________________________
    #______________________________KMEANS_________________________________
    #Perform k-means

    logger.info('------ K-means : %d--------', num_clusters)
    titles = efdata[label]
    km_clusters = get_cluster_kmeans(tfidf_matrix, num_clusters, titles)
    #
    logger.info('------ Dimensions reduction --------')
    x_pos, y_pos = pca_reduction(similarity_matrix, 10)

    #res = efdata.set_index('topic_id')['topic'].to_dict()
    #res2 = efdata.set_index('level')['topic'].to_dict()
    logger.info('plot')
    figName = '../figure/clustering_experiment_%s_isSample=True.pdf' % (label)
    figFilePath = os.path.join(currentFileDir, figName)
    scatter_clusters(x_pos, y_pos, km_clusters, titles,
                     figFilePath)  # Scatter K-means with PCA
    logger.info('End')
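
The helper pca_reduction above is not shown in this snippet; a minimal sketch of
the kind of 2-D reduction it presumably performs, using scikit-learn PCA (the
name and behaviour here are assumptions, not the original helper):

from sklearn.decomposition import PCA

def pca_reduction_sketch(similarity_matrix):
    # Project each row of the square similarity matrix onto its top two
    # principal axes to get scatter-plot coordinates.
    coords = PCA(n_components=2).fit_transform(similarity_matrix)
    return coords[:, 0], coords[:, 1]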
Example #2
stocks = ['FB', 'TSLA', 'BAC']

# To download new Data
#data = yp.securityData(stocks, end='2010-01-01', save=True, epoch=False)
"""
2)  Next we want to turn that data into our deisred format
    There are many different ways to do this, to each their own
    Here we will be using basic python data structures
        (pandas is best practice and runs faster, but can be kindof confusing if not in the dataframe headspace)

    The end goal, is chunks of 10 day increments of closing stock price, to predict if tomorrow will go up or down
"""

# To read in existing data - one can also use pd.read_csv() for each individual CSV.
#data = yp.readExisting(stocks, end='2011-01-01')
data = bk.loadData(stocks)


def visualize_data(data):
    #print(type(data)) # dictionary
    #print(data.keys()) # see keys = ['BAC', 'FB', 'TSLA']

    for stock in data.keys():
        #print(type(data[stock])) # see all data is in the format of list
        #print(data[stock]) # see the data format is list of lists
        print(data[stock][0])
        close = data[stock][0][4]
        print(stock, "close:", close)


#visualize_data(data)
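
A minimal sketch (not from the original) of the 10-day windowing the docstring
describes, assuming each row of data[stock] is a list with the closing price at
index 4, as in visualize_data above:

def make_windows(rows, window=10):
    # Build overlapping chunks of `window` consecutive closes, plus an
    # up/down (1/0) label for the day after each chunk.
    closes = [float(r[4]) for r in rows]
    X, y = [], []
    for i in range(len(closes) - window):
        X.append(closes[i:i + window])
        y.append(1 if closes[i + window] > closes[i + window - 1] else 0)
    return X, y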
Example #3
def crossValidationExperimentWithDF(classLabel,
                                    isSample,
                                    sampleSize,
                                    cvType,
                                    cvFold,
                                    classifierName,
                                    featureVecName,
                                    dfFileName="../data/EFDataFrame.pk"):
    """
    Perfom cross-validation for a classification experiment, given dataframe representing the dataset (to classify).
    Two option are provided. Perform cross validation to find the right features and/or the right classifier.
    
    Parameters :
        classLabel : string
            The class Label, group_id or level_id
        isSample : boolean
            Indicate if the subsample of the dataset should be used for the classification
        sampleSize : float
            size of the sample dataset if it applies
        cvType : string
            One can perform cross-validation to feature selection development of for classification algorithm development.
        cvFold: int
            Number of folder for cross-validation
       classifierName : string
            Name of the classifier to use for the classifier : (naive bayes, logistic regression, knn ...)
            If perfoming 'on-feature' type of cross-validation, one needs to provide a classifier to cross-validate on.
        featureVecName : string
            The type of features Vector to use, tfidfVect, countVec, or customedfeatureVec.
            If perfoming 'on-classifier' type of cross-validation, one needs to provide a type of featureVec to cross-validate on.
        dFileName : string
            relative Path to the stored dataframe representing the whole dataset.
    Returns : 
        None
    """

    figName = '../figure/%d-Fold_cvExperiment_%s_%s_isSample=%r.pdf' % (
        cvFold, cvType, classLabel, isSample)
    figFilePath = os.path.join(currentFileDir, figName)
    logger.info('%s - load data...', classLabel)
    #Sampling
    logger.info('Data Sampling - %.2f percent of data', sampleSize * 100)
    if isSample:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        sampleDfFileName = renameFileName(dfFileName,
                                          'sample=%.2f' % sampleSize)
        sampleDfFilePath = os.path.join(currentFileDir, sampleDfFileName)
        efdata = sampling(sampleSize, dfFilePath, sampleDfFilePath)
        logger.info('Sampled Data file is at the location - %s',
                    sampleDfFilePath)

    else:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        efdata = loadData('text', classLabel, dfFilePath)

    logger.info('Number of writings in working data : %r', efdata.shape[0])

    #Train-test split
    logger.info('Train-test split : 80-20')
    xtrain_df, xtest_df, ytrain_df, ytest_df = train_test_split(
        efdata['text'], efdata[classLabel], random_state=0, test_size=0.2)

    #feature
    logger.info('Feature - %s ', featureVecName)
    featureVecFunction = createFeaturesVec[featureVecName]
    xtrain_vec, featureVec = featureVecFunction(efdata['text'], xtrain_df)
    xtest_vec = featureVec.transform(xtest_df)

    logger.info('Cross Validation - %s ...', cvType)
    if cvType == 'on-feature':
        #Cross Validation - Features Selection
        logger.info('Comparing Different Feature-Vectors')
        compareFeatures(classifierName, efdata, xtrain_df, ytrain_df, xtest_df,
                        cvFold)
    else:
        #Cross Validation - Algo Selection
        logger.info('Comparing Different Classifiers')
        scoring = 'all'
        compareAlgo(xtrain_vec, ytrain_df, cvFold, scoring, figFilePath)

    logger.info('End-Processing')
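
A hypothetical invocation (argument values are assumptions, not from the
original): cross-validate the choice of classifier on a 10% sample, with
tf-idf features fixed:

crossValidationExperimentWithDF(classLabel='group_id', isSample=True,
                                sampleSize=0.1, cvType='on-classifier',
                                cvFold=5, classifierName='naive bayes',
                                featureVecName='tfidfVect')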
Example #4
def experimentWithDF(classLabel,
                     classifierName,
                     featureVecName,
                     isSample,
                     sampleSize=0.1,
                     dfFileName="../data/EFDataFrame.pk"):
    """
        Perfom classification experiment for a given dataframe representing the dataset (to classify).
        
        Parameters :
            classLabel : string
                The class Label, group_id or level_id
            classifierName : string
                Name of the classifier to use for the classifier : (naive bayes, logistic regression, knn ...)
            featureVecName : string
                The type of features Vector to use, tfidfVect, countVec, or customedfeatureVec.
            isSample : boolean
                Indicate if the subsample of the dataset should be used for the classification
            sampleSize : float
                size of the sample dataset if it applies
            dFileName : string
                relative Path to the stored dataframe representing the whole dataset.
        Returns : None
    """
    logger.info("Start: Logger file info here : %r", logFile)
    figName = '../figure/experiment_%s_isSample=%r.pdf' % (classLabel,
                                                           isSample)
    figFilePath = os.path.join(currentFileDir, figName)
    logger.info('%s - load data...', classLabel)

    #Sampling
    logger.info('Data Sampling - %.2f percent of data', sampleSize * 100)
    if isSample:
        dfFilePath = os.path.join(currentFileDir, dfFileName)

        sampleDfFileName = renameFileName(dfFileName,
                                          'sample=%.2f' % sampleSize)
        sampleDfFilePath = os.path.join(currentFileDir, sampleDfFileName)
        efdata = sampling(sampleSize, dfFilePath, sampleDfFilePath)
        logger.info('Sampled Data file is at the location - %s',
                    sampleDfFilePath)

    else:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        efdata = loadData('text', classLabel, dfFilePath)

    logger.info('Number of writings in working data : %r', efdata.shape[0])

    #Train-test split
    logger.info('train-test split...')
    xtrain_df, xtest_df, ytrain_df, ytest_df = train_test_split(
        efdata['text'], efdata[classLabel], random_state=0, test_size=0.2)

    #feature
    logger.info('features computation with %s ...', featureVecName)
    featureVecFunction = createFeaturesVec[featureVecName]
    xtrain_vec, featureVec = featureVecFunction(efdata['text'], xtrain_df)
    xtest_vec = featureVec.transform(xtest_df)

    #Test Algo
    logger.info('Test\n___________________________')
    testAlgo(classifierName, xtrain_vec, ytrain_df, xtest_vec, ytest_df,
             figFilePath)

    logger.info('END processing on dataframe')
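
createFeaturesVec is used in both experiments above as a name-to-factory
dispatch but is not shown; a minimal sketch of what it might look like with
scikit-learn vectorizers (an assumption, not the project's implementation):

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def _tfidf_vec(corpus, xtrain_df):
    # Fit the vocabulary on the full corpus, then vectorize the training split.
    vec = TfidfVectorizer().fit(corpus)
    return vec.transform(xtrain_df), vec

def _count_vec(corpus, xtrain_df):
    vec = CountVectorizer().fit(corpus)
    return vec.transform(xtrain_df), vec

createFeaturesVec = {'tfidfVect': _tfidf_vec, 'countVec': _count_vec}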
Example #5
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# numeric_transformer, feedback_feature_transformer, other_cat_transformer
# and the corresponding column lists are defined earlier (not shown here).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('feed_col', feedback_feature_transformer, feedback_features),
    ('other_cat_col', other_cat_transformer, other_cat_cols),
])

from sklearn.ensemble import RandomForestClassifier
#Adding into Pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',
                       RandomForestClassifier(bootstrap=True,
                                              max_depth=30,
                                              max_features='sqrt',  # 'auto' is deprecated; 'sqrt' is the classifier equivalent
                                              min_samples_leaf=1,
                                              n_estimators=100))])

data = pre.loadData()
#Getting X and y
X1 = data.drop(['Satisfaction', 'ArrivalDelayin_Mins'], axis=1)
y1 = pd.get_dummies(data['Satisfaction'])
#Data split
from sklearn.model_selection import train_test_split  # Splitting the data for training and testing out model

X_trains, X_tests, y_trains, y_tests = train_test_split(X1,
                                                        y1,
                                                        random_state=1,
                                                        stratify=y1)
#Fitting Pipeline
clf.fit(X_trains, y_trains)
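
A minimal follow-up sketch (not in the original): score the fitted pipeline on
the held-out split.

# Mean accuracy of the pipeline on the test split.
print("held-out accuracy:", clf.score(X_tests, y_tests))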
Example #6
            print("\n\nCurrent configuration:  Task " + task + ";  Model " + model_name + ";  Subject", str(subject))
            print("Cascade: detecion + classification")
            # get test set
            if subject == 23:
                X_test = preprocessing.loadDataMultiple(label=label,
                                                        folder=data_folder,
                                                        window_size=window_size,
                                                        stride=stride,
                                                        make_binary=False,
                                                        null_class=True,
                                                        print_info=print_info)[2]
            else:
                X_test = preprocessing.loadData(subject=subject,
                                                label=label,
                                                folder=data_folder,
                                                window_size=window_size,
                                                stride=stride,
                                                make_binary=False,
                                                null_class=True,
                                                print_info=print_info)[2]
            # mask
            mask = (Y_pred_ad == 1)
            activity_windows = X_test[mask, :, :]
            if model_name == "Convolutional2DRecurrent":
                activity_windows = activity_windows.reshape(activity_windows.shape[0], window_size, X_test.shape[2], 1)
            Y_casc_ac = model.predict_classes(activity_windows) + 1  # last model saved is "activity classification"
            Y_casc = Y_pred_ad.copy()  # copy so the detector output is not mutated in place
            Y_casc[mask] = Y_casc_ac
            score_casc = f1_score(Y_true, Y_casc, average='weighted')
            print("Two-Steps results:\n", classification_report(Y_true, Y_casc))

            # store results as text
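
A standalone sketch (with made-up arrays) of the cascade merge above: windows
the binary detector flags as active are re-labelled by the activity classifier,
while null windows keep label 0.

import numpy as np

Y_pred_ad = np.array([0, 1, 1, 0, 1])  # detector output: 0 = null, 1 = activity
Y_casc_ac = np.array([3, 5, 2])        # classifier labels for the 3 active windows
Y_casc = Y_pred_ad.copy()
Y_casc[Y_pred_ad == 1] = Y_casc_ac
print(Y_casc)                          # [0 3 5 0 2]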
Example #7
    # Map the coin denomination to its one-hot index.
    COIN_CATEGORIES = ['Quinarius', 'Denarius', 'As', 'Aureus',
                       'Quinarius aureus', 'Dupondius', 'Quadrans',
                       'Sestertius', 'Semis', 'Cistophorus', 'Drachma',
                       'Didrachm', 'Hemidrachm']
    i = COIN_CATEGORIES.index(cat) if cat in COIN_CATEGORIES else None

    if i is not None:
        onehot[i] = 1

    return onehot


if __name__ == '__main__':

    cnn = CNN()
    props, imgs = loadData('dataframe.csv', './images')

    cnn.setMintModel()

    # input_x/input_y were undefined in the original snippet; assuming the
    # loaded images are the inputs and props the targets.
    train(cnn, imgs, props, 10000, 100, './')

Example #8
# Keras imports assumed by this snippet; preprocessing and models are
# project-local modules (not shown here).
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.utils import to_categorical


def cascade_classification(subject,
                           task,
                           model_name,
                           data_folder,
                           window_size=15,
                           stride=5,
                           epochs=15,
                           batch_size=32,
                           balance_classes=False,
                           GPU=False,
                           print_info=False):

    # preprocessing
    if task == "A":
        label = 0
    elif task == "B":
        label = 6
    else:
        print("Error: invalid task.")
    if subject == 23:
        X_train, Y_train, X_test, Y_test, n_features, n_classes = preprocessing.loadDataMultiple(
            label=label,
            folder=data_folder,
            window_size=window_size,
            stride=stride,
            make_binary=False,
            null_class=False,
            print_info=print_info)
    else:
        X_train, Y_train, X_test, Y_test, n_features, n_classes = preprocessing.loadData(
            subject=subject,
            label=label,
            folder=data_folder,
            window_size=window_size,
            stride=stride,
            make_binary=False,
            null_class=False,
            print_info=print_info)

    # model
    if model_name == "Convolutional":
        model = models.Convolutional((window_size, n_features),
                                     n_classes,
                                     print_info=print_info)
    elif model_name == "Convolutional1DRecurrent":
        model = models.Convolutional1DRecurrent((window_size, n_features),
                                                n_classes,
                                                GPU=GPU,
                                                print_info=print_info)
    elif model_name == "Convolutional2DRecurrent":
        model = models.Convolutional2DRecurrent((window_size, n_features, 1),
                                                n_classes,
                                                GPU=GPU,
                                                print_info=print_info)
        # reshaping for 2D convolutional model
        X_train = X_train.reshape(X_train.shape[0], window_size, n_features, 1)
        X_test = X_test.reshape(X_test.shape[0], window_size, n_features, 1)
    elif model_name == "ConvolutionalDeepRecurrent":
        model = models.ConvolutionalDeepRecurrent((window_size, n_features),
                                                  n_classes,
                                                  GPU=GPU,
                                                  print_info=print_info)
    else:
        raise ValueError("Model not found: " + model_name)  # model would otherwise be undefined below
    model.compile(optimizer=Adam(lr=0.001),
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    save_model_name = task + "_" + model_name + "_TSC_" + str(subject)
    filepath = './data/models/' + save_model_name + '.hdf5'
    print("Model:", save_model_name, "\nLocation:", filepath, "\n")

    # training
    checkpointer = ModelCheckpoint(filepath=filepath,
                                   verbose=1,
                                   save_best_only=True)
    lr_reducer = ReduceLROnPlateau(factor=0.1,
                                   patience=5,
                                   min_lr=0.00001,
                                   verbose=1)
    model.fit(x=X_train,
              y=to_categorical(Y_train),
              epochs=epochs,
              batch_size=batch_size,
              verbose=1,
              validation_data=(X_test, to_categorical(Y_test)),
              callbacks=[checkpointer, lr_reducer])

    return model, X_test, Y_test, filepath, save_model_name
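
A hypothetical call (values are assumptions, not from the original): train the
cascade's classification stage for subject 1 on task "A" with the plain
convolutional model.

model, X_test, Y_test, filepath, name = cascade_classification(
    subject=1, task="A", model_name="Convolutional",
    data_folder="./data/", epochs=15, batch_size=32)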