예제 #1
0
def transform_test_features(data, feature_vectorizer, training_feature_save,
                            sys_out, scaling_option):
    """Build a scaled feature matrix for test data in the training feature space.

    The test data has to be expressed with the same columns as the training
    matrix, so the features saved at training time are reloaded and the
    freshly extracted test features are mapped onto them.

    Args:
        data: Raw test input handed to ``util.feature_extraction``.
        feature_vectorizer: Vectorizer handed to ``util.feature_extraction``.
        training_feature_save: Location passed to
            ``create_training_features`` to rebuild the training features.
        sys_out: Output location forwarded to ``util.feature_extraction``.
        scaling_option: Scaling selector forwarded to ``util.feature_scale``.

    Returns:
        Whatever ``util.feature_scale`` returns for the mapped matrix.
    """
    logger.info("\n\nEXTRACTING TEST DATA FEATURS...")
    extracted = util.feature_extraction(data, feature_vectorizer, sys_out,
                                        logger)

    logger.info("\nFEATURE SELECTION ON TEST DATA...")
    # Rebuild the training-time feature layout first ...
    training_layout = create_training_features(training_feature_save)
    # ... then take the per-type test features (second element of the
    # extraction result) and project them onto that layout.
    test_features_by_type = extracted[1]
    mapped = map_to_trainingfeatures(training_layout, test_features_by_type)

    return util.feature_scale(scaling_option, mapped)
예제 #2
0
def ml_tag(tweets, feat_vectorizer, model, selected_features,
           hate_indicative_features, scaling, sysout, logger,
           solr_tags: SolrClient, core_name):
    """Classify tweets and compute a hate risk score for them.

    Args:
        tweets: Iterable of raw tweet texts.
        feat_vectorizer: Object exposing ``transform_inputs``.
        model: Fitted estimator exposing ``predict``.
        selected_features: Training-time features the inputs are mapped onto.
        hate_indicative_features: Forwarded to
            ``compute_hate_riskscore_by_features``.
        scaling: Scaling selector forwarded to ``util.feature_scale``.
        sysout: Output location forwarded to ``transform_inputs``.
        logger: Logger used for progress messages.
        solr_tags: Solr client forwarded to the risk-score computation.
        core_name: Solr core name forwarded to the risk-score computation.

    Returns:
        A ``(labels, scores)`` pair: the model predictions and the result of
        ``compute_hate_riskscore_by_features``.
    """
    started = datetime.datetime.now()
    logger.info("creating features and applying classification...{}".format(
        started))

    cleaned = []
    for tweet in tweets:
        cleaned.append(text_preprocess.preprocess_clean(tweet, True, True))

    vectorized = feat_vectorizer.transform_inputs(tweets, cleaned, sysout,
                                                  "na")
    # The second element of the vectorizer output is what gets aligned with
    # the training-time feature selection, and is reused for scoring below.
    features_by_type = vectorized[1]

    X_test_selected = util.feature_scale(
        scaling,
        ct.map_to_trainingfeatures(selected_features, features_by_type))
    labels = model.predict(X_test_selected)

    scoring_started = datetime.datetime.now()
    logger.info("computing hate risk scores... {}".format(scoring_started))
    scores = compute_hate_riskscore_by_features(X_test_selected,
                                                features_by_type,
                                                hate_indicative_features,
                                                solr_tags, core_name)

    return labels, scores
예제 #3
0
    def train_test(self):
        """Fit and evaluate every enabled classifier on the train/test data.

        Steps:
          1. Load the data and extract + scale training features.
          2. Optionally run feature selection, and persist the indices of the
             retained features to ``<sys_out>/fs/<fs_option>.csv``.
          3. Build the test matrix via ``self.transform_test_features``.
          4. For each classifier whose ``WITH_*`` flag is truthy, create it
             and hand it to ``self.traintest``.
        """
        self.load_data()
        meta_TRAIN = util.feature_extraction(self.raw_train.tweet, self.feat_v,
                                             self.sys_out, ec.logger)
        X_train = util.feature_scale(SCALING_STRATEGY, meta_TRAIN[0])
        y_train = self.raw_train['class'].astype(int)

        saved_feature_dir = self.sys_out + "/fs/"
        # Create the directory (and any missing parents) if needed. This
        # replaces a stat-then-mkdir guarded by a bare ``except``, which hid
        # unrelated errors and failed when the parent directory was absent.
        os.makedirs(saved_feature_dir, exist_ok=True)
        training_feature_save = saved_feature_dir + str(
            self.fs_option) + ".csv"

        select = cg.create_feature_selector(self.fs_option, False)[0]
        if select is not None:
            X_train = select.fit_transform(X_train, y_train)
            # get_support() is iterated as a per-feature mask; keep the
            # positions whose entry is truthy.
            feature_idx = [
                i for i, kept in enumerate(select.get_support()) if kept
            ]
        else:
            # No selector: every position of the first row is retained.
            feature_idx = list(range(len(X_train[0])))
        util.save_selected_features(feature_idx, meta_TRAIN[1],
                                    training_feature_save)
        ec.logger.info("FEATURE SELECTION={}, Shape={}".format(
            select, X_train.shape))

        # NOTE(review): arguments are forwarded in the original order
        # (scaling strategy, vectorizer, saved-feature path, output dir);
        # confirm this matches the method's signature.
        X_test = self.transform_test_features(SCALING_STRATEGY, self.feat_v,
                                              training_feature_save,
                                              self.sys_out)
        y_test = self.raw_test['class'].astype(int)

        def evaluate(algorithm):
            # Element 0 of create_classifier's result is what traintest
            # receives as the estimator. The trailing -1 and 300 are passed
            # through unchanged (meaning not visible here - TODO confirm).
            classifier = ct.create_classifier(algorithm, self.sys_out,
                                              self.task_name, -1, 300)
            self.traintest(classifier[0], X_train, y_train, X_test, y_test,
                           self.sys_out, algorithm, self.task_name)

        ######################### SGDClassifier #######################
        if WITH_SGD:
            evaluate('sgd')

        ######################### Stochastic Logistic Regression#######################
        if WITH_SLR:
            evaluate('lr')

        ######################### Random Forest Classifier #######################
        if WITH_RANDOM_FOREST:
            evaluate('rf')

        ###################  liblinear SVM ##############################
        if WITH_LIBLINEAR_SVM:
            evaluate('svm-l')

        ##################### RBF svm #####################
        if WITH_RBF_SVM:
            evaluate('svm-rbf')
예제 #4
0
    def gridsearch(self):
        """Run ``cl.learn_general`` for every enabled classifier.

        Features are extracted from ``self.raw_data.tweet`` (with
        ``self.cleaned_data.tweet`` passed alongside), split into train and
        test partitions with a fixed random seed, and each partition is
        scaled separately using ``SCALING_STRATEGY``. A completion timestamp
        is logged at the end.
        """
        extracted = util.feature_extraction(self.raw_data.tweet, self.feat_v,
                                            self.sys_out, ec.logger,
                                            self.cleaned_data.tweet)

        # The row index values are split together with the matrix and the
        # labels, so index_train / index_test line up with the partitions.
        partitions = train_test_split(extracted[0], self.raw_data['class'],
                                      list(self.raw_data.index.values),
                                      test_size=TEST_SPLIT_PERCENT,
                                      random_state=42)
        (X_train_data, X_test_data, y_train, y_test,
         index_train, index_test) = partitions

        X_train_data = util.feature_scale(SCALING_STRATEGY, X_train_data)
        X_test_data = util.feature_scale(SCALING_STRATEGY, X_test_data)
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)

        # The data-source column is only supplied when per-source scores are
        # requested; no tag filter is set in either case.
        if self.output_scores_per_ds:
            instance_data_source_column = pd.Series(self.raw_data.ds)
        else:
            instance_data_source_column = None
        accepted_ds_tags = None

        def run(algorithm, *row_indices):
            # row_indices, when given, are inserted between y_test and
            # sys_out exactly where the liblinear call placed them.
            cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name,
                             LOAD_MODEL_FROM_FILE, algorithm, extracted[1],
                             X_train_data, y_train, X_test_data, y_test,
                             *row_indices,
                             self.sys_out, self.cl_gridsearch, self.dr_option,
                             self.dr_gridsearch, self.fs_option,
                             self.fs_gridsearch, instance_data_source_column,
                             accepted_ds_tags)

        ######################### SGDClassifier #######################
        if WITH_SGD:
            run("sgd")

        ######################### Stochastic Logistic Regression#######################
        if WITH_SLR:
            run("lr")

        ######################### Random Forest Classifier #######################
        if WITH_RANDOM_FOREST:
            run("rf")

        ###################  liblinear SVM ##############################
        # NOTE(review): only this branch passes index_train / index_test, so
        # its positional argument list is two longer than the others -
        # confirm against learn_general's signature which form is intended.
        if WITH_LIBLINEAR_SVM:
            run("svml", index_train, index_test)

        ##################### RBF svm #####################
        if WITH_RBF_SVM:
            run("svmrbf")

        ec.logger.info("complete, {}".format(datetime.datetime.now()))
예제 #5
0
    def gridsearch_with_selectedfeatures(self, intersecton_only, *files):
        """Run ``cg.learn_general`` on data mapped to pre-selected features.

        Args:
            intersecton_only: Forwarded to ``util.read_preselected_features``
                together with ``files``.
            *files: Files from which the pre-selected features are read.

        The extracted training matrix is wrapped in a DataFrame and split so
        that the row labels of each partition are available; those labels are
        then passed to ``ct.map_to_trainingfeatures`` to build the train and
        test matrices in the pre-selected feature space.
        """
        selected_features = util.read_preselected_features(
            intersecton_only, *files)
        self.load_data()
        extracted = util.feature_extraction(self.raw_train.tweet, self.feat_v,
                                            self.sys_out, ec.logger)

        # Only the .index of each partition is used below.
        frame = pd.DataFrame(extracted[0])
        train_part, test_part, y_train, y_test = train_test_split(
            frame, self.raw_train['class'],
            test_size=cgm.TEST_SPLIT_PERCENT,
            random_state=42)
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)

        # Per-source reporting is optional; when enabled the tags "c" and
        # "td" are passed along with the data-source column.
        if self.output_scores_per_ds:
            instance_data_source_column = pd.Series(self.raw_train.ds)
            accepted_ds_tags = ["c", "td"]
        else:
            instance_data_source_column = None
            accepted_ds_tags = None

        ec.logger.info("TRANSFORM TRAINING DATA TO PRE-SELECTED FEATURE SPACE")
        X_train_selected = util.feature_scale(
            SCALING_STRATEGY,
            ct.map_to_trainingfeatures(selected_features, extracted[1],
                                       train_part.index))
        ec.logger.info(X_train_selected.shape)

        ec.logger.info("TRANSFORM TESTING DATA TO PRE-SELECTED FEATURE SPACE")
        X_test_selected = util.feature_scale(
            SCALING_STRATEGY,
            ct.map_to_trainingfeatures(selected_features, extracted[1],
                                       test_part.index))
        ec.logger.info(X_test_selected.shape)

        def run(algorithm):
            # The trailing literals (False, -1, False, 99, False) are passed
            # through unchanged from the original calls; their meaning is
            # defined by learn_general - TODO confirm.
            cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION,
                             self.task_name, LOAD_MODEL_FROM_FILE, algorithm,
                             extracted[1], X_train_selected, y_train,
                             X_test_selected, y_test, self.identifier,
                             self.sys_out, False, -1, False, 99, False,
                             instance_data_source_column, accepted_ds_tags)

        ######################### SGDClassifier #######################
        if WITH_SGD:
            run("sgd")

        ######################### Stochastic Logistic Regression#######################
        if WITH_SLR:
            run("lr")

        ######################### Random Forest Classifier #######################
        if WITH_RANDOM_FOREST:
            run("rf")

        ###################  liblinear SVM ##############################
        if WITH_LIBLINEAR_SVM:
            run("svml")

        ##################### RBF svm #####################
        if WITH_RBF_SVM:
            run("svmrbf")