def transform_test_features(data, feature_vectorizer, training_feature_save, sys_out, scaling_option):
    # Test data must be represented in a feature matrix with the same dimensions as the
    # training data feature matrix.

    # step 1: reconstruct the (empty) training feature space using the vocabularies seen at training time
    logger.info("\n\nEXTRACTING TEST DATA FEATURES...")
    meta_TEST = util.feature_extraction(data, feature_vectorizer, sys_out, logger)
    logger.info("\nFEATURE SELECTION ON TEST DATA...")
    train_features = create_training_features(training_feature_save)

    # step 2: create test data features
    M_features_by_type = meta_TEST[1]

    # step 3: map test data features to training data features and populate the feature matrix
    featurematrix = map_to_trainingfeatures(train_features, M_features_by_type)
    featurematrix = util.feature_scale(scaling_option, featurematrix)
    return featurematrix
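# The helper below is a minimal, self-contained sketch of the kind of alignment that
# map_to_trainingfeatures performs: projecting a test feature matrix onto the feature
# space saved at training time. It assumes vocabularies are dicts mapping feature name
# to column index and that matrices are dense numpy arrays; the real helper in this
# project may use a different layout, so treat this purely as an illustration.
import numpy as np

def _align_to_training_space(test_matrix, test_vocab, train_vocab):
    """Project a test feature matrix onto the training feature space.

    Columns for features unseen at training time are dropped; columns for training
    features absent from the test data are left as zeros.
    """
    aligned = np.zeros((test_matrix.shape[0], len(train_vocab)))
    for feature, test_col in test_vocab.items():
        train_col = train_vocab.get(feature)
        if train_col is not None:
            aligned[:, train_col] = test_matrix[:, test_col]
    return aligned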
def ml_tag(tweets, feat_vectorizer, model, selected_features, hate_indicative_features,
           scaling, sysout, logger, solr_tags: SolrClient, core_name):
    logger.info("creating features and applying classification...{}".format(datetime.datetime.now()))
    tweets_cleaned = [text_preprocess.preprocess_clean(x, True, True) for x in tweets]

    M = feat_vectorizer.transform_inputs(tweets, tweets_cleaned, sysout, "na")
    X_test_selected = ct.map_to_trainingfeatures(selected_features, M[1])
    X_test_selected = util.feature_scale(scaling, X_test_selected)
    labels = model.predict(X_test_selected)

    logger.info("computing hate risk scores... {}".format(datetime.datetime.now()))
    scores = compute_hate_riskscore_by_features(X_test_selected, M[1], hate_indicative_features,
                                                solr_tags, core_name)
    return labels, scores
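# Illustrative sketch only: compute_hate_riskscore_by_features is not shown in this file,
# and the real scorer also consults Solr tag statistics (solr_tags, core_name). The toy
# version below simply scores each instance by the fraction of hate-indicative features
# that fire for it, to make the intent of the call above concrete.
import numpy as np

def _riskscore_by_indicative_features(X, feature_vocab, indicative_features):
    """Score each row by the proportion of indicative features with a non-zero value."""
    cols = [feature_vocab[f] for f in indicative_features if f in feature_vocab]
    if not cols:
        return np.zeros(np.asarray(X).shape[0])
    return (np.asarray(X)[:, cols] > 0).sum(axis=1) / float(len(cols))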
def train_test(self):
    self.load_data()
    meta_TRAIN = util.feature_extraction(self.raw_train.tweet, self.feat_v, self.sys_out, ec.logger)
    X_train = meta_TRAIN[0]
    X_train = util.feature_scale(SCALING_STRATEGY, X_train)
    y_train = self.raw_train['class'].astype(int)

    # create the folder for saving selected feature indices, if it does not exist yet
    saved_feature_dir = self.sys_out + "/fs/"
    try:
        os.stat(saved_feature_dir)
    except OSError:
        os.mkdir(saved_feature_dir)
    training_feature_save = saved_feature_dir + str(self.fs_option) + ".csv"

    select = cg.create_feature_selector(self.fs_option, False)[0]
    feature_selected = select is not None
    if feature_selected:
        X_train = select.fit_transform(X_train, y_train)
        # convert the boolean support mask into a list of retained column indices
        feature_idx_stat = select.get_support()
        feature_idx = []
        for i, item in enumerate(feature_idx_stat):
            if item:
                feature_idx.append(i)
        util.save_selected_features(feature_idx, meta_TRAIN[1], training_feature_save)
    else:
        # no selection: keep every column
        feature_idx = []
        for i in range(0, len(X_train[0])):
            feature_idx.append(i)
        util.save_selected_features(feature_idx, meta_TRAIN[1], training_feature_save)
    ec.logger.info("FEATURE SELECTION={}, Shape={}".format(select, X_train.shape))

    X_test = self.transform_test_features(SCALING_STRATEGY, self.feat_v, training_feature_save, self.sys_out)
    y_test = self.raw_test['class'].astype(int)

    ######################### SGDClassifier #######################
    if WITH_SGD:
        classifier = ct.create_classifier('sgd', self.sys_out, self.task_name, -1, 300)
        self.traintest(classifier[0], X_train, y_train, X_test, y_test, self.sys_out, 'sgd', self.task_name)

    ######################### Stochastic Logistic Regression #######################
    if WITH_SLR:
        classifier = ct.create_classifier('lr', self.sys_out, self.task_name, -1, 300)
        self.traintest(classifier[0], X_train, y_train, X_test, y_test, self.sys_out, 'lr', self.task_name)

    ######################### Random Forest Classifier #######################
    if WITH_RANDOM_FOREST:
        classifier = ct.create_classifier('rf', self.sys_out, self.task_name, -1, 300)
        self.traintest(classifier[0], X_train, y_train, X_test, y_test, self.sys_out, 'rf', self.task_name)

    ################### liblinear SVM ##############################
    if WITH_LIBLINEAR_SVM:
        classifier = ct.create_classifier('svm-l', self.sys_out, self.task_name, -1, 300)
        self.traintest(classifier[0], X_train, y_train, X_test, y_test, self.sys_out, 'svm-l', self.task_name)

    ##################### RBF SVM #####################
    if WITH_RBF_SVM:
        classifier = ct.create_classifier('svm-rbf', self.sys_out, self.task_name, -1, 300)
        self.traintest(classifier[0], X_train, y_train, X_test, y_test, self.sys_out, 'svm-rbf', self.task_name)
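# Illustrative sketch of a feature-selector factory. The real cg.create_feature_selector
# returns a tuple (hence the [0] indexing above) and its mapping of fs_option values is
# project-specific; the options below are assumptions chosen to show why the caller checks
# for None and later calls fit_transform / get_support.
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2
from sklearn.svm import LinearSVC

def _example_feature_selector(fs_option):
    if fs_option == 0:
        return SelectKBest(chi2, k=1000)                             # keep top-k features by chi-squared score
    if fs_option == 1:
        return SelectFromModel(LinearSVC(penalty="l1", dual=False))  # keep features with non-zero L1 weights
    return None                                                      # no feature selection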
def gridsearch(self):
    meta_M = util.feature_extraction(self.raw_data.tweet, self.feat_v, self.sys_out,
                                     ec.logger, self.cleaned_data.tweet)
    M = meta_M[0]
    # M = self.feature_scale(M)

    # split the dataset into two parts, 0.75 for training and 0.25 for testing
    X_train_data, X_test_data, y_train, y_test, index_train, index_test = \
        train_test_split(M, self.raw_data['class'],
                         list(self.raw_data.index.values),
                         test_size=TEST_SPLIT_PERCENT,
                         random_state=42)
    X_train_data = util.feature_scale(SCALING_STRATEGY, X_train_data)
    X_test_data = util.feature_scale(SCALING_STRATEGY, X_test_data)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    instance_data_source_column = None
    accepted_ds_tags = None
    if self.output_scores_per_ds:
        instance_data_source_column = pd.Series(self.raw_data.ds)
        accepted_ds_tags = None

    # if not self.feature_selection:
    #     print("APPLYING FEATURE SCALING: [%s]" % SCALING_STRATEGY)
    #     if SCALING_STRATEGY == SCALING_STRATEGY_MEAN_STD:
    #         X_train_data = util.feature_scaling_mean_std(X_train_data)
    #         X_test_data = util.feature_scaling_mean_std(X_test_data)
    #     elif SCALING_STRATEGY == SCALING_STRATEGY_MIN_MAX:
    #         X_train_data = util.feature_scaling_min_max(X_train_data)
    #         X_test_data = util.feature_scaling_min_max(X_test_data)
    #     else:
    #         raise ArithmeticError("SCALING STRATEGY IS NOT SET CORRECTLY!")

    ######################### SGDClassifier #######################
    if WITH_SGD:
        cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "sgd",
                         meta_M[1], X_train_data, y_train, X_test_data, y_test,
                         self.sys_out, self.cl_gridsearch,
                         self.dr_option, self.dr_gridsearch,
                         self.fs_option, self.fs_gridsearch,
                         instance_data_source_column, accepted_ds_tags)

    ######################### Stochastic Logistic Regression #######################
    if WITH_SLR:
        cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "lr",
                         meta_M[1], X_train_data, y_train, X_test_data, y_test,
                         self.sys_out, self.cl_gridsearch,
                         self.dr_option, self.dr_gridsearch,
                         self.fs_option, self.fs_gridsearch,
                         instance_data_source_column, accepted_ds_tags)

    ######################### Random Forest Classifier #######################
    if WITH_RANDOM_FOREST:
        cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "rf",
                         meta_M[1], X_train_data, y_train, X_test_data, y_test,
                         self.sys_out, self.cl_gridsearch,
                         self.dr_option, self.dr_gridsearch,
                         self.fs_option, self.fs_gridsearch,
                         instance_data_source_column, accepted_ds_tags)

    ################### liblinear SVM ##############################
    if WITH_LIBLINEAR_SVM:
        cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "svml",
                         meta_M[1], X_train_data, y_train, X_test_data, y_test,
                         index_train, index_test,
                         self.sys_out, self.cl_gridsearch,
                         self.dr_option, self.dr_gridsearch,
                         self.fs_option, self.fs_gridsearch,
                         instance_data_source_column, accepted_ds_tags)

    ##################### RBF SVM #####################
    if WITH_RBF_SVM:
        cl.learn_general(NUM_CPU, N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "svmrbf",
                         meta_M[1], X_train_data, y_train, X_test_data, y_test,
                         self.sys_out, self.cl_gridsearch,
                         self.dr_option, self.dr_gridsearch,
                         self.fs_option, self.fs_gridsearch,
                         instance_data_source_column, accepted_ds_tags)

    ec.logger.info("complete, {}".format(datetime.datetime.now()))
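# Illustrative sketch of the scaling dispatch that util.feature_scale is assumed to perform,
# based on the mean/std and min/max strategies referenced in the commented-out block above.
# The numeric strategy codes used here are assumptions, not the project's actual constants.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def _example_feature_scale(strategy, M):
    if strategy == 0:                      # assumed code for mean/std (standard) scaling
        return StandardScaler().fit_transform(M)
    if strategy == 1:                      # assumed code for min/max scaling
        return MinMaxScaler().fit_transform(M)
    raise ArithmeticError("SCALING STRATEGY IS NOT SET CORRECTLY!")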
def gridsearch_with_selectedfeatures(self, intersecton_only, *files):
    selected_features = util.read_preselected_features(intersecton_only, *files)
    self.load_data()
    M = util.feature_extraction(self.raw_train.tweet, self.feat_v, self.sys_out, ec.logger)
    M0 = pd.DataFrame(M[0])

    X_train_data, X_test_data, y_train, y_test = \
        train_test_split(M0, self.raw_train['class'],
                         test_size=cgm.TEST_SPLIT_PERCENT,
                         random_state=42)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    instance_data_source_column = None
    accepted_ds_tags = None
    if self.output_scores_per_ds:
        instance_data_source_column = pd.Series(self.raw_train.ds)
        accepted_ds_tags = ["c", "td"]

    ec.logger.info("TRANSFORM TRAINING DATA TO PRE-SELECTED FEATURE SPACE")
    X_train_selected = ct.map_to_trainingfeatures(selected_features, M[1], X_train_data.index)
    X_train_selected = util.feature_scale(SCALING_STRATEGY, X_train_selected)
    ec.logger.info(X_train_selected.shape)

    ec.logger.info("TRANSFORM TESTING DATA TO PRE-SELECTED FEATURE SPACE")
    X_test_selected = ct.map_to_trainingfeatures(selected_features, M[1], X_test_data.index)
    X_test_selected = util.feature_scale(SCALING_STRATEGY, X_test_selected)
    ec.logger.info(X_test_selected.shape)

    ######################### SGDClassifier #######################
    if WITH_SGD:
        cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "sgd",
                         M[1], X_train_selected, y_train, X_test_selected, y_test,
                         self.identifier, self.sys_out,
                         False, -1, False, 99, False,
                         instance_data_source_column, accepted_ds_tags)

    ######################### Stochastic Logistic Regression #######################
    if WITH_SLR:
        cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "lr",
                         M[1], X_train_selected, y_train, X_test_selected, y_test,
                         self.identifier, self.sys_out,
                         False, -1, False, 99, False,
                         instance_data_source_column, accepted_ds_tags)

    ######################### Random Forest Classifier #######################
    if WITH_RANDOM_FOREST:
        cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "rf",
                         M[1], X_train_selected, y_train, X_test_selected, y_test,
                         self.identifier, self.sys_out,
                         False, -1, False, 99, False,
                         instance_data_source_column, accepted_ds_tags)

    ################### liblinear SVM ##############################
    if WITH_LIBLINEAR_SVM:
        cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "svml",
                         M[1], X_train_selected, y_train, X_test_selected, y_test,
                         self.identifier, self.sys_out,
                         False, -1, False, 99, False,
                         instance_data_source_column, accepted_ds_tags)

    ##################### RBF SVM #####################
    if WITH_RBF_SVM:
        cg.learn_general(cgm.NUM_CPU, cgm.N_FOLD_VALIDATION, self.task_name, LOAD_MODEL_FROM_FILE, "svmrbf",
                         M[1], X_train_selected, y_train, X_test_selected, y_test,
                         self.identifier, self.sys_out,
                         False, -1, False, 99, False,
                         instance_data_source_column, accepted_ds_tags)
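# Illustrative sketch of reading pre-selected feature lists from CSV files, as
# util.read_preselected_features is assumed to do. The file layout (one feature per row,
# first column) and the intersection/union behaviour are assumptions for illustration only.
import csv

def _example_read_preselected_features(intersection_only, *files):
    feature_sets = []
    for path in files:
        with open(path, newline="") as f:
            feature_sets.append({row[0] for row in csv.reader(f) if row})
    if not feature_sets:
        return set()
    if intersection_only:
        return set.intersection(*feature_sets)   # features chosen by every file
    return set.union(*feature_sets)              # features chosen by any file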