def main():
    """Compare five classifiers and a hard-voting ensemble by cross-validation.

    Parses the command line, unpickles the object stored at ``--input-data``
    and prints the mean and standard deviation of the 5-fold accuracy of
    every classifier and of the ensemble built from them.

    The loaded object is indexed with ``.iloc`` (presumably a pandas
    DataFrame -- confirm against the producer of the file); column 2 is
    used as the features and column 1 as the labels.
    """
    # Parse the arguments that are passed to main.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_DATA = os.path.join("..", "..", "data", "features",
                              "bag_of_words_sparse_matrix.p")
    parser.add_argument(
        '--id', '--input-data',
        type=str,
        dest='input_data',
        default=INPUT_DATA,
        help="File with input data in matrix format. "
             "Defaults to '%(default)s'")
    # TODO: add a command line argument for PCA (whether to do it or not
    # and how much of the energy to keep).
    args = parser.parse_args()

    # Seed numpy's global RNG so repeated runs draw the same random numbers.
    np.random.seed(123)

    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    clf4 = DecisionTreeClassifier()
    clf5 = AdaBoostClassifier()

    print('5-fold cross validation:\n')

    # NOTE: unpickling executes arbitrary code; only load trusted files.
    # The context manager closes the file handle even if loading fails.
    with open(args.input_data, "rb") as handle:
        sparse_mat = pickle.load(handle)
    X = sparse_mat.iloc[:, 2]
    y = sparse_mat.iloc[:, 1]
    # TODO: if the PCA command line argument is set, perform PCA on X here.

    # Ensemble classifier: majority ("hard") vote over the five models.
    eclf = ensemble_classifier.EnsembleClassifier(
        clfs=[clf1, clf2, clf3, clf4, clf5], voting='hard')

    classifiers = [clf1, clf2, clf3, clf4, clf5, eclf]
    labels = ['Logistic Regression', 'Random Forest', 'naive Bayes',
              'Decision Tree', 'AdaBoost', 'Ensemble']
    for clf, label in zip(classifiers, labels):
        scores = cross_validation.cross_val_score(
            clf, X, y, cv=5, scoring='accuracy')
        print("Accuracy: %0.5f (+/- %0.5f) [%s]"
              % (scores.mean(), scores.std(), label))
def main():
    """Cross-validate a text classifier on item descriptions and dump a CSV.

    Reads a pickled pandas object with the columns ``type_class``,
    ``source_class``, ``frID`` and ``description``, builds a
    CountVectorizer (optionally followed by TF-IDF) + classifier pipeline,
    evaluates it with a shuffled, stratified 5-fold split, prints the mean
    and standard deviation of the fold accuracies, and writes the true and
    predicted labels next to each description to ``--output-file-desc``.
    """
    # Parse the arguments that are passed to main.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "bow_string_input_dframe.p")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output",
                                    'predicted_labels_ensemble.csv')
    parser.add_argument(
        '-i', '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help="File with the list of item classes and features. "
             "Defaults to '%(default)s'")
    parser.add_argument(
        '-g', '--gridres-file',
        type=str,
        dest='gridres_file',
        default=None,
        help="File with the best parameters of the grid search. "
             "Defaults to '%(default)s'")
    # '-ngrams' is kept for backward compatibility; '--ngrams' is the
    # conventional double-dash spelling of the same option.
    parser.add_argument(
        '-n', '-ngrams', '--ngrams',
        dest='nGrams',
        type=int,
        default=1,
        nargs='+',
        help='Defines how to split words by ngrams. '
             'Default is tokenized to one word ngrams')
    parser.add_argument(
        '--ti',
        dest='tf_idf',
        action='store_true',
        default=False,
        help='Boolean - If set, TfIdf features will be used')
    parser.add_argument(
        '-c', '--classifier',
        type=str,
        dest='classifier',
        default="ensemble",
        help="The classifier to be used. Defaults to '%(default)s'")
    parser.add_argument(
        '--cat',
        type=str,
        dest='category',
        default='income-type',
        choices=('income-type', 'income-source', 'expenditure-type'),
        help="The type of categorization. Defaults to '%(default)s'")
    parser.add_argument(
        '--od', '--output-file-desc',
        type=str,
        dest='output_file_desc',
        default=OUTPUT_FILE_DESC,
        help="A csv file to output the predicted labels. "
             "Defaults to '%(default)s'")
    args = parser.parse_args()

    # Because of nargs='+', argparse yields a list when -n is given but the
    # bare int default when it is omitted.  Collapse both cases to a single
    # upper bound so that ngram_range below is always a pair of ints.
    if isinstance(args.nGrams, (list, tuple)):
        max_ngram = max(args.nGrams)
    else:
        max_ngram = args.nGrams

    # Read the input data.
    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in['type_class'])
    source_classes = list(data_in['source_class'])
    frID = list(data_in['frID'])
    data_orig = data_in['description']
    # The fold indices are positional, so index a plain list rather than
    # the Series (whose integer indexing is label-based).
    descriptions = list(data_orig)

    # Pick the label column matching the requested categorization and
    # convert the labels to strings if they are not strings already.
    if args.category in ('income-type', 'expenditure-type'):
        label_column = 'type_class'
        labels_orig = [str(i) for i in type_classes]
    else:
        label_column = 'source_class'
        labels_orig = [str(i) for i in source_classes]

    if args.gridres_file is not None:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(args.gridres_file, "rb") as handle:
            gridres_params = pickle.load(handle)[0]
    else:
        gridres_params = None

    if args.classifier == 'decision-tree':
        max_depth = (gridres_params['clf__max_depth']
                     if gridres_params is not None else 100)
        clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                          criterion='entropy')
        print("Will run a decision tree with max depth=%s" % max_depth)
    elif args.classifier == 'random-forest':
        max_depth = (gridres_params['clf__max_depth']
                     if gridres_params is not None else 100)
        clf = ensemble.RandomForestClassifier(max_depth=max_depth,
                                              criterion='entropy')
        print("Will run a random forest with max depth=%s" % max_depth)
    elif args.classifier == 'logistic-regression':
        C = gridres_params['clf__C'] if gridres_params is not None else 1
        clf = linear_model.LogisticRegression(C=C)
        # %s rather than %d: C is frequently fractional and %d would
        # silently truncate it (0.1 would be reported as 0).
        print("Will run logistic regressor with C=%s" % C)
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100,
                                           criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(
            clfs=[clf1, clf2, clf3], voting='hard')

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer='word',  # word (rather than char) n-grams
        binary=False,  # keep counts instead of 0/1 indicators
        decode_error='strict',  # fail on bytes invalid in the encoding
        encoding="ISO-8859-15",
        input='content',  # can be 'file', 'filename' or 'content'
        lowercase=False,  # do not lowercase before tokenizing
        max_df=1.0,  # do not drop terms for being too frequent
        max_features=None,  # no cap on the vocabulary size
        ngram_range=(1, max_ngram),  # all n with 1 <= n <= max_ngram
        preprocessor=None,
        stop_words=None,
        min_df=1,
        strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=None)

    # The TF-IDF step is inserted between vectorizer and classifier only
    # when requested with --ti.
    steps = [('vectorizer', vectorizer)]
    if args.tf_idf:
        steps.append(
            ('transformer', feature_extraction.text.TfidfTransformer()))
    steps.append(('clf', clf))
    ppl = pipeline.Pipeline(steps)

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    # -1 marks samples that have not received a prediction yet.
    labels_predicted = [-1] * len(labels_orig)
    accuracy = []
    for train_idx, dev_idx in k_fold:
        data_train = [descriptions[i] for i in train_idx]
        data_dev = [descriptions[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]
        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev,
                                             labels_predicted, dev_idx)
        accuracy.append(metrics.accuracy_score(labels_dev, predicted_dev))
    print("Accuracy of the %s classifier: %.4f +- %.4f"
          % (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the true class, the predicted class and the description data.
    helpers.ensure_dir(os.path.dirname(args.output_file_desc))
    dump_op_desc = pandas.DataFrame({
        'frID': frID,
        label_column: labels_orig,
        label_column + '_predicted': labels_predicted,
        'description': data_orig,
    })
    dump_op_desc.to_csv(args.output_file_desc)
# Ensemble of two pipelines, each trained on one half of the feature columns.
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

# X and y must already be defined at this point; the loaders that used to
# provide them are kept here for reference:
#X = pickle.load(open('../../data/features/hier_data.pkl', 'rb'))
#X = X.toarray()
#y = pickle.load(open('../../data/features/hier_labels.pkl', 'rb'))

# Integer division keeps the split point an int on every Python version
# (math.floor returns a float on Python 2, which range() rejects).
Nend = X.shape[1]
N1 = Nend // 2
vec1 = range(0, N1)
vec2 = range(N1, Nend)

pipe1 = Pipeline([
    ('sel', ensemble_classifier.ColumnSelector(vec1)),  # first half of the columns
    ('clf', clf1),
])
pipe2 = Pipeline([
    ('sel', ensemble_classifier.ColumnSelector(vec2)),  # second half of the columns
    ('dim', LDA(n_components=1)),  # dimensionality reduction via LDA
    ('clf', clf2),
])

eclf = ensemble_classifier.EnsembleClassifier([pipe1, pipe2])
scores = cross_validation.cross_val_score(eclf, X, y, cv=5,
                                          scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]"
      % (scores.mean(), scores.std(), 'pipeline classifier'))
def main():
    """Cross-validate a classifier on dictionary features and pickle the result.

    Unpickles a ``(type_classes, source_classes, token_container)`` triple
    from ``--input-file``, feeds ``token_container`` through a
    DictVectorizer + classifier pipeline evaluated with a shuffled,
    stratified 5-fold split, prints the mean and standard deviation of the
    fold accuracies, and pickles ``[labels_orig, labels_predicted]`` to the
    output file.
    """
    # Parse the arguments that are passed to main.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "hierarchy-word-features.pkl")
    OUTPUT_FILE = os.path.join("..", "..", "data", "output",
                               'predicted_labels_ensemble.pkl')
    parser.add_argument(
        '-i', '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help="File with the list of item classes and features. "
             "Defaults to '%(default)s'")
    parser.add_argument(
        '-c', '--classifier',
        type=str,
        dest='classifier',
        default="ensemble",
        help="The classifier to be used. Defaults to '%(default)s'")
    # NOTE: --orig-labels is accepted but not read anywhere below; the
    # labels are always stored as strings.
    parser.add_argument(
        '--orig-labels',
        dest='orig_labels',
        action='store_true',
        default=False,
        help='Boolean - If set, the original data labels will be stored. '
             'Otherwise, they will be coded as integers')
    parser.add_argument(
        '--cat',
        type=str,
        dest='category',
        default='income-type',
        choices=('income-type', 'income-source', 'expenditure-type'),
        help="The type of categorization. Defaults to '%(default)s'")
    # '-output-file' is kept for backward compatibility; '--output-file'
    # is the conventional double-dash spelling of the same option.
    parser.add_argument(
        '-o', '-output-file', '--output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help="A file to output the predicted labels. "
             "Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input data.
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(args.input_file, "rb") as handle:
        type_classes, source_classes, token_container = pickle.load(handle)

    # Pick the labels matching the requested categorization and convert
    # them to strings if they are not strings already.
    if args.category in ('income-type', 'expenditure-type'):
        labels_orig = [str(i) for i in type_classes]
    else:
        labels_orig = [str(i) for i in source_classes]
    data_orig = token_container

    if args.classifier == 'decision-tree':
        clf = tree.DecisionTreeClassifier(max_depth=100,
                                          criterion='entropy')
    elif args.classifier == 'random-forest':
        clf = ensemble.RandomForestClassifier()
    elif args.classifier == 'logistic-regression':
        clf = linear_model.LogisticRegression()
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100,
                                           criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(
            clfs=[clf1, clf2, clf3], voting='hard')

    ppl = pipeline.Pipeline([
        ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
        ('clf', clf),
    ])

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    # -1 marks samples that have not received a prediction yet.
    labels_predicted = [-1] * len(labels_orig)
    accuracy = []
    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]
        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev,
                                             labels_predicted, dev_idx)
        accuracy.append(metrics.accuracy_score(labels_dev, predicted_dev))
    print("Accuracy of the %s classifier: %.4f +- %.4f"
          % (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the true and the predicted classes.  The context manager makes
    # sure the pickle is flushed and the handle closed.
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    with open(args.output_file, "wb") as handle:
        pickle.dump(to_dump, handle)
# Grid search over the soft-voting weights of a three-classifier ensemble.
# clf1, clf2, clf3, X and y must already be defined at this point.
label = ['Logistic Regression', 'Random Forest', 'Decision Tree']
print(label)

# One row per weight combination: w1, w2, w3, mean (averaged over k-folds), std
df = pd.DataFrame(columns=('w1', 'w2', 'w3', 'mean', 'std'))

i = 0
for w1 in range(0, 3):
    for w2 in range(0, 3):
        for w3 in range(0, 3):

            if len(set((w1, w2, w3))) == 1:  # skip if all weights are equal
                continue

            eclf = ensemble_classifier.EnsembleClassifier(
                clfs=[clf1, clf2, clf3],
                voting='soft',
                weights=[w1, w2, w3])
            scores = cross_validation.cross_val_score(
                estimator=eclf,
                X=X,
                y=y,
                cv=5,
                scoring='accuracy',
                n_jobs=1)

            df.loc[i] = [w1, w2, w3, scores.mean(), scores.std()]
            print("Accuracy: %0.5f (+/- %0.5f) w1=%d w2=%d w3=%d"
                  % (scores.mean(), scores.std(), w1, w2, w3))
            i += 1

# sort_values returns a new frame, so the result has to be kept; the
# previous bare df.sort(...) call threw the sorted table away.
df = df.sort_values(by=['mean', 'std'], ascending=False)

# Printing out the results, best mean accuracy first.
print(df)