def run_experiment(self):
    """Train and score the aspect classifier once per configured random state.

    For every value in ``self.random_states`` the matching train/test CSV
    pair is read through ``self.utilities.read_from_csv``. Column 0 of each
    row is used as the sample and column 1 as its label. The estimator in
    ``self.aspect_classifier`` is fitted on the training rows, then the
    classification report (4 digits) and the accuracy on the test rows are
    printed.

    Returns:
        None. Results are only printed.
    """

    def split_columns(rows):
        # Single pass, so one-shot iterables (e.g. a CSV reader) still work.
        samples, targets = [], []
        for row in rows:
            samples.append(row[0])
            targets.append(row[1])
        return samples, targets

    # NOTE(review): machine-specific absolute path; consider making it
    # configurable.
    path = '/home/hmayun/PycharmProjects/create-dataset-r/segment-level-7-categories/'
    database = 'mmhsct'
    # database = 'srft'

    for random_state in self.random_states:
        suffix = str(random_state) + '.csv'
        training_file = path + database + '_segments_train_' + suffix
        test_file = path + database + '_segments_test_' + suffix

        X_train, y_train = split_columns(
            self.utilities.read_from_csv(training_file))
        X_test, y_test = split_columns(
            self.utilities.read_from_csv(test_file))

        model = self.aspect_classifier.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statements were a syntax error on Python 3.
        print(clsr(y_test, y_pred, digits=4))
        print(accuracy_score(y_test, y_pred))
def with_wv(model, Xtr, ytr, Xte, yte):
    """Experimental: refit the pipeline's classifier on TF-IDF plus word vectors.

    The features produced by the pipeline's own ``'vect'`` step are stacked
    horizontally with the output of ``generate_word_vectors_avg`` for the
    words reported by ``analysis``. The pipeline's ``'classif'`` step is then
    refitted in place on the combined training matrix and a classification
    report for the test data is printed.

    Args:
        model: fitted pipeline exposing ``named_steps['vect']`` and
            ``named_steps['classif']``.
        Xtr, Xte: containers indexed with ``["Message"]`` to obtain the text.
        ytr, yte: training and test targets.
    """
    embeddings = KeyedVectors.load_word2vec_format(
        '~/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format

    train_text = Xtr["Message"]
    test_text = Xte["Message"]

    prep = NLTKPreprocessor(stem=False)
    tfidf = TfidfVectorizer(preprocessor=prep, lowercase=True,
                            stop_words='english', ngram_range=(1, 2))
    # Only the vocabulary of this vectorizer is used further down; the
    # matrices it produces are discarded.
    tfidf.fit_transform(train_text)
    tfidf.transform(test_text)
    vocab = tfidf.get_feature_names()

    ranked = list(zip(*analysis(model, Xte, yte)))
    #imp_indices = list(ranked[2])
    #vocab = np.array(vocab)
    top_words = list(ranked[0])
    imp_names = extract_full_words(prep, top_words, vocab)

    pipeline_vect = model.named_steps['vect']
    train_tfidf = pipeline_vect.transform(train_text)
    test_tfidf = pipeline_vect.transform(test_text)

    train_wv = generate_word_vectors_avg(embeddings, imp_names, train_text)
    test_wv = generate_word_vectors_avg(embeddings, imp_names, test_text)

    train_matrix = sp.sparse.hstack((train_tfidf, train_wv), format='csr')
    test_matrix = sp.sparse.hstack((test_tfidf, test_wv), format='csr')

    classifier = model.named_steps['classif']
    classifier.fit(train_matrix, ytr)
    predictions = classifier.predict(test_matrix)
    print(clsr(yte, predictions))
def evaluate_on_test_split(self, X_test, y_test, verbose=True):
    """Predict on a held-out split and report how well ``self.model`` did.

    The classification report is always printed; ``verbose`` only controls
    the heading line in front of it.

    Returns:
        tuple: ``(y_pred, score)``, the predictions for ``X_test`` and their
        accuracy against ``y_test``.
    """
    if verbose:
        print("Classification Report:\n")

    predictions = self.model.predict(X_test)
    report = clsr(y_test, predictions)
    print(report)

    return predictions, accuracy_score(y_test, predictions)
def buildnEvaluateModel(X, y):
    """Fit a linear-SVM text pipeline and evaluate it on a 20% hold-out.

    The input is split 80/20. Targets are converted to ``bool`` via ``int``,
    a CountVectorizer -> TfidfTransformer -> SVC pipeline is fitted on the
    larger part, and accuracy plus a classification report for the smaller
    part are printed.

    Returns:
        The fitted pipeline.
    """
    # Keep 20% of the training data aside for model evaluation.
    X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.2)

    def as_bools(values):
        # Targets arrive as float-like values; normalise them to booleans.
        return [bool(int(value)) for value in values]

    y_train = as_bools(y_train)
    y_cv = as_bools(y_cv)

    # The encoder is only used to obtain the class names for the report.
    encoder = LabelEncoder()
    encoder.fit(y_train)

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='linear', probability=True)),
    ]
    text_clf = Pipeline(steps).fit(X_train, y_train)

    # Evaluate the model on the hold-out part.
    predicted = text_clf.predict(X_cv)
    hit_rate = np.mean(predicted == y_cv)
    print("Model Accuracy = " + str(hit_rate))

    class_names = [str(name) for name in encoder.classes_]
    print(clsr(y_cv, predicted, target_names=class_names))
    return text_clf
def build_and_evaluate(X_train, X_test, y_train, y_test, outpath=None):
    """Train the SVM on tokenized text, report on the test split, optionally save.

    Args:
        X_train, X_test: raw inputs passed to ``BertTokenizer.process_batch``.
        y_train, y_test: targets for the two splits.
        outpath: optional path; when truthy the trained classifier is pickled
            there.

    Returns:
        The trained classifier (the object returned by
        ``Sklearn_SVM().setup()``).
    """
    # Create tokenizer
    tokenizer = BertTokenizer()
    tokenizer.setup()

    # SVM classifier
    svm = Sklearn_SVM()
    svm = svm.setup()

    train_tokenized = tokenizer.process_batch(X_train)
    print("Training model......")
    svm.fit(train_tokenized, y_train)

    print("Predicting test data...")
    test_tokenized = tokenizer.process_batch(X_test)
    y_pred = svm.process_batch(test_tokenized)

    print("Classification Report:\n")
    # The report previously asked for target_names=set(y): ``y`` is not
    # defined in this function and a set has no stable order, so names could
    # not be matched to labels. Without target_names the report uses the
    # label values themselves as row names.
    print(clsr(y_test, y_pred))

    if outpath:
        with open(outpath, "wb") as f:
            # ``svm`` is the trained object; the former ``model`` name was
            # never defined in this function.
            pickle.dump(svm, f)
        print("Model written out to {}".format(outpath))

    return svm
def build_and_evaluate(X, y, classifier=svm.SVC, verbose=True):
    """Fit the feature-union pipeline on an 80/20 split and print a report.

    Args:
        X: documents to classify.
        y: labels; they are label-encoded before training.
        classifier: estimator class (instantiated without arguments) or a
            ready estimator instance used as the final pipeline step.
        verbose: print progress headings when true.

    Returns:
        The pipeline fitted on the training part of the split. (Earlier
        versions returned nothing, so existing callers are unaffected.)
    """

    def build(classifier, X, y=None):
        """Assemble and fit a single pipeline around ``classifier``."""
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('union', FeatureUnion(transformer_list=[
                ('bag_words', Pipeline([
                    ('preprocessor', NLTKPreprocessor()),
                    #('tfidf', TfidfVectorizer(ngram_range=(1, 2), tokenizer=identity, preprocessor=None, lowercase=False)),
                    #('tfidf', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words='english')),
                    ('topics_and_ngrams', FeatureUnion(transformer_list=[
                        ('grams', Pipeline([
                            ('ngram', TfidfVectorizer(ngram_range=(1, 2),
                                                      tokenizer=identity,
                                                      preprocessor=None,
                                                      lowercase=False)),
                            ('best', TruncatedSVD(n_components=50)),
                        ])),
                        #('topics', Pipeline([
                        #    ('tfid', TfidfVectorizer(ngram_range=(1, 1), tokenizer=identity, preprocessor=None, lowercase=False)),
                        #    ('topic', NMF(n_components=9, random_state=1,
                        #                  alpha=.1, l1_ratio=.5)),
                        #])),
                    ])),
                ])),
                # add other features here as an element in transformer list
                ('capitalize', Pipeline([('cap_words', CaptilizationExtractor())])),
                ('punctuation', PuncuationExtractor()),
                #('emotion', Pipeline([
                #    ('emotion_words', EmotionExtractor())
                #])),
            ])),
            # Use the requested estimator; this step used to be a hard-coded
            # svm.SVC(), which silently ignored the ``classifier`` argument.
            ('svc', classifier),
        ])
        model.fit(X, y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("classification Report: \n")
    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred))

    return model
def getTrainingErrors():
    """Score the module-level ``model`` on the annotated threads stored on disk.

    Loads ``OUTPUT_DIR + 'annotatedThreads.pickle'`` (a mapping whose keys
    are used as samples and whose values are used as labels), predicts with
    ``model`` and prints a classification report, the binary confusion matrix
    and the FPR / TPR / accuracy derived from it.

    Raises:
        SystemExit: when the annotation file does not exist.
    """
    filename = OUTPUT_DIR + 'annotatedThreads.pickle'
    if not os.path.exists(filename):
        print("%s ERROR. Annotation file not foud" % (datetime.now().strftime(tsFormat)))
        # Same effect as the former exit() call, without relying on the
        # ``site`` module having injected that helper.
        raise SystemExit

    print("%s Reading already annotated posts from disk" % (datetime.now().strftime(tsFormat)))
    # Pickles are binary data: open in 'rb' (required on Python 3) and close
    # the handle deterministically.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # trusted files.
    with open(filename, 'rb') as handle:
        annotated = pickle.load(handle)

    print("%s Predicting annotated" % datetime.now().strftime(tsFormat))
    # Materialise the dict views so they are plain sequences on Python 3.
    X_test = list(annotated.keys())
    y_test = list(annotated.values())
    y_pred = model.predict(X_test)

    labels = model.labels_
    # transform(), not fit_transform(): refitting would mutate the encoder
    # stored on the model and could remap classes when the annotated set
    # lacks one of them, making y_test incomparable with y_pred.
    y_test = labels.transform(y_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    # ravel() order for a binary confusion matrix is tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(" ---------------------")
    print(" Predicted")
    print(" 'o' \t'p'")
    print(" --------------")
    print("Real 'o' | %s \t%s" % (tn, fp))
    print(" 'p' | %s \t%s" % (fn, tp))
    print("")
    # False-positive rate is FP / (FP + TN). The previous fp / (fp + tp) is
    # the false discovery rate.
    print("FPR:%.3f" % (float(fp) / (fp + tn)))
    print("TPR:%.3f" % (float(tp) / (tp + fn)))
    print("ACC:%.3f" % ((float(tp) + tn) / (tp + fn + tn + fp)))
def build_and_eval(chat_log_file, model_path, classifier=None):
    """Cross-validate a classifier on a chat corpus, then fit and pickle it.

    Args:
        chat_log_file: passed to ``load_corpus`` to obtain documents and
            labels.
        model_path: where the final fitted classifier is pickled.
        classifier: estimator instance; a default ``SGDClassifier()`` is
            created when ``None``.

    Returns:
        The classifier fitted on the full vectorized corpus, carrying the
        label encoder in ``labels_``.
    """
    if classifier is None:
        classifier = SGDClassifier()

    print("preprocessing corpus")
    documents, targets = load_corpus(chat_log_file)
    vectorizer = create_vectorizer()
    vectorizer.fit(documents)
    features = vectorizer.transform(documents, copy=False)

    labels = LabelEncoder()
    encoded = labels.fit_transform(targets)

    print("training test model")
    y_pred = cross_val_predict(classifier, features, encoded, cv=5)
    with open('data/y_pred.pkl', 'wb') as pred_file:
        pickle.dump(y_pred, pred_file)
    print(clsr(encoded, y_pred))

    print("training final model")
    model = classifier.fit(features, encoded)
    model.labels_ = labels
    # NOTE(review): only the classifier is pickled here; the fitted
    # vectorizer is not part of the saved object. Confirm it is persisted
    # elsewhere.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """Evaluate a text pipeline on a hold-out split, then refit it on all data.

    Args:
        X: documents to classify.
        y: labels; they are label-encoded before training.
        classifier: estimator class (instantiated without arguments) or a
            ready estimator instance.
        outpath: optional path; when truthy the final model is pickled there.
        verbose: print progress and timing information.

    Returns:
        The pipeline fitted on all of ``X``/``y`` with the label encoder
        attached as ``labels_``.
    """

    # @timeit
    def build(classifier, X, y=None):
        """Assemble and fit one preprocessor -> TF-IDF -> classifier pipeline."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Stage 1: fit on 80% and report on the remaining 20%.
    started = time()
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        elapsed = time() - started
        print("Evaluation model fit in {:0.3f} seconds".format(elapsed))
        print("Classification Report:\n")

    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=encoder.classes_))

    # Stage 2: refit on the complete data set.
    started = time()
    if verbose:
        print("Building complete model and saving ...")
    model = build(classifier, X, y)
    model.labels_ = encoder

    if verbose:
        elapsed = time() - started
        print("Complete model fit in {:0.3f} seconds".format(elapsed))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
def analysis(model, X_test, y_test):
    """Print evaluation output for a fitted pipeline and return its feature ranking.

    Prints the confusion matrix and classification report for
    ``X_test["Message"]``, then whatever ``feature_importances`` returns for
    the pipeline's ``'classif'`` step and the ``'vect'`` step's vocabulary
    (only when that result is not ``None``).

    Returns:
        The value returned by ``feature_importances`` (possibly ``None``).
    """
    steps = model.named_steps
    clf = steps['classif']
    voc = steps['vect'].vocabulary_

    predictions = model.predict(X_test["Message"])
    print(confusion_matrix(y_test, predictions))
    print(clsr(y_test, predictions))

    ranking = feature_importances(clf, voc)
    if ranking is None:
        return ranking

    print("top and bottom 7 features")
    print(ranking)
    return ranking
def build_and_save_model(X, y, filepath):
    """Fit an SGD text pipeline on 90% of the data, report on the rest, save it.

    Args:
        X: documents to classify.
        y: labels; they are label-encoded before training.
        filepath: destination of the pickled pipeline.

    Returns:
        The pipeline fitted on the training part of the split, with the
        label encoder attached as ``labels_``.
    """

    def build(classifier, X, y=None):
        """Build and fit preprocessor -> TF-IDF vectorizer -> classifier."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        stages = [
            ('preprocessor', DataPreProcessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(stages)
        pipeline.fit(X, y)
        return pipeline

    # Encode the class labels as integers.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(y)

    # 90/10 train/test split.
    X_train, X_test, y_train, y_test = tts(X, encoded, test_size=0.1)
    model = build(SGDClassifier, X_train, y_train)

    # Report how the model does on the held-out 10%.
    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=encoder.classes_))

    model.labels_ = encoder
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)

    return model
def evaluate_classifier(self, classifier, X, y):
    """Fit ``classifier`` on a 70/30 split and log its mistakes for error analysis.

    Every misclassified test sample is written to ``aspect_errors.csv`` (via
    ``self.utilities.write_content_to_file``) as one
    ``"sample","expected","predicted"`` row, and the classification report
    for the test part is printed.

    Args:
        classifier: estimator with ``fit`` and ``predict``.
        X: samples.
        y: labels.
    """
    # Fixed random_state keeps the split, and thus the error file, reproducible.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=5)
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    def quoted(value):
        # CSV quoting: wrap in double quotes and double any embedded quote so
        # samples containing '"' do not corrupt the row. '%s' also accepts
        # non-string labels, which plain concatenation rejected.
        text = '%s' % (value,)
        return '"' + text.replace('"', '""') + '"'

    # *** save info for error analysis
    # zip() pairs the values positionally, so this also works for containers
    # whose [] operator is not positional.
    errors = [
        ",".join((quoted(sample), quoted(expected), quoted(predicted)))
        for sample, expected, predicted in zip(X_test, y_test, y_pred)
        if expected != predicted
    ]
    self.utilities.write_content_to_file('aspect_errors.csv', "\n".join(errors))

    print(clsr(y_test, y_pred))
def build_model(X, y, classifier, verbose=True):
    """Evaluate a text pipeline on a hold-out split, then refit it on all data.

    Args:
        X: documents to classify.
        y: labels; they are label-encoded before training.
        classifier: estimator placed as the last pipeline step as given.
        verbose: print progress and timing information.

    Returns:
        The pipeline fitted on all data. Its ``labels_`` attribute holds the
        original label values for ``model.classes_``.
    """

    @timeit
    def build(classifier, X, y=None):
        """Fit one pipeline; the ``timeit`` wrapper's result is unpacked by
        the callers below as ``(model, seconds)``."""
        stages = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', classifier),
        ]
        pipeline = Pipeline(stages)
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Stage 1: hold-out evaluation on a 20% test split.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, fit_seconds = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(fit_seconds))
    if verbose:
        print("Classification Report:\n")

    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=encoder.classes_))

    # Stage 2: refit on everything.
    if verbose:
        print("Building complete model and saving ...")
    model, fit_seconds = build(classifier, X, y)
    model.labels_ = encoder.inverse_transform(model.classes_)

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(fit_seconds))

    return model
def build_and_evaluate(text, leanings, classifier=SGDClassifier, verbose=True):
    """Evaluate a text pipeline on a hold-out split, then refit it on all data.

    Args:
        text: documents to classify.
        leanings: labels; they are label-encoded before training.
        classifier: estimator class (instantiated without arguments) or a
            ready estimator instance.
        verbose: accepted for interface compatibility; not read by this
            function.

    Returns:
        tuple: ``(leanings_test, leanings_pred_prob, model)``, i.e. the
        encoded test labels, the ``predict_proba`` output for the test
        documents, and the pipeline refitted on all data (label encoder in
        ``labels_``).
    """

    def build(classifier, X, y=None):
        """Assemble and fit preprocessor -> TF-IDF -> classifier."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    leanings = encoder.fit_transform(leanings)

    # Build model on training data.
    split = tts(text, leanings, test_size=0.2)
    text_train, text_test, leanings_train, leanings_test = split
    #build(classifier, text_train, leanings_train)
    model = build(classifier, text_train, leanings_train)

    leanings_pred = model.predict(text_test)
    # NOTE(review): requires an estimator that offers predict_proba; a
    # default-constructed SGDClassifier presumably does not. Confirm which
    # classifier callers pass.
    leanings_pred_prob = model.predict_proba(text_test)
    print(clsr(leanings_test, leanings_pred, target_names=encoder.classes_))

    # Build model on all data.
    model = build(classifier, text, leanings)
    model.labels_ = encoder

    return leanings_test, leanings_pred_prob, model
def build(classifier, X, y=None, export=False):
    """Repeatedly fit ``classifier`` on random 80/20 splits and collect metrics.

    Features are the union of a TF-IDF pipeline and an ``AbstractStats``
    dict-vectorizer pipeline, scaled with ``StandardScaler(with_mean=False)``.
    The number of repetitions is the enclosing-scope ``numtests``, or 1 when
    ``export`` is true. ``verbose`` is also read from the enclosing scope.

    Args:
        classifier: estimator class (instantiated without arguments) or a
            ready estimator instance.
        X: raw documents.
        y: labels; they are label-encoded before training.
        export: run a single split only.

    Returns:
        tuple: ``(best_model, labels, feature_extractor, stats)``, where
        ``best_model`` is a copy of the classifier as fitted in the most
        accurate iteration, ``labels`` is the fitted LabelEncoder and
        ``stats`` holds the per-iteration and average
        accuracy/f1/precision/recall.
    """
    import copy  # local import: the file's import block is not visible here

    if isinstance(classifier, type):
        classifier = classifier()

    tfidf = Pipeline([
        ('preprocessor', NLTKPreprocessor()),
        ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                       preprocessor=None,
                                       lowercase=False)),
    ])
    abstract = Pipeline([
        ('abstract_feature', AbstractStats()),
        ('vectorizer', DictVectorizer()),  # list of dicts -> feature matrix
    ])
    features = [('tfidf', tfidf), ('abstract', abstract)]
    feature_extractor = Pipeline([
        ('feature_union', FeatureUnion(features)),
    ])

    X = feature_extractor.fit_transform(X)

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    best_accuracy = 0
    best_model = None
    best_y_test = None
    best_y_pred = None
    stats = {'accuracy': [], 'f1': [], 'precision': [], 'recall': []}

    tests = numtests
    if export:
        tests = 1

    # NOTE(review): the scaler is fitted on all rows before splitting and is
    # not part of the returned objects. Confirm callers scale new data the
    # same way.
    scaler = StandardScaler(with_mean=False)
    X = scaler.fit_transform(X)

    # GaussianNB is fed dense arrays; everything else gets the matrix as-is.
    needs_dense = classifier.__class__.__name__ == 'GaussianNB'

    for _ in range(tests):
        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
        if needs_dense:
            classifier.fit(X_train.toarray(), y_train)
            y_pred = classifier.predict(X_test.toarray())
        else:
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        stats['accuracy'].append(accuracy)
        stats['f1'].append(metrics.f1_score(y_test, y_pred))
        stats['precision'].append(metrics.precision_score(y_test, y_pred))
        stats['recall'].append(metrics.recall_score(y_test, y_pred))

        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            # The same estimator object is refitted on every iteration, so
            # keep a snapshot; a plain reference would always end up being
            # the last fit rather than the best one.
            best_model = copy.deepcopy(classifier)
            best_y_test = y_test
            best_y_pred = y_pred

    for name in ('accuracy', 'f1', 'precision', 'recall'):
        stats['avg_' + name] = sum(stats[name]) / len(stats[name])

    # Report the real number of iterations (the old counter was never
    # incremented and always printed 0).
    print('******Results after', len(stats['accuracy']), 'iterations*****')
    print('accuracy'.ljust(15), stats['avg_accuracy'])
    print('f1'.ljust(15), stats['avg_f1'])
    print('precision'.ljust(15), stats['avg_precision'])
    print('recall'.ljust(15), stats['avg_recall'])

    if verbose:
        print("Classification Report:\n")
        print(clsr(best_y_test, best_y_pred, target_names=labels.classes_))

    return (best_model, labels, feature_extractor, stats)
def train_report(model, P, N):
    """Print a classification report for ``model`` on its own training data.

    ``P`` and ``N`` are concatenated (P first) and predicted in one call;
    every entry of ``P`` is expected to be labelled 1.0 and every entry of
    ``N`` 0.0.
    """
    print("Classification Report (on training, not on test data!):\n")
    samples = np.concatenate((P, N))
    predictions = model.predict(samples)
    expected = [1.] * len(P) + [0.] * len(N)
    print(clsr(expected, predictions))
    return
def eval_model(model, X, y):
    """Print accuracy and a classification report for rounded predictions.

    Nothing happens when either ``X`` or ``y`` is ``None``. Otherwise the
    output of ``model.predict(X)`` is rounded with ``np.round`` before being
    compared with ``y``.

    Returns:
        None.
    """
    if X is None or y is None:
        return

    predictions = np.round(model.predict(X))
    print("Accuracy:", accuracy_score(y, predictions))
    print(clsr(y, predictions))
    return
# utilities.viewTable(features_df)
# ``.values`` and the builtin ``float`` replace ``as_matrix()`` and
# ``np.float``, which newer pandas / NumPy releases no longer provide; the
# resulting array is the same.
X = features_df.values.astype(float)

# 5-fold stratified cross-validation: each iteration trains on four folds
# and evaluates on the remaining one.
skf = StratifiedKFold(n_splits=5, shuffle=True)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    modelRF = RandomForestClassifier(n_estimators=2000, max_depth=5,
                                     class_weight="balanced", n_jobs=16)
    modelRF.fit(X_train, y_train)
    y_pred = modelRF.predict(X_test)
    y_predTr = modelRF.predict(X_train)

    # print() with a single argument behaves the same on Python 2 and 3.
    print(clsr(y_test, y_pred))      # held-out fold
    print(clsr(y_train, y_predTr))   # training folds
    # NOTE(review): this cross-validates on the test fold only, not on the
    # model fitted above. Confirm that is intended.
    print(cross_val_score(modelRF, X_test, y_test, scoring='f1'))
    print(cohen_kappa_score(y_pred, y_test))
# Remove NaN entries that may be present
features_df = clean_dataset(features_df)

# Create the X and y arrays.
# ``.values`` and the builtin ``float`` replace ``as_matrix()`` and
# ``np.float``, which newer pandas / NumPy releases no longer provide.
y = features_df['Outcome'].values.astype(float)
print(len(features_df.columns))

# Remove the Outcome from the feature data
del features_df['Outcome']
print(len(features_df.columns))
X = features_df.values.astype(float)

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

modelRF = joblib.load("trainedSte3Model.pkl")

# Keep test and training predictions apart: the training predictions used to
# overwrite y_pred and were then scored against y_test, which has a different
# length.
y_pred = modelRF.predict(X_test)
y_pred_train = modelRF.predict(X_train)

print(y_pred)
print(clsr(y_test, y_pred))
print(clsr(y_train, y_pred_train))
def build_and_evaluate(X, y,classifier=SGDClassifier, outpath=None, verbose=True,test_size=0.2):
    """Build a thread classifier, evaluate it on a hold-out split and refit it.

    The pipeline is a union of four feature extractions: TF-IDF of the
    heading and of the first post's content (both through NLTKPreprocessor),
    thread statistics and post statistics, followed by a linear SVC.

    Args:
        X: a list of integers corresponding with thread IDs.
        y: a list or iterable of labels, which will be label encoded.
        classifier: estimator class or instance. NOTE(review): it is
            instantiated but the pipeline's final step is a fixed
            ``SVC(kernel='linear')``, so this argument currently has no
            effect on the model.
        outpath: optional path; when truthy the final model is pickled there.
        verbose: print progress, the classification report and the confusion
            matrix statistics.
        test_size: share of the data held out for evaluation.

    Returns:
        The pipeline fitted on all data with the label encoder attached as
        ``labels_``.
    """

    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            # Extract the heading,content and numPosts
            ('features', FeatureExtractor()),

            # Use FeatureUnion to combine the different features
            ('union', FeatureUnion(
                transformer_list=[
                    # Bag-of-words from the thread's heading after tokenizing
                    # and lemmatizing, then TF-IDF vectorization (can be
                    # truncated using SVD if needed)
                    ('heading', Pipeline([
                        ('selector', ItemSelector(key='numPosts_heading')),
                        ('preprocessor', NLTKPreprocessor()),
                        ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
                        #('featureselector', TruncatedSVD(n_components=20)),
                    ])),

                    # Bag-of-words from the thread's first post content after
                    # tokenizing and lemmatizing, then TF-IDF vectorization
                    # truncated using SVD
                    ('content', Pipeline([
                        ('selector', ItemSelector(key='content')),
                        ('preprocessor', NLTKPreprocessor()),
                        ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
                        ('featureselector', TruncatedSVD(n_components=50)),
                    ])),

                    # Ad hoc features from the thread's heading
                    ('threadStats', Pipeline([
                        ('selector', ItemSelector(key='numPosts_heading')),
                        ('stats', ThreadStats()),  # returns a list of dicts
                        ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                    ])),

                    # Ad hoc features from the thread's first post content
                    ('postStats', Pipeline([
                        ('selector', ItemSelector(key='content')),
                        ('stats', PostsStats()),  # returns a list of dicts
                        ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                    ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
                    'heading': 0.8,
                    'content': 0.5,
                    'threadStats': 1.0,
                    'postStats': 1.0,
                },
            )),

            # Use a SVC classifier on the combined features
            ('svc', SVC(kernel='linear')),
        ])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose: print("%s Building for evaluation"%datetime.now().strftime(tsFormat))
    X_train, X_test, y_train, y_test = tts(X, y, test_size=test_size)
    model, secs = build(classifier, X_train, y_train)

    if verbose: print("%s Evaluation model fit in %0.3f seconds"%(datetime.now().strftime(tsFormat),secs))

    if verbose:
        y_pred = model.predict(X_test)
        print("%s Classification Report:\n"%datetime.now().strftime(tsFormat))
        print(clsr(y_test, y_pred, target_names=labels.classes_))
        # ravel() order for a binary confusion matrix is tn, fp, fn, tp.
        tn, fp, fn, tp= confusion_matrix(y_test,y_pred).ravel()
        print (" CONFUSION MATRIX")
        print (" ---------------------")
        print (" Predicted")
        print (" 'o' \t'p'")
        print (" --------------")
        print ("Real 'o' |%s \t%s"%(tn,fp))
        print (" 'p' |%s \t%s"%(fn,tp))
        print ()
        # False-positive rate is FP / (FP + TN). The previous fp / (fp + tp)
        # is the false discovery rate.
        print ("FPR:%.3f"%(float(fp)/(fp+tn)))
        print ("TPR:%.3f"%(float(tp)/(tp+fn)))
        print ("ACC:%.3f"%((float(tp)+tn)/(tp+fn+tn+fp)))
        print("%s Building complete model and saving ..."%datetime.now().strftime(tsFormat))

    model, secs = build(classifier, X, y)
    model.labels_ = labels

    if verbose: print("%s Complete model fit in %0.3f seconds"%(datetime.now().strftime(tsFormat),secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        if verbose: print("%s Model written out to %s"%(datetime.now().strftime(tsFormat),outpath))

    return model
def build_and_evaluate(X, y, outpath=None, verbose=True):
    """Evaluate a text pipeline on a hold-out split, then refit it on all data.

    The first stage does an 80/20 train/test split and prints the number of
    errors, a classification report and the accuracy. The second stage
    rebuilds the model on the entire corpus and returns it. Both stages
    request the ``"NN"`` (MLPClassifier) variant from the inner builder.

    Args:
        X: a list or iterable of raw strings, each representing a document.
        y: a list or iterable of labels, which will be label encoded.
        outpath: optional path; when truthy the final model is pickled there.
        verbose: print progress and timing information.

    Returns:
        The pipeline fitted on all data with the label encoder attached as
        ``labels_``.
    """

    @timeit
    def build(Classifier, X, y=None):
        """Fit one pipeline for the classifier named by ``Classifier``;
        unknown names fall back to SGDClassifier. The ``timeit`` wrapper's
        result is unpacked by the callers below as ``(model, seconds)``."""
        factories = {
            "NB": lambda: MultinomialNB(),
            "SVC": lambda: SVC(),
            "LSVC": lambda: LinearSVC(),
            "LR": lambda: LogisticRegression(),
            "NN": lambda: MLPClassifier(solver='lbfgs', alpha=1e-5,
                                        hidden_layer_sizes=(10, 2),
                                        random_state=1),
        }
        make_classifier = factories.get(Classifier, lambda: SGDClassifier())
        estimator = make_classifier()

        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False,
                                           min_df=0.01, max_df=0.95)),
            ('classifier', estimator),
        ])
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Stage 1: hold-out evaluation.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    # pdb.set_trace()
    print("Train size %s" % len(X_train))
    print("Y train size %s" % len(y_train))
    model, fit_seconds = build("NN", X_train, y_train)
    #model, secs = build("KK", X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(fit_seconds))
    if verbose:
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    mistakes = sum(1 for predicted, expected in zip(y_pred, y_test)
                   if predicted != expected)
    print("Number of test %s, numer of errors %s" % (len(y_pred), mistakes))
    print(clsr(y_test, y_pred, target_names=encoder.classes_))
    accuracy = acc(y_test, y_pred)
    print('Accuracy: {}'.format(accuracy))

    # Stage 2: refit on everything.
    if verbose:
        print("Building complete model and saving ...")
    print("size of the total data is %s and total label %s" % (len(X), len(y)))
    model, fit_seconds = build("NN", X, y)
    #model, secs = build("KK", X, y)
    model.labels_ = encoder

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(fit_seconds))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
def build_and_evaluate(balanced, X, y, classifier=LogisticRegression, outpath=None, verbose=True):
    """Evaluate a text pipeline on a hold-out split, then refit it on all data.

    Args:
        balanced: when equal to ``True`` the classifier class is instantiated
            with ``class_weight='balanced'``, otherwise with
            ``class_weight=None``.
        X: documents to classify.
        y: labels; they are label-encoded before training.
        classifier: estimator class (instantiated with multinomial / saga
            settings) or a ready estimator instance, which is used as given.
        outpath: optional path; when truthy the final model is pickled there.
        verbose: print progress headings and the confusion matrix.

    Returns:
        The pipeline fitted on all data with the label encoder attached as
        ``labels_``.
    """

    def build(balanced, classifier, X, y=None):
        """Assemble and fit TF-IDF (1-2 grams) -> chi2 selection -> classifier."""
        if isinstance(classifier, type):
            # Earlier experiments (default construction, hand-computed
            # per-class weights, max_iter=1000, C=1) were replaced by the
            # settings below.
            class_weight = 'balanced' if balanced == True else None
            classifier = classifier(multi_class='multinomial', solver='saga',
                                    class_weight=class_weight)

        # Alternatives tried at this point and left disabled: a
        # CountVectorizer, SelectFromModel / plain SelectPercentile /
        # VarianceThreshold selection, densifying and standardizing the
        # matrix, and a GridSearchCV over multi_class and solver.
        stages = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=None,
                                           ngram_range=(1, 2))),
            ('feature_selection', SelectPercentile(score_func=chi2,
                                                   percentile=90)),
            ('classifier', classifier),
        ]
        pipeline = Pipeline(stages)
        pipeline.fit(X, y)
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Stage 1: hold-out evaluation on a reproducible 80/20 split.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)
    model = build(balanced, classifier, X_train, y_train)
    y_pred = model.predict(X_test)

    df_confusion = pd.crosstab(pd.Series(y_test, name='Actual'),
                               pd.Series(y_pred, name='Predicted'),
                               rownames=['Actual'],
                               colnames=['Predicted'],
                               margins=True)
    if verbose:
        print("Confusion Matrix:\n")
        print(df_confusion)

    if verbose:
        print("Classification Report:\n")
    print(clsr(y_test, y_pred, target_names=encoder.classes_, digits=4))
    hit_rate = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
    print(hit_rate * 100)
    # A stratified 5-fold cross_val_score on the training split was also
    # tried here and is disabled.

    # Stage 2: refit on everything.
    if verbose:
        print("Building complete model and saving ...")
    model = build(balanced, classifier, X, y)
    model.labels_ = encoder

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
array = []
label = []

# Load trained model
# NOTE(review): pickle.load can execute arbitrary code; only load trusted
# model files.
model_dir = "./"
model_file = "model.pickle"
model_name = model_dir + model_file
with open(model_name, 'rb') as f:
    model = pickle.load(f)

# Load test data
filenames = ['test_BGS.neg', 'test_BGS.pos']
neg_file, pos_file = filenames
# Positive samples (label 1) come from the .pos file and negative samples
# (label 0) from the .neg file; the two handles used to be bound to the
# opposite files, which inverted every label.
with open(pos_file, "r") as pos, open(neg_file) as neg:
    for line in pos:
        array.append(line)
        label.append(1)
    for line in neg:
        array.append(line)
        label.append(0)

labels = LabelEncoder()
y = labels.fit_transform(label)
y_pred = model.predict(array)

# Number of misclassified samples.
tot = sum(1 for predicted, expected in zip(y_pred, label)
          if predicted != expected)

print(clsr(label, y_pred))
accuracy = acc(label, y_pred)
print("Accuracy:{}".format(accuracy))
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """Build a classifier for the given documents and targets in two stages.

    The first stage does a train/test split and prints a classification
    report. The second rebuilds the model on the entire corpus and returns
    it for operationalization.

    Args:
        X: a list or iterable of raw strings, each representing a document.
        y: a list or iterable of labels, which will be label encoded.
        classifier: estimator class (built with its defaults) or a ready
            estimator instance, which is used directly in the pipeline.
        outpath: optional path; when truthy the final model is pickled there.
        verbose: print progress and timing information.

    Returns:
        The pipeline fitted on all data with the label encoder attached as
        ``labels_``.
    """

    @timeit
    def build(classifier, X, y=None):
        """Fit a single pipeline; the ``timeit`` wrapper's result is unpacked
        by the callers below as ``(model, seconds)``."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        #print ('TfidfVectorizer',TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
        stages = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(stages)
        pipeline.fit(X, y)
        print('model created')
        return pipeline

    # Label encode the targets
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Stage 1: hold-out evaluation on a 20% test split.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, fit_seconds = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(fit_seconds))
    if verbose:
        print("Classification Report:\n")

    # Evaluating the model
    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=encoder.classes_))

    # Stage 2: refit on everything.
    if verbose:
        print("Building complete model and saving ...")
    model, fit_seconds = build(classifier, X, y)
    model.labels_ = encoder

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(fit_seconds))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
def build_and_evaluate(X, y, n=None, classifier=LogisticRegressionCV, outpath=None, verbose=True, multiclass=False):
    """
    Fit a TF-IDF + classifier pipeline, print evaluation metrics and return
    the fitted model together with its predictions.

    X: the documents. They go straight into a ``TfidfVectorizer`` whose
        tokenizer is ``identity`` and whose preprocessor is disabled, so
        presumably each document is already a token list -- confirm
        against the caller.
    y: the target labels; label encoded before fitting.
    n: forwarded to the train/test split as ``test_size``. When falsy no
        split is made: the model is fitted on, and evaluated against, the
        complete data set.
    classifier: an estimator class or instance. A class is instantiated
        with ``cv=10, random_state=0, max_iter=1000, solver='newton-cg'``
        (plus ``multi_class="multinomial"`` when ``multiclass`` is true);
        an instance is used as-is.
    outpath: when truthy, the fitted model is pickled to this path.
    verbose: when truthy, progress messages are printed. The metric
        print-outs themselves are not gated by this flag.
    multiclass: see ``classifier``.

    Returns ``(model, y_pred, idx_test)``. ``model`` carries the fitted
    ``LabelEncoder`` as ``labels_``. ``y_pred`` holds the predictions for
    the evaluated rows. ``idx_test`` holds the positions of the held-out
    rows within the original ``X``/``y``, or ``None`` when no split was
    made.
    """

    @timeit
    def build(classifier, X, y=None):
        """Inner build function that builds a single model."""
        if isinstance(classifier, type):
            if multiclass:
                classifier = classifier(cv=10, random_state=0, max_iter=1000,
                                        solver='newton-cg',
                                        multi_class="multinomial")
            else:
                classifier = classifier(cv=10, random_state=0, max_iter=1000,
                                        solver='newton-cg')

        # NOTE(review): the vectorizer settings are hard-coded; presumably
        # they were tuned offline -- confirm before reusing on other data.
        vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None,
                                     lowercase=False, ngram_range=(1, 2),
                                     max_features=13500, max_df=0.85,
                                     min_df=23)
        model = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Stays None when no hold-out split is made, so the return statement
    # is well defined on both paths.
    idx_test = None

    if n:
        if verbose:
            print("splitting test and test set by: " + str(n))
        # Split an index vector alongside the data so the caller can map
        # predictions back to rows of the original input.
        indices = np.arange(len(y))
        X_train, X_test, y_train, y_test, _idx_train, idx_test = tts(
            X, y, indices, test_size=n, stratify=y)

        from collections import Counter
        print('y_train', Counter(y_train))

        # The call sites unpack the decorated build() as (model, seconds).
        model, secs = build(classifier, X_train, y_train)
        model.labels_ = labels
        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))

        y_pred = model.predict(X_test)
        if verbose:
            print("Classification Report:\n")
        print(clsr(y_test, y_pred, target_names=labels.classes_))
        print(cm(y_test, y_pred))
        print('acc', accuracy_score(y_test, y_pred))
        print('f1', f1_score(y_test, y_pred, average='weighted'))

        if verbose:
            print("Evaluation of naive prediction ...")
        # Baseline: always predict the class encoded as 0.
        y_naive = [0] * len(y_test)
        print(type(y_test))
        print('acc naive', accuracy_score(y_test, y_naive))
    else:
        if verbose:
            print("Building for evaluation with full set")
        model, secs = build(classifier, X, y)
        model.labels_ = labels
        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))

        # No held-out data here: these scores are measured on the rows
        # the model was trained on.
        y_pred = model.predict(X)
        if verbose:
            print("Classification Report:\n")
        print(clsr(y, y_pred, target_names=labels.classes_))
        print(cm(y, y_pred))
        print(accuracy_score(y, y_pred))

        if verbose:
            print("Evaluation of naive prediction ...")
        y_naive = [0] * len(y)
        print(type(y))
        print('acc naive', accuracy_score(y, y_naive))

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model, y_pred, idx_test
def build_and_evaluate(body, tag, outpath=None):
    """
    Fit a Multinomial Naive Bayes text pipeline, tune it with a grid
    search and return the best estimator found.

    body: the documents; passed to the pipeline unchanged.
    tag: the target labels; label encoded before fitting.
    outpath: when truthy, the returned model is pickled to this path.

    A baseline pipeline is fitted on a train split and reported on the test
    split. A 5-fold grid search over ``sublinear_tf``, ``min_df`` and the
    classifier's ``alpha`` is then run on the same train split, and the
    winning estimator is reported on the same test split.

    Returns the grid search's best estimator with the fitted
    ``LabelEncoder`` attached as ``labels_``.
    """
    import pickle  # local import: only needed when outpath is given

    def build(body, tag):
        """Fit the baseline (untuned) pipeline."""
        model = Pipeline([
            ('preprocessor', Preprocessor()),
            ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=5,
                                           norm='l2', tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('classifier', MultinomialNB()),
        ])
        model.fit(body, tag)
        return model

    print("Encoding labels. . .")
    labels = LabelEncoder()
    tags = labels.fit_transform(tag)

    print("Spitting training and testing data set . . .")
    body_train, body_test, tag_train, tag_test = tts(body, tags)

    print("Building model . . .")
    model = build(body_train, tag_train)

    print("Classification report:\n")
    y_prediction = model.predict(body_test)
    print(clsr(tag_test, y_prediction, target_names=labels.classes_))

    # Cross-validated grid search to see whether the baseline can be
    # improved. Keys address pipeline steps as <step>__<parameter>.
    paramaters = {
        'vectorizer__sublinear_tf': (True, False),
        'vectorizer__min_df': (4, 5, 7, 10),
        'classifier__alpha': (0.5, 0.2, 0.1, 0.01, 0.001),
    }
    grid_search = GridSearchCV(model, paramaters, cv=5, n_jobs=-1)
    grid_search.fit(body_train, tag_train)

    # The best estimator is the one the search refit on the training split
    # (it is NOT refit on the full data set here); re-check it on the
    # held-out split.
    model = grid_search.best_estimator_
    model.labels_ = labels
    tuned_prediction = grid_search.predict(body_test)
    print(clsr(tag_test, tuned_prediction, target_names=labels.classes_))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
'J': wn.ADJ }.get(tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag) TRAIN_DATA = "C:\\Users\\Shobha Rani\\Desktop\\shobha\\NLP_544\\Project\\Data\\Train" TEST_DATA = "C:\\Users\\Shobha Rani\\Desktop\\shobha\\NLP_544\\Project\\Data\\Test" train = load_files(TRAIN_DATA) X_train = train.data y_train = train.target test = load_files(TEST_DATA) X_test = test.data y_test = test.target model = Pipeline([ ('preprocessor', NLTKPreprocessor()), ('vectorizer', TfidfVectorizer(decode_error='ignore', tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2))), ('classifier', RandomForestClassifier(n_estimators=100)), ]) model.fit(X_train, y_train) print("Classification Report:\n") y_pred = model.predict(X_test) print(clsr(y_test, y_pred, target_names=test.target_names)) print("COMPLETE")
def build_and_evaluate(X_train, y_train, X_test, y_test, X_val, y_val, args, classifier=svm.LinearSVC(), parameters=None):
    """
    Train and evaluate ``classifier`` on message features, first as a
    three-level hierarchy of binary classifiers and then as one flat
    multi-class classifier. Nothing is returned; results are printed and
    written to files.

    Steps, in order:
      1. Encode the labels with ``generate_labels`` and load four pickled
         dependency structures from paths prefixed with ``args.data``.
      2. Fit a TF-IDF vectorizer on the training messages and build the
         training and validation feature matrices with
         ``generate_features``.
      3. Depending on ``args``: export features via ``sk_to_weka``, run a
         grid search, densify the matrices, cross-validate.
      4. Hierarchy, scored on the validation set: general vs. the rest,
         then (td_event, td_non_event) vs. (emergency, urgent), then each
         of those pairs. Each level is trained on the true training labels
         but evaluated only on the rows the previous level routed to it.
      5. Flat classifier: fit on all training rows, then write the
         confusion matrix and report to ``*_cfm.tsv`` / ``*_scores.tsv``.

    X_train, X_test, X_val: indexed with ``["Message"]`` to obtain the
        message texts.
    y_train, y_test, y_val: raw labels, encoded by ``generate_labels``. The
        encoded vectors are compared and mask-indexed element-wise, so they
        are assumed to be numpy arrays -- confirm against
        ``generate_labels``.
    args: options object; the attributes read are ``data``, ``to_weka``,
        ``gridsearch``, ``clf``, ``cv``, ``save``, ``level``,
        ``with_graph`` and ``roc``.
    classifier: the estimator to train. NOTE: the default is one
        ``LinearSVC`` instance created when the function is defined, and
        this function refits it in place several times.
    parameters: parameter grid, used only when ``args.gridsearch`` is set.
    """

    def load_pickle(file_name):
        """Load one pickled object from ``args.data + file_name``."""
        # NOTE: unpickling runs arbitrary code; only point args.data at
        # files from a trusted source.
        with open(args.data + file_name, "rb") as handle:
            return pickle.load(handle)

    def tabbed_report(y_true, y_hat):
        """Classification report with runs of whitespace turned into tabs."""
        rows = [re.sub(r'\s\s+', '\t', row)
                for row in clsr(y_true, y_hat).split("\n")]
        rows[-2] = '\t' + rows[-2]
        return '\n'.join(rows)

    def print_scores(y_true, y_hat):
        """Print the confusion matrix followed by the tabbed report."""
        print(confusion_matrix(y_true, y_hat))
        print(tabbed_report(y_true, y_hat))

    def is_class(encoded, class_ids):
        """0/1 vector marking the entries of ``encoded`` found in ``class_ids``."""
        return np.isin(encoded, class_ids).astype(int)

    labels, y_train, y_test, y_val = generate_labels(y_train, y_test, y_val)

    train_dependency_relations = load_pickle("train_dependency_rel.p")
    test_dependency_relations = load_pickle("test_dependency_rel.p")
    train_dependency_tree = load_pickle("train_dependency_tree.p")
    test_dependency_tree = load_pickle("test_dependency_tree.p")

    Xtr, Xte, ytr = X_train, X_test, y_train

    prep = NLTKPreprocessor(stem=True)
    vect = TfidfVectorizer(preprocessor=prep, lowercase=True,
                           stop_words=None, ngram_range=(1, 2))
    vect.fit_transform(Xtr["Message"])

    X_tr_mat, f_tr_labels = generate_features(
        vect, Xtr["Message"], train_dependency_tree,
        train_dependency_relations, args.data + 'train')
    # The validation matrix is built from the "test" dependency files; no
    # feature matrix is built for X_test in this function.
    X_val_mat, _ = generate_features(
        vect, X_val["Message"], test_dependency_tree,
        test_dependency_relations, args.data + 'test')

    # Only filled by the Weka export; the plotting step at the end computes
    # it on demand when the export was skipped.
    vectors_val = None

    # write train to weka for feature analysis
    if args.to_weka:
        vectors_train = np.array(
            gen_msg_features(Xtr["Message"], args.data + 'train'))
        vectors_test = np.array(
            gen_msg_features(Xte["Message"], args.data + 'tune'))
        vectors_val = np.array(
            gen_msg_features(X_val["Message"], args.data + 'test'))
        header_train = f_tr_labels.tolist()
        header_train.append(labels.classes_)
        arff_prefix = '_'.join(labels.classes_)
        sk_to_weka(vectors_train[1], ytr, header_train,
                   arff_prefix + '_train_features.arff')
        sk_to_weka(vectors_test[1], y_test, header_train,
                   arff_prefix + '_tune_features.arff')
        sk_to_weka(vectors_val[1], y_val, header_train,
                   arff_prefix + '_test_features.arff')

    cls = classifier

    if args.gridsearch:
        logger.info("Performing GridSearch on train data")
        clf = GridSearchCV(estimator=cls, param_grid=parameters)
        clf.fit(X_tr_mat, ytr)
        best_parameters = clf.best_estimator_.get_params()
        # The best parameters are only printed; cls itself is not changed.
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

    if args.clf >= 4:
        # Only the matrices that actually exist are densified.
        X_tr_mat = X_tr_mat.toarray()
        X_val_mat = X_val_mat.toarray()

    if args.cv != 0:
        cross_validate(cls, X_tr_mat, ytr, args)

    # ---- Three level hierarchy classifiers ----
    # Encoded ids are looked up once instead of decoding every label.
    general_id = labels.transform(['general'])[0]
    td_ids = labels.transform(['td_event', 'td_non_event'])
    td_event_id, td_non_event_id = td_ids
    emergency_id, urgent_id = labels.transform(['emergency', 'urgent'])

    # Level 1: general vs. non-general
    ytr_gen = is_class(ytr, general_id)
    y_val_gen = is_class(y_val, general_id)
    print("Classifying general cases : \n\n")
    X_tr_mat_o, ytr_gen_o = imbalance_sampling(X_tr_mat, ytr_gen, 0)
    cls.fit(X_tr_mat_o, ytr_gen_o)
    y_pred_gen = cls.predict(X_val_mat)
    # -1 marks rows still to be labelled by the lower levels.
    y_pred_overall = np.where(y_pred_gen == 1, general_id, -1)
    print_scores(y_val_gen, y_pred_gen)
    missclassified(X_val["Message"], y_val_gen, y_pred_gen,
                   "general", "nonGeneral")

    # Level 2: (td_event, td_non_event) vs. (emergency, urgent)
    train_non_gen = ytr_gen == 0
    X_tr_mat_non_gen = X_tr_mat[train_non_gen.nonzero()[0], :]
    ytr_non_gen = is_class(ytr[train_non_gen], td_ids)
    # Validation rows are routed by the level-1 *prediction*, so truly
    # general rows that were missed end up here with target 0.
    val_non_gen = y_pred_gen != 1
    val_non_gen_rows = val_non_gen.nonzero()[0]
    X_val_mat_non_gen = X_val_mat[val_non_gen_rows, :]
    y_val_routed = y_val[val_non_gen]
    y_val_non_gen = is_class(y_val_routed, td_ids)
    print("Classifying non-general cases : \n\n")
    X_tr_mat_non_gen_o, ytr_non_gen_o = imbalance_sampling(
        X_tr_mat_non_gen, ytr_non_gen, 0)
    cls.fit(X_tr_mat_non_gen_o, ytr_non_gen_o)
    y_pred_non_gen = cls.predict(X_val_mat_non_gen)
    print_scores(y_val_non_gen, y_pred_non_gen)

    def classify_pair(title, positive_id, negative_id, routed):
        """Level 3: separate one pair of classes on the rows routed to it."""
        in_pair = np.isin(ytr, [positive_id, negative_id])
        X_pair = X_tr_mat[in_pair.nonzero()[0]]
        y_pair = is_class(ytr[in_pair], positive_id)
        X_routed = X_val_mat_non_gen[routed.nonzero()[0], :]
        y_routed = is_class(y_val_routed[routed], positive_id)
        print(title)
        X_pair_o, y_pair_o = imbalance_sampling(X_pair, y_pair, 0)
        cls.fit(X_pair_o, y_pair_o)
        pair_pred = cls.predict(X_routed)
        # Map the routed rows back to their positions in the validation set.
        y_pred_overall[val_non_gen_rows[routed]] = np.where(
            pair_pred == 1, positive_id, negative_id)
        print_scores(y_routed, pair_pred)

    classify_pair("Classifying td_event and td_non_event : \n\n",
                  td_event_id, td_non_event_id, y_pred_non_gen == 1)
    classify_pair("Classifying emergency and urgent : \n\n",
                  emergency_id, urgent_id, y_pred_non_gen == 0)

    print("Overall classification scores : ")
    print_scores(y_val, y_pred_overall)
    # ---- Three level hierarchy classifiers ends here ----

    # Flat classifier over all classes
    cls.fit(X_tr_mat, ytr)
    y_pred = cls.predict(X_val_mat)
    print(confusion_matrix(y_val, y_pred))
    conf_f_name = '_'.join(labels.classes_) + '_cfm.tsv'
    np.savetxt(conf_f_name, confusion_matrix(y_val, y_pred), delimiter='\t',
               fmt="%2.1d", header='\t'.join(labels.classes_))
    scores = tabbed_report(y_val, y_pred)
    print(scores)
    with open('_'.join(labels.classes_) + '_scores.tsv', 'w') as f:
        f.write('\t' + scores)

    if args.save:
        fitted_model = {'vect': vect, 'cls': cls}
        logger.info("Saving model")
        model_path = ('_'.join(sorted(labels.classes_)) + '_'
                      + str(cls)[0:10] + '.model')
        with open(model_path, 'wb') as handle:
            pickle.dump(fitted_model, handle)

    misclassifications_class(cls, y_pred, X_val_mat, y_val, X_val, labels,
                             args.level, True)

    if args.with_graph:
        if vectors_val is None:
            vectors_val = np.array(
                gen_msg_features(X_val["Message"], args.data + 'test'))
        plot_feat(vectors_val[1], vectors_val[0], labels, y_val,
                  'Average Feature value for each class')

    if args.roc:
        roc_auc(X_val_mat, y_val, cls)
def write_txt(data, name):
    """
    Write the items of ``data`` to the file ``base_dir + name``.

    Every item is converted with ``str`` and the pieces are concatenated
    without any separator. An existing file is overwritten.
    """
    text = ''.join(str(word) for word in data)
    with open(base_dir + name, 'w') as handle:
        handle.write(text)


#preprocessing(inpath=path, name="data.csv", mix=True)
#print("Preprocessing the Test Data")
#preprocessing(inpath=test_path, name="imdb_te.csv", mix=True)
[xtrain, ytrain] = retrieve_data("reviews.csv")
#[xtest, ytest] = retrieve_data("imdb_te.csv")

# Half of the data is held out for testing.
xtrain, xtest, ytrain, ytest = tts(xtrain, ytrain, test_size=0.5)

# Fit the encoder on the training labels only and reuse that mapping for
# the test labels, so both sides agree on which id means which class.
labels = LabelEncoder()
ytrain = labels.fit_transform(ytrain)
ytest = labels.transform(ytest)

print("--------------Vectorizing on Sample Data----------------")
tfidf_vector = tfidf_process(xtrain)
with open(base_dir + "vectorizer", 'wb') as f:
    pickle.dump(tfidf_vector, f)
xtrain_tf = tfidf_vector.transform(xtrain)

print("--------------Vectorizing on Test Data----------------")
xtest_tf = tfidf_vector.transform(xtest)

ypred = stochastic_descent(xtrain_tf, ytrain, xtest_tf)
print(ypred, labels.classes_)
write_txt(ypred, name="output.txt")

print("\nAccuracy Score : ", accuracy_score(ytest, ypred))
print("\nConfusion : \n", confusion_matrix(ytest, ypred))
print("\nCLSR----------------------\n",
      clsr(ytest, ypred, target_names=labels.classes_))
def build_and_evaluate(self, x, y, preprocessor=None, classifier=None, feature_booster=None, outpath=None, verbose=True, test_size=0.2, print_report=True, Kfold_test=True, n_splits=5):
    """
    Evaluate a model built by ``self.build``, then refit it on all data.

    x: the samples. In K-fold mode they are indexed with integer index
        arrays (``x[train_index]``), so a numpy-style container is
        assumed -- confirm against the caller.
    y: the target labels; label encoded before use.
    preprocessor, classifier, feature_booster: forwarded unchanged to
        ``self.build``.
    outpath: when truthy, the final model is pickled to this path and the
        classifier's parameters are written next to it as JSON (same path
        with the extension replaced by ``.json``).
    verbose: when truthy, progress is logged through ``log``.
    test_size: hold-out fraction, used only when ``Kfold_test`` is false.
    print_report: when truthy, the metrics are printed.
    Kfold_test: true for stratified K-fold evaluation, false for a single
        train/test split.
    n_splits: number of folds in K-fold mode.

    Returns ``(model, average_accuracy, std_dev_accuracy)``. ``model`` is
    fitted on all of ``x``/``y`` and carries the fitted ``LabelEncoder``
    as ``labels_``. In hold-out mode the average is the single split's
    accuracy and the standard deviation is ``0.0``.
    """
    import os  # local import: only used to derive the JSON path

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)
    # The metrics below see encoded targets, so the label list handed to
    # them must hold the encoded ids rather than the original class names.
    class_ids = labels.transform(labels.classes_)

    def fit(x_part, y_part):
        """Build one model; the call sites unpack (model, seconds)."""
        return self.build(x=x_part, y=y_part, preprocessor=preprocessor,
                          classifier=classifier,
                          feature_booster=feature_booster)

    def evaluate(y_true, y_hat):
        """Compute every metric reported for one train/test split."""
        report = clsr(y_true, y_hat, target_names=labels.classes_)
        accuracy = accuracy_score(y_true=y_true, y_pred=y_hat)
        f1_s = f1_score(y_true=y_true, y_pred=y_hat, labels=class_ids,
                        average="weighted")
        precision = precision_score(y_true=y_true, y_pred=y_hat,
                                    labels=class_ids, average="weighted")
        recall = recall_score(y_true=y_true, y_pred=y_hat, labels=class_ids,
                              average="weighted")
        confusion_mat = confusion_matrix(y_true=y_true, y_pred=y_hat)
        # NOTE(review): roc_curve is documented for binary targets only;
        # confirm callers never pass more than two classes.
        fpr, tpr, _ = roc_curve(y_true, y_hat)
        area_under_curve = auc(fpr, tpr)
        return (report, accuracy, f1_s, precision, recall, confusion_mat,
                area_under_curve)

    def show(report, accuracy, f1_s, precision, recall, confusion_mat):
        """Print the per-split metrics."""
        print(report)
        print("Accuracy: {}".format(accuracy))
        print("F1-Score: {}".format(f1_s))
        print("Precision: {}".format(precision))
        print("Recall: {}".format(recall))
        print("Confusion-Matrix: ")
        print(confusion_mat)

    # Begin evaluation
    if verbose:
        log.info("Building for evaluation")

    if Kfold_test:
        log.info("Performing K-fold test...")
        # No random_state: it has no effect without shuffling, and current
        # scikit-learn rejects the combination with a ValueError.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
        accuracy_list = []
        f1_list = []
        precision_list = []
        recall_list = []
        for split_count, (train_index, test_index) in enumerate(
                skf.split(x, y), start=1):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model, secs = fit(x_train, y_train)
            if verbose:
                log.info(
                    "Evaluation model fit in {:0.3f} seconds".format(secs))
                log.info("Classification Report for fold-{}:\n".format(
                    split_count))
            y_pred = model.predict(x_test)
            (report, accuracy, f1_s, precision, recall, confusion_mat,
             area_under_curve) = evaluate(y_test, y_pred)
            print("Area under curve:", area_under_curve)
            accuracy_list.append(accuracy)
            f1_list.append(f1_s)
            precision_list.append(precision)
            recall_list.append(recall)
            if print_report:
                show(report, accuracy, f1_s, precision, recall,
                     confusion_mat)

        average_accuracy = np.mean(accuracy_list)
        std_dev_accuracy = np.std(accuracy_list)
        if print_report:
            print("\n************ K-fold results ************")
            print("Average-Accuracy: {}".format(average_accuracy))
            print("Average-F1-Score: {}".format(np.mean(f1_list)))
            print("Average-Precision: {}".format(np.mean(precision_list)))
            print("Average-Recall: {}".format(np.mean(recall_list)))
            print("*******************************************")
    else:
        x_train, x_test, y_train, y_test = tts(x, y, test_size=test_size)
        if verbose:
            log.info("Completed train test split")
        model, secs = fit(x_train, y_train)
        if verbose:
            log.info(
                "Evaluation model fit in {:0.3f} seconds".format(secs))
            log.info("Classification Report:\n")
        y_pred = model.predict(x_test)
        (report, accuracy, f1_s, precision, recall, confusion_mat,
         area_under_curve) = evaluate(y_test, y_pred)
        print(area_under_curve)
        if print_report:
            show(report, accuracy, f1_s, precision, recall, confusion_mat)
        # A single split: its accuracy is the average and there is no
        # spread to report.
        average_accuracy = accuracy
        std_dev_accuracy = 0.0

    if verbose:
        log.info("Building complete model and saving ...")
    model, secs = fit(x, y)
    model.labels_ = labels
    if verbose:
        log.info("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as doc:
            pickle.dump(model, doc)
        log.info("Model written out to {}".format(outpath))

        # splitext only strips the final extension, so directory names
        # containing dots (e.g. "./models/m.pkl") are left intact.
        json_path = os.path.splitext(outpath)[0] + ".json"
        final_classifier = model.named_steps["classifier"]
        model_json = final_classifier.get_params()
        model_json.update({"model_name": str(type(final_classifier))})
        with open(json_path, 'w') as outfile:
            json.dump(model_json, outfile)

    return model, average_accuracy, std_dev_accuracy