def fit(self):
    """Train a TF-IDF + LogisticRegression sentiment classifier from pickled
    reviews/labels and persist the fitted artifacts to 'regression.pickle'."""
    # Reviews and labels were dumped sequentially into the same pickle file,
    # so they are loaded back in the same order.
    with open('rev_lab.pickle', 'rb') as f:
        reviews = pickle.load(f)
        labels = pickle.load(f)
    vectorizer = CountVectorizer(min_df=2, tokenizer=word_tokenize)
    counts = vectorizer.fit_transform(reviews)  # fitted with train data
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(counts)  # the same as vectorizer
    X_train, X_test, y_train, y_test = train_test_split(tf_idf, labels, train_size=0.9, random_state=42)
    classifier = LogisticRegression(C=7, solver='liblinear')
    classifier.fit(X_train, y_train)
    print(accuracy_score(y_test, classifier.predict(X_test)))
    print(recall_score(y_test, classifier.predict(X_test)))
    # Persist the model plus both text transformers so inference can
    # reproduce the exact training-time feature pipeline.
    with open('regression.pickle', 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(vectorizer, f)
        pickle.dump(transformer, f)
def func1():
    """Demo: fit LinearRegression on a synthetic regression set, then
    LogisticRegression / LogisticRegressionCV on a make_moons set."""
    # n_features: number of features (dimensionality of each sample)
    # n_informative: how many of the features are actually informative
    # n_targets: number of regression outputs
    #X,y = datasets.make_regression(n_samples=100, n_features=100, n_informative=10, n_targets=1, noise=0.0, bias=0.0, random_state=None)
    X, y = datasets.make_regression(n_samples=10, n_features=2)
    print('line X ', X)
    print('line y ', y)
    #plt.scatter(X[:,0],y)
    #plt.show()
    lm = LinearRegression()
    lm.fit(X, y)
    X_test = [[0.69803203, 0.62000084]]
    y_predict = lm.predict(X_test)
    print(y_predict)
    X, y = datasets.make_moons(10, noise=0.2)
    print('logis ', X)
    print('logis ', y)
    #X_train,X_test,y_train,y_test = train_test_split(X,y)
    logis_regre = LogisticRegression()
    #y = [[-140.66643209],[114.7982953],[103.11834249],[-177.27466722],[24.48139711],[-30.44916242],[38.96288527],[-57.62121771],[82.14111136],[90.54966151]]
    logis_regre.fit(X, y)
    # NOTE(review): X_test here is reused from the regression demo above, not
    # drawn from the moons distribution — confirm that is intended.
    print(logis_regre.predict(X_test))
    logis_regre = LogisticRegressionCV()
    logis_regre.fit(X, y)
    print(logis_regre.predict(X_test))
def train(dirname1, dirname2, dirname3, dirname4):
    """Fit two logistic-regression classifiers — one on single-character
    features, one on word features — and print their test accuracies.

    Returns (single-feature predictions, word-feature predictions,
    training labels, test labels)."""
    s_train, w_train, y_train = load_train_set(dirname1, dirname2)
    s_test, w_test, y_test = load_train_set(dirname3, dirname4)
    clf_single = LogisticRegression()
    clf_word = LogisticRegression()
    clf_single.fit(s_train, y_train)
    clf_word.fit(w_train, y_train)
    preds_single = clf_single.predict(s_test)
    preds_word = clf_word.predict(w_test)
    n = len(preds_single)
    hits_single = sum(1 for p, g in zip(preds_single, y_test) if p == g)
    hits_word = sum(1 for p, g in zip(preds_word, y_test) if p == g)
    print("the accuracy of classifier for single: " + str(hits_single / n) +
          "\nthe accuracy of classifier for word: " + str(hits_word / n))
    return preds_single, preds_word, y_train, y_test
def lr_training_and_test(X_train, X_test, y_train, y_test):
    """Fit a LogisticRegression on the training split, evaluate it on both
    splits via evaluate_model, and return the fitted model."""
    # Fixed: parenthesized print — the original Python-2 print statement is a
    # SyntaxError on Python 3, which other functions in this file target.
    print('model: logistic regression.')
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_train_pred_prob = model.predict_proba(X_train)[:, 1]
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]
    evaluate_model(y_train, y_train_pred, y_train_pred_prob,
                   y_test, y_test_pred, y_test_pred_prob)
    return model
def answer(test_path):
    """Cluster the sample at *test_path* with KMeans, then map the cluster id
    to a letter with a small LogisticRegression.

    Returns [predicted letter, elapsed seconds]."""
    import warnings
    warnings.filterwarnings("ignore")
    import time
    t0 = time.time()
    from learning import process_test_data, training_data, training_answers
    # Fixed: sklearn.cluster.k_means_ / sklearn.linear_model.logistic are
    # private module paths that were removed in modern scikit-learn.
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    test_data = process_test_data(test_path)
    km = KMeans()
    km.fit(training_data, training_answers)  # y is ignored by KMeans.fit
    myNum = km.predict(test_data).item()
    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = ['a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a',
            'a', 'o', 'a', 'o', 'o', 'o', 'a', 'a', 'o', 'a']
    lr = LogisticRegression()
    # Fixed: y must be 1-D (the original wrapped each label in a list) and
    # predict() expects a 2-D sample array (the original passed a bare scalar).
    lr.fit(numbers, letX)
    ans = lr.predict([[myNum]]).item()
    t1 = time.time()
    return [ans, t1 - t0]
def mlogistic():
    """Toy TF-IDF + LogisticRegression demo: train on the first three
    sentences, predict labels for the last two."""
    X = []
    # First three entries are training samples.
    X.append("f**k you")
    X.append("f**k you all")
    X.append("hello everyone")
    # Last two entries are test samples.
    X.append("f**k me")
    X.append("hello boy")
    # y holds the labels of the three training samples.
    y = [1, 1, 0]
    vectorizer = TfidfVectorizer()
    # Fit TF-IDF on the training sentences only.
    X_train = vectorizer.fit_transform(X[:-2])
    # Fixed: parenthesized prints — the Python-2 print statements fail on
    # Python 3, which other functions in this file target.
    print(X_train)
    # Transform the test sentences with the vocabulary learned above.
    X_test = vectorizer.transform(X[-2:])
    print(X_test)
    # Fit logistic regression on the training vectors.
    classifier = LogisticRegression()
    classifier.fit(X_train, y)
    # Predict the test samples.
    predictions = classifier.predict(X_test)
    print(predictions)
def score(id):
    """Evaluate a LogisticRegression over 10 random train/test splits of the
    CSV at *id*; return [auc, acc, f1], each holding the 10 per-split scores
    followed by their mean.

    Each CSV row is feature columns followed by the label in the last column.
    """
    data = []
    mark = []
    # utf-8_sig strips a possible BOM written by Excel.
    with open(id, 'r', encoding='utf-8_sig') as f:
        csv_reader = csv.reader(f)
        for x in csv_reader:
            data.append(list(map(float, x[0:-1])))
            mark.append(float(x[-1]))
    acc = []
    auc = []
    f1 = []
    for i in range(10):
        # random_state=i: a different but reproducible split each round.
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            data, mark, test_size=0.05, random_state=i)
        clf = LogisticRegression(C=4.8, random_state=1113)
        clf.fit(X_train, y_train)
        acc.append(round(clf.score(X_test, y_test), 3))
        y_pred = clf.predict(X_test)
        # NOTE(review): ROC-AUC is computed from hard predictions rather than
        # predict_proba scores — confirm that is intended.
        auc.append(round(metrics.roc_auc_score(y_test, y_pred), 3))
        f1.append(round(metrics.f1_score(y_test, y_pred), 3))
    # Append the mean of the 10 rounds as the final element of each list.
    acc.append(round(sum(acc) / len(acc), 3))
    auc.append(round(sum(auc) / len(auc), 3))
    f1.append(round(sum(f1) / len(f1), 3))
    return [auc, acc, f1]
def run_logistic_regression_multiclass_classification(train, train_labels, validate, validate_labels):
    """Train a logistic-regression classifier on the training split and
    return its accuracy on the validation split."""
    model = LogisticRegression()
    model.fit(train, train_labels)
    validation_predictions = model.predict(validate)
    return metrics.accuracy_score(validate_labels, validation_predictions)
def logic_pca_standard(y, n):
    # Logistic regression + PCA dimensionality reduction + standardisation.
    # Reduce the feature matrix to n principal components.
    pa = PCA(n_components=n)
    # NOTE(review): `train` is read from enclosing/module scope — confirm it
    # is the intended feature matrix.
    data = pa.fit_transform(train)
    # Split the data.
    x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=24)
    # Standardise: fit on the training split only, then apply to the test split.
    std = StandardScaler()
    print(std)
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # Score on the held-out split.
    pre_score = logic.score(x_test, y_test)
    print("准确率(逻辑回归+降维+标准化):{}".format(pre_score))
    print(
        "精确率和召回率:",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    # Predicted probabilities of the positive class feed the ROC curve.
    predictions = logic.predict_proba(x_test)
    # Compute Receiver operating characteristic (ROC)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
class LogisticRegressionImpl():
    """Thin wrapper that delegates to a scikit-learn LogisticRegression
    (SKLModel) configured from the constructor's hyperparameters."""

    def __init__(self, penalty='l2', dual=False, tol=0.0001, C=1.0,
                 fit_intercept=True, intercept_scaling=1,
                 class_weight='balanced', random_state=None,
                 solver='liblinear', max_iter=100, multi_class='ovr',
                 verbose=0, warm_start=False, n_jobs=None):
        # Keep the full hyperparameter set around for inspection; the wrapped
        # estimator is built directly from it.
        self._hyperparams = dict(
            penalty=penalty, dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state,
            solver=solver, max_iter=max_iter, multi_class=multi_class,
            verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's predict."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's predict_proba."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's decision_function."""
        return self._wrapped_model.decision_function(X)
def method(self):
    """Fit a LogisticRegression on this object's data, report training-set
    accuracy, and store the fitted model on self.model."""
    features, targets = self.split()
    classifier = LogisticRegression()
    classifier.fit(features, targets)
    predictions = classifier.predict(features)
    print('Train data accuracy:', accuracy_score(targets, predictions))
    self.model = classifier
def lrModel(xtrain, xtest, y):
    """Fit logistic regression on (xtrain, y) and return predictions for xtest."""
    regressor = LogisticRegression()
    regressor.fit(xtrain, y)
    return regressor.predict(xtest)
def Logistic_Regression_cls(preprocessing='PCA', pre_kernel='rbf', plot_result=False):
    """Classify the iris data with multinomial logistic regression after an
    optional preprocessing step ('PCA', 'KPCA', 'LDA', or 'None').

    Returns (predict, accuracy), or None for an unknown preprocessing name.
    """
    if preprocessing == 'PCA':
        X, y = use_PCA('iris_data.txt')
    elif preprocessing == 'KPCA':
        X, y = use_KPCA('iris_data.txt', kernel=pre_kernel)
    elif preprocessing == 'LDA':
        X, y = use_LDA('iris_data.txt')
    elif preprocessing == 'None':
        loader = datasets.load_iris()
        X, y = loader['data'], loader['target']
    else:
        print(
            'Please choose a data preprocessing method from the following method:\n'
        )
        print('1.PCA, 2.KPCA, 3.LDA, 4.None')
        return
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)
    classifier = LogisticRegression(multi_class="multinomial", solver="newton-cg")
    classifier.fit(X_train, y_train)
    predict = classifier.predict(X_test)
    # Count correct test predictions by hand.
    total = np.size(y_test)
    correct = 0
    for index, label in enumerate(y_test):
        if predict[index] == label:
            correct += 1
    accuracy = correct / total
    print("正确样本数为{}, 正确率为{:.4f}".format(correct, accuracy))
    # Side-by-side scatter of true labels vs predicted labels; only sensible
    # on 2-D projected data, hence the preprocessing != 'None' guard.
    if plot_result and preprocessing != 'None':
        fig1 = plt.subplot(1, 2, 1)
        fig1.set_title('raw data with label')
        for idx, y in enumerate(y_test):
            if y == 0:
                fig1.scatter(X_test[idx][0], X_test[idx][1], c='r')
            if y == 1:
                fig1.scatter(X_test[idx][0], X_test[idx][1], c='g')
            if y == 2:
                fig1.scatter(X_test[idx][0], X_test[idx][1], c='b')
        fig2 = plt.subplot(1, 2, 2)
        fig2.set_title('classification result')
        for idx, label in enumerate(predict):
            if label == 0:
                fig2.scatter(X_test[idx][0], X_test[idx][1], c='r')
            if label == 1:
                fig2.scatter(X_test[idx][0], X_test[idx][1], c='g')
            if label == 2:
                fig2.scatter(X_test[idx][0], X_test[idx][1], c='b')
        plt.show()
    return predict, accuracy
def lr_model(self):
    """Train/evaluate a logistic-regression model on this object's splits and
    record its accuracy in self.model_dict under the key 'lr'."""
    clf = LogisticRegression()
    clf.fit(self.x_train, self.y_train)
    predicted = clf.predict(self.x_test)
    score = accuracy_score(self.y_test, predicted)
    print("Accuracy Score for LR Model :", score)
    self.model_dict.update({'lr': score})
def main():
    """Cross-validate a LogisticRegression over CWI annotations, sampling a
    random annotator's label per instance each fold, and print mean/std of
    accuracy, precision, recall and the task F1."""
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir + "/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--iterations', type=int, default=5)
    args = parser.parse_args()
    all_feats = []
    all_labels = defaultdict(list)
    scores = defaultdict(list)
    # Collect all 20 annotators' labels per instance index.
    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
        current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + idx + ".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann, vectorize=False, generateFeatures=False)
        for instance_index, l in enumerate(labels_current):
            all_labels[instance_index].append(l)
    # Features are generated once, from annotator 01's file.
    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_01.lbl.conll"
    feats, labels_current, v_current = feats_and_classify.collect_features(current_single_ann, vectorize=True, generateFeatures=True)
    for it in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')
            TrainX_i = feats[TrainIndices]
            # Draw a random annotator's label for every instance.
            Trainy_i = [all_labels[x][random.randrange(0, 20)] for x in TrainIndices]
            TestX_i = feats[TestIndices]
            Testy_i = [all_labels[x][random.randrange(0, 20)] for x in TestIndices]
            maxent.fit(TrainX_i, Trainy_i)
            ypred_i = maxent.predict(TestX_i)
            # Fixed: sklearn metrics take (y_true, y_pred) in that order; the
            # original passed the predictions as y_true, silently swapping
            # precision and recall.
            acc = accuracy_score(Testy_i, ypred_i)
            pre = precision_score(Testy_i, ypred_i)
            rec = recall_score(Testy_i, ypred_i)
            # shared task uses f1 of *accuracy* and recall!
            # Fixed: guard the harmonic mean against a zero denominator.
            f1 = 2 * acc * rec / (acc + rec) if (acc + rec) else 0.0
            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)
    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    sys.exit(0)
def lg(X, y, model_path):
    """Fit a LogisticRegression on (X, y), print its training-set report and
    confusion matrix, and persist the model to model_path via joblib."""
    model = LogisticRegression()
    model.fit(X, y)
    print(model)
    expected, predicted = y, model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
def classify_logistic(train_features, train_labels, test_features):
    """Fit logistic regression, optionally pickle the model (when SAVE is set
    and TEST is not), and return predictions for test_features."""
    global SAVE
    model = LogisticRegression()
    model.fit(train_features, train_labels)
    if SAVE and not TEST:
        save_pickle("logistic", model)
    return model.predict(test_features)
def LRClassifier(data, y):
    """Split (data, y), fit a LogisticRegression, print each prediction next
    to the true label, and report the percentage classified correctly."""
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(data, y)
    model = LogisticRegression(fit_intercept=True, intercept_scaling=0.0001)
    model.fit(X_train_raw, y_train)
    predictions = model.predict(X_test_raw)
    correct = 0
    for predicted, actual in zip(predictions, y_test):
        print('prediction:%s. ActualY:%s' % (predicted, actual))
        if actual == predicted:
            correct += 1
    print("The correction rate: ", (correct / len(y_test)) * 100, "%")
def logistic_regression(X_train, Y_train, X_test, Y_test):
    '''
    train a logistic Regression model, and test its accuracy on some given
    test data

    Returns (accuracy, fitted_classifier).
    '''
    # Fixed: sklearn.linear_model.logistic is a private module path that was
    # removed in scikit-learn 0.24; import from the public package instead.
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(X_train, Y_train)
    LR_pred = classifier.predict(X_test)
    # Mean of the boolean equality array == fraction of correct predictions.
    LR_acc = np.mean(LR_pred == Y_test)
    return LR_acc, classifier
def spamRecog(descr):
    """Train a TF-IDF + LogisticRegression spam classifier on the SMS corpus
    and return the predicted label array for the single message *descr*."""
    df = pd.read_csv('./SMSSpamCollection.csv', delimiter='\t', header=None)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(X_train_raw)
    model = LogisticRegression()
    model.fit(train_matrix, y_train)
    query = vectorizer.transform([descr])
    return model.predict(query)
def test_liblinear_decision_function_zero():
    # Test negative prediction when decision_function values are zero.
    # Liblinear predicts the positive class when decision_function values
    # are zero. This is a test to verify that we do not do the same.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    # Fixed: pin random_state so the synthetic data (and hence the test) is
    # deterministic, matching the other copy of this test in the file.
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)
    # Dummy data such that the decision function becomes zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
def Logistic_Regression(x_train, y_train, x_test, y_test):
    """Fit logistic regression (labels coerced to int), then print and return
    the validation accuracy."""
    model = LogisticRegression()
    model.fit(x_train, y_train.astype('int'))
    y_predict = model.predict(x_test)
    total = len(y_predict)
    right = sum(1 for pred, gold in zip(y_predict, y_test) if pred == gold)
    acc = float(right / total)
    print('Logistic Regression val accuarcy: ' + str(acc))
    return acc
def test_liblinear_decision_function_zero():
    # Regression test: liblinear maps a zero decision_function value to the
    # positive class; this implementation must predict the negative class.
    # See https://github.com/scikit-learn/scikit-learn/issues/3600 and
    # https://github.com/scikit-learn/scikit-learn/pull/3623
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)
    # All-zero inputs with no intercept force the decision function to zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
def crossval(features, labels, variant):
    """10-fold cross-validate a LogisticRegression against a most-frequent
    DummyClassifier baseline; print summary scores, a classification report
    and a per-label confusion matrix."""
    maxent = LogisticRegression(penalty='l2')
    dummyclass = DummyClassifier("most_frequent")
    scores = defaultdict(list)
    preds = []
    dummypreds = []
    # Gold labels re-ordered to match the shuffled fold order of `preds`.
    shuffled_gold = []
    for TrainIndices, TestIndices in cross_validation.KFold(
            n=features.shape[0], n_folds=10, shuffle=True):
        # print(TestIndices)
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]
        shuffled_gold.extend(Testy_i)
        dummyclass.fit(TrainX_i, Trainy_i)
        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        ydummypred_i = dummyclass.predict(TestX_i)
        dummypreds.extend(ydummypred_i)
        acc = accuracy_score(y_true=Testy_i, y_pred=ypred_i)
        f1 = f1_score(y_true=Testy_i, y_pred=ypred_i)
        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        # NOTE(review): "Recall" is fed the *accuracy* value here — confirm
        # whether that is intentional.
        scores["Recall"].append(acc)
        scores["Accuracy_dummy"].append(
            accuracy_score(y_true=Testy_i, y_pred=ydummypred_i))
        scores["F1_dummy"].append(f1_score(y_true=Testy_i, y_pred=ydummypred_i))
        preds.extend(ypred_i)
    print("summary %s %.3f %.3f %.3f %.3f" %
          (variant, np.array(scores["Accuracy"]).mean(), np.array(
              scores["F1"]).mean(), np.array(scores["Accuracy_dummy"]).mean(),
           np.array(scores["F1_dummy"]).mean()))
    print(classification_report(y_pred=preds, y_true=shuffled_gold))
    labels_to_print = sorted(set(shuffled_gold))
    CM = confusion_matrix(y_pred=preds, y_true=shuffled_gold,
                          labels=labels_to_print)
    print(sorted(set(shuffled_gold)))
    # One confusion-matrix row per label, tab-separated.
    for l, r in zip(labels_to_print, CM):
        print(l, "\t".join([str(x) for x in r]))
    scores = None
class DetectMalicious():
    """Detects malicious JavaScript via logistic regression over the presence
    of known JS API names in the input source."""

    def __init__(self):
        # Each CSV row: feature columns followed by the class label.
        benign_data = np.loadtxt('javascript-collection/benignjs.csv', delimiter=',', dtype=np.int32)
        evil_data = np.loadtxt('javascript-collection/eviljs.csv', delimiter=',', dtype=np.int32)
        train_data = np.concatenate((benign_data, evil_data), axis=0)
        self.score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
        self.D = LogisticRegression()
        # Last column is the label; everything before it is a feature.
        self.D.fit(train_data[:, :-1], train_data[:, -1])
        # Known JS API names, one per line.
        self.jsapi = []
        for line in open('javascript-collection/jsapi.txt'):
            self.jsapi.append(line.strip('\n'))

    def predict(self, X):
        """Return the model's prediction for the JavaScript source string X."""
        # Build the binary presence vector over the known API names.
        flag = [0 for x in range(len(self.jsapi))]
        for i in range(len(self.jsapi)):
            if X.find(self.jsapi[i]) != -1:
                flag[i] = 1
        # Fixed: parenthesized print — the Python-2 print statement is a
        # SyntaxError on Python 3.
        print(self.D.predict([flag]))
        return self.D.predict([flag])
def main():
    """Pool all 20 annotators' labels, vectorize features once, and report
    10-fold cross-validated metrics for an L2 LogisticRegression."""
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir + "/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    args = parser.parse_args()
    all_feats = []
    all_labels = []
    scores = defaultdict(list)
    # Every annotator's file contributes its instances to one pooled set.
    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
        current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + idx + ".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann, vectorize=False)
        all_feats.extend(f_current)
        all_labels.extend(labels_current)
    feats = DictVectorizer().fit_transform(all_feats).toarray()
    all_labels = np.asarray(all_labels)
    for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
        maxent = LogisticRegression(penalty='l2')
        TrainX_i = feats[TrainIndices]
        Trainy_i = all_labels[TrainIndices]
        TestX_i = feats[TestIndices]
        Testy_i = all_labels[TestIndices]
        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        # Fixed: sklearn metrics take (y_true, y_pred); the original passed the
        # predictions first, which silently swaps precision and recall.
        acc = accuracy_score(Testy_i, ypred_i)
        pre = precision_score(Testy_i, ypred_i)
        rec = recall_score(Testy_i, ypred_i)
        # shared task uses f1 of *accuracy* and recall!
        # Fixed: guard the harmonic mean against a zero denominator.
        f1 = 2 * acc * rec / (acc + rec) if (acc + rec) else 0.0
        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)
    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    sys.exit(0)
def classify(self, input_file, output_file):
    """Train a TF-IDF + LogisticRegression model over answer bodies (label =
    sign of the score, clamped to {0, 1}), write the test predictions to
    output_file, and print the held-out accuracy."""
    df = pd.read_csv(input_file)
    df.columns = ['Id', 'OwnerUserId', 'CreationDate', 'ParentId', 'Score', 'IsAcceptedAnswer', 'Body']
    df['Class'] = np.sign(df['Score'])
    print("Whole size: " + str(len(df)))
    stemmer = SnowballStemmer("english")  # hoisted: invariant across rows
    i = 0
    while i < len(df['Body']):
        # Clamp negative/zero classes to 0 (binary target).
        if df['Class'][i] <= 0:
            df['Class'][i] = 0
        # Strip HTML-ish tags, then stem every token.
        change = re.sub(r"(</.*>)", "", df['Body'][i])
        change = re.sub(r"(<.*>)", "", change)
        splitted = re.split('\W+', change)
        singles = [stemmer.stem(word) for word in splitted]
        df['Body'][i] = " ".join(singles)
        i = i + 1
    coef_test = 0.1
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df['Body'], df['Class'], test_size=coef_test)
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    X_test = vectorizer.transform(X_test_raw)
    predictions = classifier.predict(X_test)
    # Fixed: accuracy was computed against the *last* rows of the original
    # frame (X_lately), but train_test_split shuffles — compare against the
    # actual held-out labels instead.
    y_true = y_test.values
    cnt_right = 0
    with open(output_file, 'w') as out:
        for pred, actual in zip(predictions, y_true):
            out.write(str(pred))
            if pred == actual:
                cnt_right += 1
    right_predictions = cnt_right / len(predictions)
    # Fixed: the value is a fraction; scale to a percentage before the "%".
    print("Accuracy of predictions: " + str(right_predictions * 100) + "%")
def clazzify(train_mat, test_mat, true_train_labels):
    """Fit an L1-regularised LogisticRegression on the training matrix and
    return (predicted_test_labels, fitted_model)."""
    logging.info('learning...')
    model = LogisticRegression(random_state=17, penalty='l1')
    model.fit(train_mat, true_train_labels)
    logging.info('finished learning.')
    logging.info('testing')
    predicted = model.predict(test_mat)
    logging.info('finished testing')
    return predicted, model
def answer(test_path):
    """Predict the label of the sample at *test_path* with a logistic
    regression fit on the module's training data.

    Returns [prediction, elapsed seconds]."""
    import time
    t0 = time.time()
    from learning import process_test_data, training_data, training_answers
    # Fixed: sklearn.linear_model.logistic is a private module path that was
    # removed in scikit-learn 0.24; use the public import location.
    from sklearn.linear_model import LogisticRegression
    test_data = process_test_data(test_path)
    lr = LogisticRegression()
    lr.fit(training_data, training_answers)
    ans = lr.predict(test_data).item()
    t1 = time.time()
    return [ans, t1 - t0]
class LogisticClassifier(Model):
    """Multi-label logistic classifier class."""

    def __init__(self, epochs):
        super(LogisticClassifier, self).__init__("logistic regression")
        # max_iter bounds the solver's iterations ("epochs").
        self.max_epochs = epochs
        self.lr = LogisticRegression(max_iter=epochs)

    def train(self, train_x, train_y):
        """Fit the underlying LogisticRegression on the training data."""
        # Fixed: parenthesized prints — Python-2 print statements are a
        # SyntaxError on Python 3.
        print("Training {} model.......".format(self.name))
        self.lr.fit(train_x, train_y)
        print("Training complete!!")

    def test(self, test_x):
        """Return predictions for the test features."""
        test_y = self.lr.predict(test_x)
        print("Successfully generated predictions for test data.")
        return test_y
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples, n_features = iris.data.shape
    # Use the string class names as targets to exercise label handling.
    target = iris.target_names[iris.target]
    # Large C ~= negligible regularisation, so training accuracy should be high.
    clf = LogisticRegression(C=len(iris.data)).fit(iris.data, target)
    assert_array_equal(np.unique(target), clf.classes_)
    pred = clf.predict(iris.data)
    assert_greater(np.mean(pred == target), .95)
    probabilities = clf.predict_proba(iris.data)
    # Each row of predict_proba must sum to one.
    assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples))
    # The argmax of the probabilities must agree with predict().
    pred = iris.target_names[probabilities.argmax(axis=1)]
    assert_greater(np.mean(pred == target), .95)
def train_and_predics(featuredicts, labels, trainsize):
    """Vectorize the first *trainsize* feature dicts, fit an L2 logistic
    regression, and return tab-separated 'label<TAB>prob...' lines for the
    remaining instances."""
    vec = DictVectorizer()
    y_train = labels[:trainsize]
    X_train = vec.fit_transform(featuredicts[:trainsize])
    X_test = vec.transform(featuredicts[trainsize:])
    maxent = LogisticRegression(penalty='l2')
    maxent.fit(X_train, y_train)
    predictions = []
    # One output line per test instance: predicted label then per-class probs.
    for probs, label in zip(maxent.predict_proba(X_test), maxent.predict(X_test)):
        predictions.append("\t".join([label] + ["{0:.2f}".format(p) for p in probs]))
    return predictions
def generate_submission(): global alg, predictions, submission # The columns we'll use to predict the target # Initialize the algorithm class alg = LogisticRegression(random_state=1) # Train the algorithm using all the training data alg.fit(train[predictors], train["Survived"]) # Make predictions using the test set. predictions = alg.predict(test[predictors]) # Create a new dataframe with only the columns Kaggle wants from the dataset. submission = pandas.DataFrame({ "PassengerId": test["PassengerId"], "Survived": predictions }) submission.to_csv("kaggle.csv", index=False) print("kaggele.csv is generated")
def Logistic_Regression(train, test):
    """Fit logistic regression on *train* (last column = label), then print
    and return the accuracy on *test*."""
    x_train, y_train = train[:, :-1], train[:, -1]
    x_test, y_test = test[:, :-1], test[:, -1]
    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    total = len(y_predict)
    right = sum(1 for pred, gold in zip(y_predict, y_test) if pred == gold)
    acc = float(right / total)
    print('Logistic Regression train accuarcy: ' + str(acc))
    return acc
def test_multinomial_binary():
    """Test multinomial LR on a binary problem."""
    # Binary string labels derived from "is setosa or not".
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(iris.data, target)
    # The binary multinomial case keeps a single coefficient row / intercept.
    assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
    assert_equal(clf.intercept_.shape, (1,))
    assert_array_equal(clf.predict(iris.data), target)
    mlr = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                             fit_intercept=False)
    mlr.fit(iris.data, target)
    # NOTE(review): pred is computed from clf, not the just-fitted mlr —
    # confirm whether the no-intercept model was meant to be checked here.
    pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), axis=1)]
    assert_greater(np.mean(pred == target), .9)
class mentoryWEB:
    """TF-IDF + logistic-regression text classifier trained from a
    tab-separated (label, text) file at construction time."""

    def __init__(self, file):
        self.vect = TfidfVectorizer(max_df=0.25, stop_words=None,
                                    max_features=2500, ngram_range=(1, 2),
                                    use_idf=True, norm='l2')
        corpus = pd.read_csv(file, delimiter='\t', header=None)
        # Column 1 holds the raw text, column 0 the label.
        raw_texts, train_labels = corpus[1], corpus[0]
        train_matrix = self.vect.fit_transform(raw_texts)
        self.clf = LogisticRegression(penalty='l2', C=10)
        self.clf.fit(train_matrix, train_labels)

    def test(self, string):
        """Return the predicted label for a single input string."""
        features = self.vect.transform([string])
        return self.clf.predict(features)[0]
def LogisticRegressionSMSFilteringExample():
    """SMS spam-filtering demo: TF-IDF features + logistic regression, then
    print the first five test messages with their predicted labels."""
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Fixed: sklearn.linear_model.logistic and sklearn.cross_validation were
    # removed from scikit-learn; use the public module paths.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split, cross_val_score
    df = pd.read_csv('C:/Users/Ahmad/Documents/Mastering ML with Scikitlearn/ml/DataSets/smsspamcollection/SMSSpamCollection',
                     delimiter='\t', header=None)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    # Fixed: xrange and the print statement are Python-2-only.
    for i in range(0, 5):
        print(X_test_raw.values.tolist()[i], "\r\n Classification: ", predictions[i])
def makeClassificationAndMeasureAccuracy(genre_wise_train_data, genre_wise_test_data, meta_dict):
    """Fit one LogisticRegression per genre on tag-value feature vectors and
    return {genre: test accuracy}."""
    accuracy_for_genre = dict()
    for genre in genre_wise_train_data:
        meta_dict_for_genre = meta_dict[genre]
        train_data, train_result = genre_wise_train_data[genre]
        test_data, test_result = genre_wise_test_data[genre]
        # Map each file name to its tag-value feature vector.
        train_data = [list(meta_dict_for_genre[file_name][TAGS].values()) for file_name in train_data]
        test_data = [list(meta_dict_for_genre[file_name][TAGS].values()) for file_name in test_data]
        log_r = LogisticRegression()
        log_r.fit(train_data, train_result)
        # Fixed: predict() requires a 2-D sample array — the original passed
        # one bare 1-D sample per call. Predicting the whole test matrix in a
        # single call is both correct and faster.
        predictions = log_r.predict(test_data)
        hits = sum(1 for label, gold in zip(predictions, test_result) if int(label) == gold)
        accuracy_for_genre[genre] = hits / len(test_data)
    return accuracy_for_genre
def test_multinomial_binary():
    # Test multinomial LR on a binary problem.
    # Binary string labels derived from "is setosa or not".
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]
    # Every multinomial-capable solver must handle the binary special case.
    for solver in ['lbfgs', 'newton-cg', 'sag']:
        clf = LogisticRegression(solver=solver, multi_class='multinomial',
                                 random_state=42, max_iter=2000)
        clf.fit(iris.data, target)
        # The binary multinomial case keeps one coefficient row / intercept.
        assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
        assert_equal(clf.intercept_.shape, (1,))
        assert_array_equal(clf.predict(iris.data), target)
        mlr = LogisticRegression(solver=solver, multi_class='multinomial',
                                 random_state=42, fit_intercept=False)
        mlr.fit(iris.data, target)
        # NOTE(review): pred is computed from clf, not the just-fitted mlr —
        # confirm whether the no-intercept model was meant to be checked.
        pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data),
                                      axis=1)]
        assert_greater(np.mean(pred == target), .9)
def test_multinomial_logistic_regression_string_inputs():
    # Test with string labels for LogisticRegression(CV)
    n_samples, n_features, n_classes = 50, 5, 3
    X_ref, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_classes=n_classes,
                                   n_informative=3,
                                   random_state=0)
    # Map integer classes 0/1/2 onto the string labels bar/baz/foo.
    y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y)
    # For numerical labels, let y values be taken from set (-1, 0, 1)
    y = np.array(y) - 1
    # Test for string labels
    lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
    lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
    lr.fit(X_ref, y)
    lr_cv.fit(X_ref, y)
    lr_str.fit(X_ref, y_str)
    lr_cv_str.fit(X_ref, y_str)
    # Numeric-label and string-label fits must learn identical coefficients.
    assert_array_almost_equal(lr.coef_, lr_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo'])
    # The predictions should be in original labels
    assert_equal(sorted(np.unique(lr_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])
    # Make sure class weights can be given with string labels
    lr_cv_str = LogisticRegression(
        solver='lbfgs',
        class_weight={'bar': 1, 'baz': 2, 'foo': 0},
        multi_class='multinomial').fit(X_ref, y_str)
    # Weight 0 for 'foo' removes it from the predictions entirely.
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20,
             sel_method='LR', cv=10):
    """K-fold cross-validated classification of users from a feature matrix.

    Returns (mean CV score, fraction of instances dropped by filtering) and
    appends per-fold result rows / confusion matrices to the experiment CSVs.
    NOTE(review): Python-2 print statement and pre-0.18 sklearn KFold
    signature — this block targets a legacy environment.
    """
    instance_num = len(data_set_df.columns)  # columns are instances (users)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]
    # Drop rows/columns that are entirely NaN.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute if any NaN/inf values survive the filtering.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # Align labels with the instances (columns) that survived filtering.
    y_filtered = y_v[(map(int, x.columns.values))]
    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # Cross-validation only makes sense with >1 fold and >1 distinct class.
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # Transpose so rows are instances, as sklearn expects.
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
            if selection:
                # Feature selection is fitted on the training fold only.
                if sel_method == 'LR' or 'RF' in sel_method:
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values
            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score
                # One CSV row per fold describing the experiment settings.
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label,
                                 True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False,
                                 selection, 'LR', reg_param, cv, fold,
                                 x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # A fold can fail (e.g. a single class in y_train); skip it.
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"
        print result_str
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)
        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)
        if fold > 0:
            score_mean = score_mean / fold
            miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate
def run(): paras, sents = create_dataset() X = np.array(get_features(paras)) Y = np.array(get_ys(paras)) print len(X[0]) sents = np.array(sents) skf = StratifiedKFold(Y, n_folds=10) f = open('results/correct.txt','w') f2 = open('results/wrong.txt','w') accs = [] precs = [] recs = [] f1s = [] for train_index, test_index in skf: X_train, X_test = X[train_index], X[test_index] y_train, y_test = Y[train_index], Y[test_index] sent_train = sents[train_index] sent_test = sents[test_index] # cv = CountVectorizer(stop_words="english", ngram_range=(1,1), min_df = 5) # sent_train_counts = cv.fit_transform(sent_train) # # tf_transformer = TfidfTransformer(use_idf=True).fit(sent_train_counts) # sent_train_counts = tf_transformer.transform(sent_train_counts) # # sent_train_counts = sent_train_counts.toarray() # # print sent_train_counts.shape # print X_train.shape # # new_train = [] # for i,j in zip(X_train, sent_train_counts): # new_train.append(np.append(i,j)) #fs = SelectKBest(chi2, k=24) #X_train = fs.fit_transform(X_train, y_train) clf = LogisticRegression() clf.fit(X_train, y_train) print clf.coef_ # # sent_test_counts = cv.transform(sent_test) # sent_test_counts = tf_transformer.transform(sent_test_counts) # # sent_test_counts = sent_test_counts.toarray() # # new_test = [] # for i,j in zip(X_test, sent_test_counts): # new_test.append(np.append(i,j)) #X_test = fs.transform(X_test) y_pred = clf.predict(X_test) acc = accuracy_score(y_test, y_pred) prec = precision_score(y_test, y_pred) rec = recall_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) accs.append(acc) precs.append(prec) recs.append(rec) f1s.append(f1) print 'Acc \t %s' % acc print 'Prec \t %s' % prec print 'Recall \t %s' % rec print 'F1 \t %s' % f1 for (index,test),(y_t, y_p) in zip(zip(test_index, X_test), zip(y_test, y_pred)): if y_t == y_p: # if paras[index]['prev_para']: # f.write('%s\n' % paras[index]['prev_para']['sents']) f.write('%s\n' % sents[index]) f.write('%s\n' % (y_t)) else: # if 
paras[index]['prev_para']: # f2.write('%s\n' % paras[index]['prev_para']['sents']) f2.write('%s\n' % sents[index]) f2.write('%s\n' % (y_t)) print 'Avg Acc \t %s \t ' % np.mean(accs) print 'Avg Prec \t %s' % np.mean(precs) print 'Avg Recall \t %s' % np.mean(recs) print 'Avg F1 \t %s' % np.mean(f1s)
def train_model(clf_factory, X, Y, name, plot=False):
    """Trains and saves model to disk.

    Runs a single shuffle-split (70/30), fits a LogisticRegression, records
    train/test error, confusion matrix, and per-label PR/ROC curves, then
    dumps the last fitted classifier to saved_model_fft/my_model.pkl.

    NOTE(review): `clf_factory` is never used — a LogisticRegression is
    hard-coded below; confirm whether the factory was meant to be called.

    Returns (mean train error, mean test error, array of confusion matrices).
    """
    labels = np.unique(Y)
    # Old sklearn API: ShuffleSplit(n=..., n_iterations=..., test_fraction=...).
    cv = ShuffleSplit(n=len(X), n_iterations=1, test_fraction=0.3, indices=True, random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []
    scores = []
    pr_scores, precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores, tprs, fprs = defaultdict(list), defaultdict(list), defaultdict(list)
    clfs = []  # just to later get the median
    cms = []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        # NOTE(review): `global clf` leaks the classifier into module scope so
        # the joblib.dump below can see the last fold's model — fragile.
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        # One-vs-rest PR and ROC curves per class label.
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            # Assumes labels are 0..k-1 so they index predict_proba columns
            # directly — TODO confirm.
            proba_label = proba[:, label]
            precision, recall, pr_thresholds = precision_recall_curve(y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    if plot:
        for label in labels:
            #print("Plotting %s"%genre_list[label])
            scores_to_sort = roc_scores[label]
            # Python 2 integer division picks the median run's index.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            #plot_pr(pr_scores[label][median], desc, precisions[label][median],recalls[label][median], label='%s vs rest' % genre_list[label])
            #plot_roc(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])
    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    #save the trained model to disk
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
from sklearn.cross_validation import train_test_split,cross_val_score df = pd.read_csv('SMSSpamCollection',delimiter = '\t',header = None) # print(df.head) print('Number of spam messages :',df[df[0]=='spam'][0].count()) print('Number of ham messages:',df[df[0]=='ham'][0].count()) x_train_raw,x_test_raw,y_train,y_test = train_test_split(df[1],df[0]) vectorizer = TfidfVectorizer() x_train = vectorizer.fit_transform(x_train_raw) x_test = vectorizer.transform(x_test_raw) classifier = LogisticRegression() classifier.fit(x_train,y_train) predictions = classifier.predict(x_test) # for i, prediction in enumerate(predictions[:5]): # print(prediction,x_test_raw[:i]) from sklearn.metrics import accuracy_score print('Accuracy scores:',accuracy_score(y_test,predictions)) from sklearn.metrics import confusion_matrix import matplotlib.pyplot as plt conf_matrix=confusion_matrix(y_test,predictions) print(conf_matrix) plt.matshow(conf_matrix)
def crossval(features, labels, vec):
    """10-fold cross-validation of an L1 logistic regression ("maxent").

    Parameters
    ----------
    features : feature matrix; rows align with `labels`.
    labels : gold labels.
    vec : fitted DictVectorizer whose feature_names_ maps coefficient
        positions back to feature names.

    Prints per-metric CV means/stds, "pervasive" positive/negative features,
    and the 20 largest/smallest coefficients of a model refit on all data.
    """
    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(features, labels)  # only needed for feature inspection, crossvalidation calls fit(), too
    # Counter seeded with feature names: every entry starts at count 1.
    coeffcounter = Counter(vec.feature_names_)
    negfeats = set(vec.feature_names_)
    posfeats = set(vec.feature_names_)
    scores = defaultdict(list)
    TotalCoeffCounter = Counter()
    # Old sklearn cross_validation API: KFold(n=..., n_folds=...).
    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]
        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        # Per-fold coefficient-by-feature-name map.
        coeffs_i = list(maxent.coef_[0])
        coeffcounter_i = Counter(vec.feature_names_)
        for value, name in zip(coeffs_i, vec.feature_names_):
            coeffcounter_i[name] = value
        # NOTE(review): predictions are passed in the y_true slot; accuracy is
        # symmetric, but this swaps the meaning of precision and recall —
        # confirm intended.
        acc = accuracy_score(ypred_i, Testy_i)
        pre = precision_score(ypred_i, Testy_i)
        rec = recall_score(ypred_i, Testy_i)
        # shared task uses f1 of *accuracy* and recall!
        f1 = 2 * acc * rec / (acc + rec)
        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)
        # NOTE(review): intersects with `coeffcounter` (still the all-1s seed,
        # never updated) rather than the per-fold `coeffcounter_i` — looks
        # like a bug; the reported "pervasive" features are not coefficient
        # driven. Confirm before relying on these reports.
        posfeats = posfeats.intersection(set([key for (key, value) in coeffcounter.most_common()[:20]]))
        negfeats = negfeats.intersection(set([key for (key, value) in coeffcounter.most_common()[-20:]]))
    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ", negfeats)
    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    maxent.fit(features, labels)  # fit on everything
    coeffs_total = list(maxent.coef_[0])
    for value, name in zip(coeffs_total, vec.feature_names_):
        TotalCoeffCounter[name] = value
    # Report the 20 most positive and 20 most negative coefficients.
    for (key, value) in TotalCoeffCounter.most_common()[:20]:
        print(key, value)
    print("---")
    for (key, value) in TotalCoeffCounter.most_common()[-20:]:
        print(key, value)
    # NOTE(review): `coeffcounter` still holds the initial all-1s counts, so
    # these two lines do not report real coefficient extremes — confirm.
    print("lowest coeff:", coeffcounter.most_common()[-1])
    print("highest coeff", coeffcounter.most_common()[0])
return docs, t_docs, t_docsCategories data = readData('hackerrank/documentClassification.txt') X_train = np.array(data[1]) y_train = np.array(data[2]) X_test = np.array(data[0]) print("Extracting features from the training dataset using a sparse vectorizer") #vectorizer = HashingVectorizer(stop_words='english', non_negative=True) vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') X_train = vectorizer.fit_transform(X_train) #vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, # stop_words='english') #X2_train = vectorizer.fit_transform(data_train.data) X_test = vectorizer.transform(X_test) nb_classifier = MultinomialNB().fit(X_train, y_train) svm_classifier = LinearSVC().fit(X_train, y_train) maxent_classifier = LogisticRegression().fit(X_train, y_train) y_nb_predicted = nb_classifier.predict(X_test) print(y_nb_predicted) y_nb_predicted = svm_classifier.predict(X_test) print(y_nb_predicted) y_nb_predicted = maxent_classifier.predict(X_test) print(y_nb_predicted)
return lambda train, test: feat6_generic(train, test, tw_pos, blog_pos) def feat6_tw(): return lambda train, test: feat6_generic(train, test, tw_pos, twitter_test_pos) print "Experiment 6: valence + punctuation + key POS word counts blog(80%) -> blog(20%)" experiment6_b = experiment_svm_sigK(blog_80, blog_20, feat6_b()) print "Experiment 6: valence + punctuation + key POS word counts twitter+wiki -> blog" experiment6_twb = experiment_svm_sigK(tw, blog, feat6_tw_b()) print "Experiment 6: valence + punctuation + key POS word counts twitter+wiki -> twitter(test)" experiment6_tw = experiment_svm_sigK(tw, twitter_test, feat6_tw()) # Cross validation for blog -> blog experiment with best accuracy (to compare to original paper) folds = KFold(n = len(blog), n_folds= 10, random_state = 1) test_accuracies = [] for train_indices, test_indices in folds: train_data = get_elems_at(blog, train_indices) test_data = get_elems_at(blog, test_indices) data = Features.make_experiment_matrices(train_data, test_data, feat4) model = LogisticRegression() model.fit(data['train_X'], data['train_Y']) predictions = model.predict(data['test_X']) accuracy = accuracy_score(data['test_Y'], predictions) test_accuracies.append(accuracy) print "10-CV accuracy blog on blog:%.2f[+/-%.2f]" % (numpy.mean(test_accuracies), numpy.std(test_accuracies))
def train_model(X, Y, name, plot=False): """ train_model(vector, vector, name[, plot=False]) Trains and saves model to disk. """ labels = np.unique(Y) print labels cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = defaultdict(list) precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list) roc_scores = defaultdict(list) tprs = defaultdict(list) fprs = defaultdict(list) clfs = [] # for the median cms = [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = LogisticRegression() clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) scores.append(test_score) train_errors.append(1 - train_score) test_errors.append(1 - test_score) y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) cms.append(cm) for label in labels: y_label_test = np.asarray(y_test == label, dtype=int) proba = clf.predict_proba(X_test) proba_label = proba[:, label] fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) roc_scores[label].append(auc(fpr, tpr)) tprs[label].append(tpr) fprs[label].append(fpr) if plot: for label in labels: scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] desc = "%s %s" % (name, genre_list[label]) plot_roc_curves(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label]) all_pr_scores = np.asarray(pr_scores.values()).flatten() summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores)) #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) #save the trained model to disk joblib.dump(clf, 'saved_model/model_ceps.pkl') return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# X_test, y_test = X[N:], np.array(y[N:]) N_train = int(len(X)*6/10) N_valid = int(len(X)*8/10) X_train, y_train = X[:N_train], y[:N_train] X_valid, y_valid = X[N_train:N_valid],y[N_train:N_valid] X_test, y_test = X[N_valid:], np.array(y[N_valid:]) Cs = np.logspace(-2, 5, 10) valid_predict = [] for C in Cs: estimator = LogisticRegression(class_weight='auto', C=C) estimator.fit(X_train, y_train) y_predict_val = estimator.predict(X_valid) valid_predict.append(1.0 * np.sum(y_predict_val == y_valid) / len(y_valid)) valid_predict = np.array(valid_predict) C = Cs[np.argmax(valid_predict)] print("C:",C, "Accurary(valid):", np.max(valid_predict)) # estimator = RandomForestClassifier(n_estimators=200) estimator = LogisticRegression(class_weight='auto', C=C) estimator.fit(X_train, y_train) y_predict = estimator.predict(X_test) print(y_predict) print(y_test) print ("Accurary(test):",1.0 * np.sum(y_predict == y_test) / len(y_test))
## 5:21 100-400 Hz X.append(2.0 / N * np.abs(yf[:N/2])[:21]) # print xf[np.argmax(2.0/N * np.abs(yf[:N/2]))] ## pitch # plt.plot(xf, 2.0/N * np.abs(yf[:N/2])) # plt.show(block=True) return X nasal = glob('/Users/lxy/Dropbox/Voice Autism Vocal Samples/Nasalized/*') normal = glob('/Users/lxy/Dropbox/Voice Autism Vocal Samples/Normal/*') random.shuffle(nasal) random.shuffle(normal) X, y = [], [] for filename in nasal: data = parseData(filename) X += data y += [1] * len(data) for filename in normal: data = parseData(filename) X += data y += [0] * len(data) N = len(X) * 9 / 10 X_train, y_train = X[:N], y[:N] X_test, y_test = X[N:], np.array(y[N:]) # estimator = RandomForestClassifier(n_estimators=200) estimator = LogisticRegression(class_weight='auto', C=8.0) estimator.fit(X_train, y_train) y_predict = estimator.predict(X_test) print 1.0 * np.sum(y_predict == y_test) / len(y_test)
def main():
    """Train per-variant logistic regressions on AMT annotations and print
    tab-separated predictions (label + probability) for a new data dump.

    For each feature variant ("bcd", "cd") a model is trained on --input and
    applied to --dump_to_predict; the output table interleaves the dump's
    columns with both variants' predicted labels and probabilities.
    """
    parser = argparse.ArgumentParser(description="""Export AMT""")
    parser.add_argument('--input', default="../res/dga_extendedamt_simplemajority.tsv")
    parser.add_argument('--dump_to_predict', default="../res/dga_data_october2016.tsv")
    parser.add_argument('--embeddings', default="/Users/hmartine/data/glove.6B/glove.6B.50d.txt")
    args = parser.parse_args()

    E = load_embeddings(args.embeddings)
    predarrays = {}
    variants = ["bcd", "cd"]
    for variant in variants:
        # 1: collect (un-vectorized) feature dicts for the training data.
        trainfeatures, labels, vec = collect_features(args.input, embeddings=E, variant=variant, vectorize=False)
        maxent = LogisticRegression(penalty='l2')
        # NOTE(review): `trainingbow` is not defined in this function — confirm
        # it is supplied by a module-level import/global elsewhere in the file.
        dumpfeatdicts = features_from_dump(args.dump_to_predict, variant=variant, embeddings=E, bowfilter=trainingbow)
        # Fit the vectorizer on the training dicts only, then reuse its
        # vocabulary to transform the dump features.
        vec = DictVectorizer()
        X_train = vec.fit_transform(trainfeatures)
        maxent.fit(X_train, labels)
        X_test = vec.transform(dumpfeatdicts)
        predarrays[variant + "_pred_label"] = ["SAME" if x == 0 else "OMISSION" for x in maxent.predict(X_test)]
        # predict_proba rows are (P(class 0), P(class 1)); keep P(class 1).
        predarrays[variant + "_pred_prob"] = ['{:.2}'.format(y) for x, y in maxent.predict_proba(X_test)]
        # TODO compare prediction similarity across variants
        # TODO provide an output format with labels and probs for both feature templates

    frame = read_dump(args.dump_to_predict)
    # Sorted keys: bcd_pred_label, bcd_pred_prob, cd_pred_label, cd_pred_prob.
    keyindices = sorted(predarrays.keys())
    header = "Index Ref TitleRef URLRef Target TitleTarget URLTarget Source Contains BCD_label BCD_prob CD_label CD_prob".replace(" ", "\t")
    print(header)
    # FIX: emit the data columns in the same order as the header
    # (Ref, TitleRef, URLRef, Target, TitleTarget, URLTarget, ...); the
    # original zipped Target before TitleRef/URLRef, misaligning the columns
    # with their headings.
    for a in zip([str(x) for x in range(len(frame.Ref))],
                 list(frame.Ref), list(frame.TitleRef), list(frame.URLRef),
                 list(frame.Target), list(frame.TitleTarget), list(frame.URLTarget),
                 list(frame.Source), list(frame.Contains),
                 predarrays[keyindices[0]], predarrays[keyindices[1]],
                 predarrays[keyindices[2]], predarrays[keyindices[3]]):
        print("\t".join(a))
def crossval(features, labels, variant, printcoeffs=False):
    """10-fold cross-validation of an L2 logistic regression ("maxent").

    Parameters
    ----------
    features : feature matrix; rows align with `labels`.
    labels : gold labels (positive class is 1 for F1).
    variant : identifier printed with the scores.
    printcoeffs : if True, refit on all data and print the 20 largest and 20
        smallest coefficients (by feature index).

    Prints "<variant> <mean accuracy> (<mean F1>)".
    """
    maxent = LogisticRegression(penalty='l2')
    # Majority-class baseline; fitted/predicted per fold for comparison runs
    # (its scores are currently not aggregated).
    dummyclass = DummyClassifier("most_frequent")
    maxent.fit(features, labels)  # only needed for feature inspection, crossvalidation calls fit(), too
    scores = defaultdict(list)
    TotalCoeffCounter = Counter()
    # Old sklearn cross_validation API: KFold(n=..., n_folds=...).
    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]
        dummyclass.fit(TrainX_i, Trainy_i)
        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        ydummypred_i = dummyclass.predict(TestX_i)
        acc = accuracy_score(ypred_i, Testy_i)
        f1 = f1_score(ypred_i, Testy_i, pos_label=1)
        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
    print("%s %.2f (%.2f)" % (variant, np.array(scores["Accuracy"]).mean(), np.array(scores["F1"]).mean()))
    if printcoeffs:
        maxent.fit(features, labels)  # fit on everything
        coeffs_total = list(maxent.coef_[0])
        # FIX: the counter was computed but never populated, so both report
        # loops below printed nothing. Key by feature index since no
        # vectorizer/feature-name mapping is available here.
        for feat_idx, coeff in enumerate(coeffs_total):
            TotalCoeffCounter[feat_idx] = coeff
        for (key, value) in TotalCoeffCounter.most_common()[:20]:
            print(key, value)
        print("---")
        for (key, value) in TotalCoeffCounter.most_common()[-20:]:
            print(key, value)
def test_fit_credit_backupsklearn():
    """Integration test: h2o4gpu's LogisticRegression wrapper vs scikit-learn.

    Fits both implementations on the credit-card dataset with identical
    hyper-parameters and asserts that coefficients, intercept, iteration
    counts, and predictions agree within np.allclose tolerances.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    # Last column is the target; everything before it is the feature matrix.
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression
    # First a plain fit/predict/score smoke test of the h2o4gpu solver.
    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X, y))
    # Now exercise the full scikit-compatible API surface with fixed
    # hyper-parameters, mirrored below for scikit-learn itself.
    enet = Solver(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X, y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())
    # Reference run with the same settings on stock scikit-learn.
    from sklearn.linear_model.logistic import LogisticRegression
    enet_sk = LogisticRegression(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())
    # sparsify() above converts coef_ to sparse; densify for comparison.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
#decmap = defaultdict(int) #for x,y in zip(test_labels,dec_pred1): # decmap[(x,y)] += 1 #for x,y in decmap.keys(): # print 'Actual Decade : ' + str(x) + ' Decade Predicted ' + str(y) + ' Count : ' + str(decmap[(x,y)]) accuracy = zero_one_score(y_test, dec_pred1) print 'Accuracy with SVM : ' + str(accuracy) clf3 = MultinomialNB().fit(X_train,y_train) dec_pred3 = clf3.predict(X_test) accuracy = zero_one_score(y_test, dec_pred1) print 'Accuracy with Naive Bayes : ' + str(accuracy) n_neighbors = 15 clf2 = neighbors.KNeighborsClassifier(n_neighbors) clf2 = clf2.fit(X_train,y_train) dec_pred2 = clf2.predict(X_test) accuracy = zero_one_score(y_test, dec_pred2) print 'Accuracy with Nearest Neighbors : ' + str(accuracy) clf2 = LogisticRegression().fit(X_train,y_train) dec_pred2 = clf2.predict(X_test) accuracy = zero_one_score(y_test, dec_pred2) print 'Accuracy with Logistic Regression : ' + str(accuracy) lyricsfile.close()
def logreg_score(X, y): logreg = LogisticRegression() logreg.fit(X, y) y_pred = logreg.predict(X) print "LogReg accuracy_score: {}".format(metrics.accuracy_score(y, y_pred))
def MyClassifier():
    """CWI-2016 skeleton: featurize the training CoNLL file, then 10-fold
    cross-validate an L1 logistic regression and report metrics/coefficients.

    Reads --train (parsed-and-labeled CoNLL), builds WordInContext features
    for every labeled token, vectorizes them, and prints CV scores plus the
    20 largest/smallest coefficients of a model refit on all data.
    Exits the process via sys.exit(0) when done.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir + "/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    args = parser.parse_args()
    labels = []
    featuredicts = []
    print("Collecting features...")
    count = 0
    for s in readSentences(args.train):
        # Progress counter on a single line.
        print("\r" + str(count), end="")
        count += 1
        for l, i in zip(s["label"], s["idx"]):
            # "-" marks unlabeled tokens; only labeled tokens become instances.
            if l != "-":
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i], s["ne"][i], l, s["head"], s["deprel"])
                featuredicts.append(w.featurize())
                labels.append(w.label)
    print()
    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)
    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(features, labels)  # only needed for feature inspection, crossvalidation calls fit(), too
    # Counter seeded with feature names: every entry starts at count 1.
    coeffcounter = Counter(vec.feature_names_)
    negfeats = set(vec.feature_names_)
    posfeats = set(vec.feature_names_)
    scores = defaultdict(list)
    TotalCoeffCounter = Counter()
    # Old sklearn cross_validation API: KFold(n=..., n_folds=...).
    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]
        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        # Per-fold coefficient-by-feature-name map.
        coeffs_i = list(maxent.coef_[0])
        coeffcounter_i = Counter(vec.feature_names_)
        for value, name in zip(coeffs_i, vec.feature_names_):
            coeffcounter_i[name] = value
        # NOTE(review): metrics receive (predictions, gold) — accuracy is
        # symmetric but this swaps precision and recall; confirm intended.
        scores["Accuracy"].append(accuracy_score(ypred_i, Testy_i))
        scores["F1"].append(f1_score(ypred_i, Testy_i))
        scores["Precision"].append(precision_score(ypred_i, Testy_i))
        scores["Recall"].append(recall_score(ypred_i, Testy_i))
        # NOTE(review): intersects with `coeffcounter` (still the all-1s seed,
        # never updated) rather than the per-fold `coeffcounter_i` — looks
        # like a bug; confirm before relying on the "pervasive" reports.
        posfeats = posfeats.intersection(set([key for (key, value) in coeffcounter.most_common()[:20]]))
        negfeats = negfeats.intersection(set([key for (key, value) in coeffcounter.most_common()[-20:]]))
    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ", negfeats)
    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    maxent.fit(features, labels)  # fit on everything
    coeffs_total = list(maxent.coef_[0])
    for value, name in zip(coeffs_total, vec.feature_names_):
        TotalCoeffCounter[name] = value
    # Report the 20 most positive and 20 most negative coefficients.
    for (key, value) in TotalCoeffCounter.most_common()[:20]:
        print(key, value)
    print("---")
    for (key, value) in TotalCoeffCounter.most_common()[-20:]:
        print(key, value)
    # NOTE(review): `coeffcounter` still holds the initial all-1s counts, so
    # these two lines do not report real coefficient extremes — confirm.
    print("lowest coeff:", coeffcounter.most_common()[-1])
    print("highest coeff", coeffcounter.most_common()[0])
    sys.exit(0)
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S") titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0 titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1 titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2 predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] # Initialize the algorithm class #alg = LogisticRegression(random_state=1) alg = LogisticRegression() # Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!) scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3) # Take the mean of the scores (because we have one for each fold) print(scores.mean()) # Train the algorithm using all the training data alg.fit(titanic[predictors], titanic["Survived"]) # Make predictions using the test set. predictions = alg.predict(titanic_test[predictors]) # Create a new dataframe with only the columns Kaggle wants from the dataset. submission = pandas.DataFrame({ "PassengerId": titanic_test["PassengerId"], "Survived": predictions }) submission.to_csv("kaggle.csv", index=False)