def train_and_predict_m7 (train, test, labels) :
    """Fit the "m7" model (TF-IDF features + Passive-Aggressive classifier) and predict.

    Args:
        train: Training records, passed unchanged to ``stemmer_clean``.
        test: Test records, passed unchanged to ``stemmer_clean``.
        labels: Targets for the training rows.

    Returns:
        Predictions for the vectorized test data. They come from the model
        returned by ``perform_grid_search`` when the module-level
        ``gridSearch`` flag is truthy, otherwise from the classifier fitted
        with the fixed settings below.
    """
    # Text clean-up / stemming is delegated to the shared helper.
    train_text, test_text = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='snowball')

    # Word 1- to 5-gram TF-IDF with sub-linear TF and the project stop-word list.
    vectorizer = TfidfVectorizer(
        min_df=5,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 5),
        smooth_idf=1,
        sublinear_tf=1,
        stop_words=ML_STOP_WORDS,
    )
    vectorizer.fit(train_text)
    features = vectorizer.transform(train_text)
    test_features = vectorizer.transform(test_text)

    print ("Fitting Passive-Aggressive Classifer...")
    classifier = PassiveAggressiveClassifier(
        random_state=randomState, loss='squared_hinge', n_iter=100, C=0.01)

    # Candidate hyper-parameters explored when grid search is enabled.
    search_space = {
        'C': [0.003, 0.01, 0.03, 0.1],
        'loss': ['hinge', 'squared_hinge'],
        'n_iter': [5, 10, 30, 100, 300],
    }

    if not gridSearch:
        classifier.fit(features, labels)
        return classifier.predict(test_features)

    best_model = perform_grid_search(classifier, search_space, features, labels)
    return best_model.predict(test_features)
def test_classifier_refit():
    """A fitted classifier can be refit on other features and labels.

    After the second ``fit`` (one feature column fewer, labels taken from
    ``iris.target_names``) ``classes_`` must reflect the new labels.
    """
    model = PassiveAggressiveClassifier(max_iter=5)
    model = model.fit(X, y)
    assert_array_equal(model.classes_, np.unique(y))

    # Second fit: drop the last feature column and relabel via target names.
    renamed_labels = iris.target_names[y]
    model.fit(X[:, :-1], renamed_labels)
    assert_array_equal(model.classes_, iris.target_names)
Exemplo n.º 3
0
def PassiveAggressiveClassifier_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a default PassiveAggressiveClassifier and report its test accuracy.

    Args:
        train_predictors: Feature matrix used for fitting.
        test_predictors: Feature matrix to predict on.
        train_target: Labels matching ``train_predictors``.
        test_target: Labels matching ``test_predictors``.

    Returns:
        tuple: ``(accuracy, predicted)`` where ``accuracy`` is the value of
        ``accuracy_score(test_target, predicted)``.
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors,train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    # Single-argument print call: same output on Python 2, valid syntax on Python 3.
    print("Accuracy for Linear Model PassiveAggressiveClassifier: "+str(accuracy))
    return accuracy,predicted
def test_classifier_accuracy():
    """Training accuracy must exceed 0.79 on ``X`` and ``X_csr``, with and without intercept."""
    for dataset in (X, X_csr):
        for use_intercept in (True, False):
            settings = dict(C=1.0, n_iter=30,
                            fit_intercept=use_intercept,
                            random_state=0)
            model = PassiveAggressiveClassifier(**settings)
            model.fit(dataset, y)
            assert_greater(model.score(dataset, y), 0.79)
Exemplo n.º 5
0
def train_online_model(xtr, ytr, model=None):
    """Train a new model, or incrementally update an existing one, and time it.

    Args:
        xtr: Training features.
        ytr: Training labels.
        model: Previously trained model exposing ``partial_fit``. When
            ``None`` a fresh ``PassiveAggressiveClassifier`` is created and
            fitted from scratch.

    Returns:
        The fitted (or updated) model; the same object that was passed in
        when ``model`` is not ``None``.
    """
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        model.partial_fit(xtr, ytr)
    # Single-argument print call: same output on Python 2, valid syntax on Python 3.
    print("Training took %.2f seconds" % (time.time()-t0))
    return model
def test_classifier_partial_fit():
    """Thirty ``partial_fit`` passes must reach a training accuracy above 0.79."""
    label_set = np.unique(y)
    for dataset in (X, X_csr):
        model = PassiveAggressiveClassifier(C=1.0,
                                            fit_intercept=True,
                                            random_state=0)
        for _ in range(30):
            model.partial_fit(dataset, y, label_set)
        assert_greater(model.score(dataset, y), 0.79)
Exemplo n.º 7
0
def test_passive_aggressive_2():
    """Check TPOT's passive-aggressive operator against sklearn when called with C == 0.0.

    The reference sklearn classifier is built with ``C=0.0001``; the TPOT
    guesses on the 'testing' group must match its predictions exactly.
    """
    tpot_obj = TPOT()
    all_rows = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    testing_rows = all_rows[all_rows['group'] == 'testing']

    reference = PassiveAggressiveClassifier(
        C=0.0001, loss='hinge', fit_intercept=True, random_state=42)
    reference.fit(training_features, training_classes)

    tpot_guesses = testing_rows['guess'].values
    assert np.array_equal(tpot_guesses, reference.predict(testing_features))
def mainworker(limit1,limit2):
    """Cross-validate a PassiveAggressiveClassifier on ``pdata.txt``.

    Every line of ``pdata.txt`` is parsed as comma-separated integers whose
    last value is the class label. The feature columns with index
    ``limit1`` through ``limit2`` (inclusive) are removed before training.
    For k = 2, 3, 4 and 5 a k-fold cross-validation is run and the mean of
    ``getAccuracy`` over the folds is printed.

    Args:
        limit1: Index of the first feature column to drop.
        limit2: Index of the last feature column to drop.
    """
    rows = []
    classlist = []
    # `with` guarantees the file is closed even when a line fails to parse.
    with open("pdata.txt") as f:
        for line in f:
            values = [int(field) for field in line.strip("\n").split(",")]
            clas = values.pop()  # the label is the last field on the line
            rows.append(values[:limit1] + values[limit2 + 1:])
            classlist.append(clas)
    X = np.array(rows)
    y = np.array(classlist)

    for k in [2, 3, 4, 5]:
        # Size the folds from the data actually read rather than a
        # hard-coded sample count (previously 11054).
        kf = cross_validation.KFold(len(X), n_folds=k)
        averager = []
        for train_index, test_index in kf:
            clf = PassiveAggressiveClassifier()
            clf.fit(X[train_index], y[train_index])
            predicted = clf.predict(X[test_index])
            averager.append(getAccuracy(predicted, y[test_index]))
        answer = np.mean(averager)
        # Single-argument print calls: same output on Python 2 and Python 3.
        print("The mean for %s fold is:" % k)
        print(answer)
Exemplo n.º 9
0
def TrainSVM(data,labels):
	'''Fit and return the classifier chosen by the hard-wired ``selected`` switch.

	The switch is fixed to 1, so a linear-kernel ``SVC`` with probability
	estimates is what gets trained; the other branches are kept as
	alternatives (0: PassiveAggressiveClassifier, 2: LinearSVC).
	'''
	selected = 1
	if selected == 1:
		seed = np.random.randint(1000)
		clf = SVC(probability= True,decision_function_shape='ovr',random_state=seed,kernel="linear")
	elif selected == 0:
		from sklearn.linear_model import PassiveAggressiveClassifier
		clf=PassiveAggressiveClassifier(class_weight='balanced',n_jobs=-1,n_iter=15,fit_intercept=True)
	elif selected == 2:
		from sklearn.svm import LinearSVC
		clf = LinearSVC()

	clf.fit(data,labels)
	return clf
Exemplo n.º 10
0
class DeployedClassifierFactory:
	'''Trains a classifier on a term-document matrix and builds a DeployedClassifier.'''

	def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
		'''This is a class that enables one to train and save a classification model.

		Parameters
		----------
		term_doc_matrix : TermDocMatrix
		term_doc_matrix_factory : TermDocMatrixFactory
			Must have ``_nlp`` and ``category_text_iter`` set to None
			(enforced by the assertions below).
		category : str
			Category name
		nlp : spacy parser
			Accepted for interface compatibility; not stored by this constructor.
		'''
		self._term_doc_matrix = term_doc_matrix
		self._term_doc_matrix_factory = term_doc_matrix_factory
		assert term_doc_matrix_factory._nlp is None
		assert term_doc_matrix_factory.category_text_iter is None
		self._category = category
		self._clf = None
		self._proba = None

	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier

		Fits the classifier on the stored term-document matrix and derives a
		pseudo-probability function from the empirical distribution of the
		training decision values.

		Returns
		-------
		DeployedClassifierFactory
			self, to allow call chaining.
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])

		def proba_function(distance_from_hyperplane):
			# Positive side maps into (0.5, 1], negative side into [0, 0.5).
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# Use the ECDF of the negative distances here: the positive
				# ECDF evaluates to 0 for every negative input, which would
				# collapse all negative distances to a score of 0.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		self._proba = proba_function
		return self

	def build(self):
		'''Builds Deployed Classifier

		Raises
		------
		NeedToTrainExceptionBeforeDeployingException
			If no classifier has been trained yet.
		'''
		if self._clf is None:
			raise NeedToTrainExceptionBeforeDeployingException()
		return DeployedClassifier(self._category,
		                          self._term_doc_matrix._category_idx_store,
		                          self._term_doc_matrix._term_idx_store,
		                          self._term_doc_matrix_factory)
def test_classifier_correctness(loss):
    """Weights of ``MyPassiveAggressive`` and sklearn's classifier agree to 2 decimals.

    Labels are binarized first: class 1 stays 1, everything else becomes -1.
    """
    binary_target = y.copy()
    binary_target[y != 1] = -1

    reference = MyPassiveAggressive(
        C=1.0, loss=loss, fit_intercept=True, n_iter=2)
    reference.fit(X, binary_target)

    sklearn_settings = dict(C=1.0, loss=loss, fit_intercept=True,
                            max_iter=2, shuffle=False, tol=None)
    for dataset in (X, X_csr):
        candidate = PassiveAggressiveClassifier(**sklearn_settings)
        candidate.fit(dataset, binary_target)

        assert_array_almost_equal(reference.w, candidate.coef_.ravel(),
                                  decimal=2)
def test_classifier_accuracy():
    """Accuracy above 0.79 for every data / intercept / averaging combination.

    When averaging is enabled the fitted model must also expose the averaged
    and standard coefficient/intercept attributes.
    """
    averaging_attributes = ('average_coef_', 'average_intercept_',
                            'standard_intercept_', 'standard_coef_')
    combinations = [(dataset, use_intercept, averaged)
                    for dataset in (X, X_csr)
                    for use_intercept in (True, False)
                    for averaged in (False, True)]
    for dataset, use_intercept, averaged in combinations:
        model = PassiveAggressiveClassifier(
            C=1.0, max_iter=30, fit_intercept=use_intercept,
            random_state=1, average=averaged, tol=None)
        model.fit(dataset, y)
        assert_greater(model.score(dataset, y), 0.79)
        if not averaged:
            continue
        for attribute in averaging_attributes:
            assert hasattr(model, attribute)
def test_classifier_partial_fit():
    """Thirty ``partial_fit`` passes reach accuracy above 0.79, with and without averaging."""
    label_set = np.unique(y)
    averaging_attributes = ('average_coef_', 'average_intercept_',
                            'standard_intercept_', 'standard_coef_')
    combinations = [(dataset, averaged)
                    for dataset in (X, X_csr)
                    for averaged in (False, True)]
    for dataset, averaged in combinations:
        model = PassiveAggressiveClassifier(
            C=1.0, fit_intercept=True, random_state=0,
            average=averaged, max_iter=5)
        for _ in range(30):
            model.partial_fit(dataset, y, label_set)
        assert_greater(model.score(dataset, y), 0.79)
        if not averaged:
            continue
        for attribute in averaging_attributes:
            assert hasattr(model, attribute)
class PassiveAgressiveClassifier(Classifier):
    """Classifier that lazily fits an ``OCC`` estimator on first use.

    The training matrix is requested from the matrix database the first time
    ``classify`` is called; later calls reuse the fitted estimator.
    """

    def __init__(self, matrixdatabase):
        """Store the matrix database and create the (still unfitted) estimator."""
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        """No-op: training happens lazily inside ``classify``."""
        return

    def classify(self, ingredients):
        """Return the first prediction for the row built from ``ingredients``."""
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            # Print call instead of the Python 2 print statement, which is
            # a syntax error on Python 3; output is unchanged.
            print("Fitting complete...")
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
class PassiveAggressiveModel(BaseModel):
    """BaseModel variant whose estimator is a squared-hinge PassiveAggressiveClassifier."""

    def __init__(self, cached_features):
        """Initialise the base model, then attach the classifier as ``self.model``."""
        BaseModel.__init__(self, cached_features)
        classifier = PassiveAggressiveClassifier(
            loss='squared_hinge',
            C=1.0,
            random_state=1,
        )
        self.model = classifier

    def _predict_internal(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        predictions = self.model.predict(X_test)
        return predictions
def test_classifier_correctness():
    """``MyPassiveAggressive`` and sklearn's classifier learn matching weights for both losses.

    Labels are binarized first: class 1 stays 1, everything else becomes -1.
    """
    binary_target = y.copy()
    binary_target[y != 1] = -1

    for loss in ("hinge", "squared_hinge"):
        shared_settings = dict(C=1.0, loss=loss, fit_intercept=True, n_iter=2)

        reference = MyPassiveAggressive(**shared_settings)
        reference.fit(X, binary_target)

        candidate = PassiveAggressiveClassifier(**shared_settings)
        candidate.fit(X, binary_target)

        assert_array_almost_equal(reference.w, candidate.coef_.ravel())
def test_equal_class_weight():
    """On balanced data, no weights, "balanced" and equal explicit weights agree to 2 decimals."""
    samples = [[1, 0], [1, 0], [0, 1], [0, 1]]
    targets = [0, 0, 1, 1]

    def fitted(class_weight):
        # Same hyper-parameters every time; only the class weighting varies.
        model = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                            class_weight=class_weight)
        model.fit(samples, targets)
        return model

    unweighted = fitted(None)
    # The data is already balanced, so "balanced" should change nothing.
    balanced = fitted("balanced")
    equal_weights = fitted({0: 0.5, 1: 0.5})

    # Similar only up to some epsilon because of the learning rate schedule.
    assert_almost_equal(unweighted.coef_, equal_weights.coef_, decimal=2)
    assert_almost_equal(unweighted.coef_, balanced.coef_, decimal=2)
Exemplo n.º 18
0
def main():
    """Train on ``trainingdata.txt`` and print one predicted label per stdin document.

    ``trainingdata.txt`` starts with a line holding the number of samples N,
    followed by N lines whose first character is the integer class label and
    whose remainder is the document text. Standard input has the same shape
    without labels: a count line followed by that many documents.
    """
    import sys  # local import: only this entry point reads stdin directly

    train_data = []
    class_data = []
    # Plain 'r' replaces the 'rU' mode, which Python 3.11 removed; stray
    # carriage returns are stripped from every line below anyway. `with`
    # closes the file, which the previous code never did.
    with open('trainingdata.txt', 'r') as training_data:
        n = int(training_data.readline().strip())
        for _ in range(n):
            line = training_data.readline().strip()
            train_data.append(line[1:].strip())
            class_data.append(int(line[0]))

    train_data = np.array(train_data)
    class_data = np.array(class_data)

    # Vectorize bag of words
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True )
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)

    # Read test data from input: the count line first, then the documents.
    # sys.stdin.readline works on both Python 2 and 3, unlike raw_input.
    n_test = int(sys.stdin.readline().strip())
    X_test = np.array([sys.stdin.readline().strip() for _ in range(n_test)])

    X_test = vectorizer.transform(X_test)

    clf = PassiveAggressiveClassifier(n_iter=9)

    clf.fit(X_train, class_data)

    pred = clf.predict(X_test)
    for i in pred:
        print(i)
	def test_main(self):
		'''Smoke test: fit a classifier on l1-normalised TF-IDF features and score one document.'''
		categories, documents = get_docs_categories()

		def clean_function(text):
			# Text starting with '[' is blanked; everything else passes through.
			return '' if text.startswith('[') else text

		entity_types = set(['GPE'])
		matrix_factory = TermDocMatrixFactory(
			category_text_iter=zip(categories, documents),
			clean_function=clean_function,
			nlp=_testing_nlp,
			feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
		)
		term_doc_mat = matrix_factory.build()

		clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
		doc_featurizer = FeatsFromDoc(
			term_doc_mat._term_idx_store,
			clean_function=clean_function,
			feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types))
		fdc = doc_featurizer.set_nlp(_testing_nlp)

		tfidf = TfidfTransformer(norm='l1')
		train_features = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(train_features, term_doc_mat._y)

		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		pred = clf.predict(tfidf.transform(X_to_predict))
		# The decision function is evaluated on the untransformed features,
		# exactly as before.
		dec = clf.decision_function(X_to_predict)
def test_class_weights():
    """Down-weighting class 1 must flip the prediction for a probe point."""
    samples = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                        [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]
    probe = [[0.2, -1.0]]

    # Without class weights the probe point is assigned to class 1.
    unweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight=None,
                                             random_state=100)
    unweighted.fit(samples, targets)
    assert_array_equal(unweighted.predict(probe), np.array([1]))

    # Give class 1 a very small weight ...
    reweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight={1: 0.001},
                                             random_state=100)
    reweighted.fit(samples, targets)

    # ... so the hyperplane should rotate clock-wise and the prediction
    # for the probe point should shift to class -1.
    assert_array_equal(reweighted.predict(probe), np.array([-1]))
Exemplo n.º 21
0
	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier

		Fits the classifier on the stored term-document matrix and derives a
		pseudo-probability function from the empirical distribution of the
		training decision values.

		Returns
		-------
		self, to allow call chaining.
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])

		def proba_function(distance_from_hyperplane):
			# Positive side maps into (0.5, 1], negative side into [0, 0.5).
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# Use the ECDF of the negative distances here: the positive
				# ECDF evaluates to 0 for every negative input, which would
				# collapse all negative distances to a score of 0.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		self._proba = proba_function
		return self
Exemplo n.º 22
0
# Strip markup from every abstract, keeping only the text content.
abstracts = [BeautifulSoup(x).get_text() for x in data['abstract']]

# TF-IDF features over the cleaned abstracts; labels come from the 'type' column.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(abstracts)
y = data['type'].to_numpy()

# Four classifiers, all trained on the same features below.
support_vec = svm.SVC(kernel='rbf', C=1000, gamma=0.001)
rf = RandomForestClassifier(criterion='gini',
                            max_features='sqrt',
                            n_estimators=700)
sgd = SGDClassifier(alpha=0.0001,
                    fit_intercept=True,
                    loss='modified_huber',
                    penalty='l2')
pac = PassiveAggressiveClassifier(C=1.0,
                                  early_stopping=True,
                                  fit_intercept=True,
                                  max_iter=2000)

support_vec.fit(X, y)
rf.fit(X, y)
sgd.fit(X, y)
pac.fit(X, y)

# p_data = pd.read_csv('potentially_fake.tsv', sep='\t')
p_data = pd.read_csv('potentially_fake-8000.tsv', sep='\t')

# Same markup stripping for the abstracts that are to be classified.
p_abstracts = [BeautifulSoup(x).get_text() for x in p_data['abstract']]
fake_indexes = []
for index in range(len(p_abstracts)):
    # NOTE(review): a new vectorizer is fitted on a single document here, so
    # only the vocabulary is shared with `tfidf`; its IDF weights are not.
    # Confirm this is intended rather than `tfidf.transform(...)`.
    tfidf_pred = TfidfVectorizer(vocabulary=tfidf.vocabulary_)
    p_x = tfidf_pred.fit_transform([p_abstracts[index]])
Exemplo n.º 23
0
                        ])

# Load the test split; the file is read without a header row, so the column
# names are supplied explicitly.
testingD = pd.read_csv("test.tsv",
                       header=None,
                       sep="\t",
                       names=[
                           'ID', 'Label', 'Statement', 'Subject', 'Speaker',
                           'Job', 'State', 'Party', 'Barely True', 'False',
                           'Half True', 'Mostly True', 'Pants On Fire',
                           'Context'
                       ])

# Merge both splits, then drop every row that contains a missing value.
df = pd.concat([trainingD, testingD])
df = df.dropna()

passive_aggressive = PassiveAggressiveClassifier(max_iter=500)
tfidf = TfidfVectorizer(stop_words='english', max_df=0.9)

# Hold out 20% of the rows for evaluation. No random_state is given, so the
# split is not fixed by this call.
# NOTE(review): the text fed to the model is the "Party" column, not
# "Statement" — confirm this is the intended feature.
x_train, x_test, y_train, y_test = train_test_split(df["Party"],
                                                    df['Label'],
                                                    test_size=0.2)

# Fit the vocabulary on the training text only, then reuse it for the test text.
tfidf_train = tfidf.fit_transform(x_train)
tfidf_test = tfidf.transform(x_test)

passive_aggressive.fit(tfidf_train, y_train)

# Evaluate on the held-out rows and report accuracy as a percentage.
y_pred = passive_aggressive.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f' passive aggressive Accuracy: {round(score * 100, 2)}%')
# First line of the already-open training file `f` is the number of samples.
n= int(f.readline())
ln=[]  # labels: first character of each training line
ls=[]  # texts: everything from the third character onwards
for _ in range(n):  # read exactly n training lines
    x=f.readline()
    xs=x[2:]
    xn=x[0]
    ls.append(xs)
    ln.append(xn)
# Fit the vectorizer on the training texts, then transform them.
# (The first assignment holds the return value of fit() and is overwritten
# immediately by the transformed matrix.)
bag_of_words=vectorizer.fit(ls)
bag_of_words=vectorizer.transform(ls)
cmax=0
# Sweep over candidate values of the regularisation parameter C.
for cc in range(1,100):
    #sw=stopwords.words() #stopwords are not supported, requires download
    # NOTE(review): cc/10 is a float under Python 3 but integer division
    # under Python 2 (compare the commented cc/10.0 below) — confirm which
    # interpreter this targets.
    clf = PassiveAggressiveClassifier(n_iter=9,C=cc/10)
#    svm=LinearSVC(C=cc/10.0)
    clf.fit(bag_of_words,ln)
    
    #Now get input (test) data
    # NOTE(review): both files below are reopened on every iteration and
    # never closed within the visible code.
    lt=[]
    filename=open("testdata.txt")
    line = filename.readline()
    ntests=int(line)
    for _ in range(ntests):
        lt.append(filename.readline())
    
    bag_of_test_words=vectorizer.transform(lt)
    result=clf.predict(bag_of_test_words)
    actuals=[]
    filename=open("testresults.txt")
Exemplo n.º 25
0
#print X_train_tfidf.shape

# Python 2 script: the first stdin line gives the number of test documents.
# NOTE(review): this relies on Python 2's input() evaluating the typed text
# to a number; under Python 3 it would return a str and range() would fail.
ntest = input()
testdoc = []
for t in range(0, ntest):
    doc = raw_input()
    testdoc.append(doc)

# Reuse the count vectorizer and TF-IDF transformer fitted earlier (not
# refitted here) to build the test features.
X_new_counts = count_vect.transform(testdoc)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# The string literal below disables two earlier experiments (Naive Bayes and
# random forest); it is never executed as code.
""""
#Naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

#test random forest

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)
"""
# Active model: Passive-Aggressive classifier trained on the TF-IDF features.
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

# Emit one predicted label per test document, in input order.
for t in range(0, ntest):
    print predicted[t]
Exemplo n.º 26
0
# Naive Bayes : 0.88....

# GridSearch
## RidgeClassifier

#ridge = RidgeClassifier(tol=1e-3, solver="lsqr") 
#alphas = np.logspace(-6, -1, 100)
#clf = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas), n_jobs = 3)
#clf.fit(X_train, y_train)

# feature_selection
# selection from model
from sklearn.feature_selection import SelectFromModel

# Estimator handed to SelectFromModel to drive the feature selection.
clf = PassiveAggressiveClassifier(C=0.099, n_iter=200, loss='hinge',random_state = 42)

sfm = SelectFromModel(clf, threshold = 0.001)

sfm.fit(X_train, y_train)

# Apply the same fitted selector to both splits.
X_train_select = sfm.transform(X_train)
X_test_select = sfm.transform(X_test)

# test with new clf
# A second classifier (C=0.5) is benchmarked on the reduced feature sets.
clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42)

benchmark(clf1, X_train_select, y_train, X_test_select, y_test)

# GridSearch for C
# Set the parameters by cross-validation
Exemplo n.º 27
0
def plot_sgd_comparison():
    """
    Compare several online solvers on the hand-written digits dataset.

    Each classifier is refit on 20 random train/test splits for every
    held-out fraction; the mean test error per fraction is plotted against
    the proportion of data used for training.
    """

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets

    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import SGDClassifier, Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.linear_model import LogisticRegression

    held_out_fractions = [0.95, 0.90, 0.75, 0.50, 0.01]
    n_rounds = 20
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    classifiers = [
        ("SGD", SGDClassifier(max_iter=100)),
        ("ASGD", SGDClassifier(average=True, max_iter=100)),
        ("Perceptron", Perceptron(tol=1e-3)),
        ("Passive-Aggressive I",
         PassiveAggressiveClassifier(loss='hinge', C=1.0, tol=1e-4)),
        ("Passive-Aggressive II",
         PassiveAggressiveClassifier(loss='squared_hinge', C=1.0, tol=1e-4)),
        ("SAG",
         LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0])),
    ]

    train_proportions = 1. - np.array(held_out_fractions)

    for name, clf in classifiers:
        print("training %s" % name)
        # One RandomState per classifier, shared by all of its splits.
        rng = np.random.RandomState(42)
        mean_errors = []
        for fraction in held_out_fractions:
            round_errors = []
            for _ in range(n_rounds):
                split = train_test_split(X, y, test_size=fraction,
                                         random_state=rng)
                X_train, X_test, y_train, y_test = split
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                round_errors.append(1 - np.mean(y_pred == y_test))
            mean_errors.append(np.mean(round_errors))
        plt.plot(train_proportions, mean_errors, label=name)

    plt.legend(loc="upper right")
    plt.xlabel("Proportion train")
    plt.ylabel("Test Error Rate")
    plt.show()
Exemplo n.º 28
0
                                          pred,
                                          target_names=categories))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each (classifier, display name) pair in turn and collect the
# value returned by benchmark() for every one of them.
results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                   "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"),
                  (PassiveAggressiveClassifier(n_iter=50),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    # 80-character separator line followed by the classifier's display name.
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
Exemplo n.º 29
0
# Fit the classifier defined earlier and score it on the test split.
# The accuracy values in this section are computed but neither stored nor
# printed (notebook-style cells).
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# Perceptron
from sklearn.linear_model import Perceptron

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train, y_train)
# Predictions are reshaped to a single column before scoring.
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)
'''Fuzzy Logic Classifier'''
'''Cross Validation'''
'''# Undersampling per cross_val
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
# summarize class distribution
print(Counter(y))
# define dataset
X, y = make_classification(n_samples=30000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.333333,0.333333,0.333334],n_classes= 3, flip_y=0, random_state=100)
undersample = NearMiss(version=1, n_neighbors_ver3=3)
Exemplo n.º 30
0
max_features = 1000
ngram_range = (1,5) #if not specified its (1,1)
# Bound to `countVec`, the spelling every later line uses; the previous
# lower-case `countvec` left `countVec` undefined and raised NameError.
countVec=CountVectorizer(min_df = min_df, ngram_range = ngram_range, max_features=max_features)
# Learn vocabulary from train set
countVec.fit(trainText)

# Transform list of review to matrix of bag-of-word vectors
trainX = countVec.transform(trainText)
devX = countVec.transform(devText)
testX = countVec.transform(testText)

print("Shape of Train X {}\n".format(trainX.shape))
print("Sample of the vocab:\n {}".format(np.random.choice(countVec.get_feature_names(), 20)))
#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression(C=0.1)
passAgg    = PassiveAggressiveClassifier(C=0.1)
perceptron = Perceptron()

# Each model is fitted on the training matrix and scored on train and dev.
lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

passAgg.fit(trainX, trainY) 
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

perceptron.fit(trainX, trainY) 
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each (classifier, display name) pair in turn and collect the
# value returned by benchmark() for every one of them.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    # 80-character separator line followed by the classifier's display name.
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
# Split every training record into its label (first space-separated token)
# and its text (the remaining tokens re-joined with single spaces).
for stat in content:
    ar=stat.split(" ")
    
    labels.append(ar[0])
    ar=ar[1:]

    
    s=' '.join(ar)
#    print(s)
    texts.append(s)

# TF-IDF features with sub-linear TF scaling and English stop words removed.
vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
X_train = vectorizer.fit_transform(texts)
Y_train = np.array(labels)


    
# Read the documents to classify from stdin: a count line, then one
# document per line.
predict_data=[]
n=int(input().strip())
for i in range(n):
    predict_data.append(input().strip())
# Reuse the vocabulary fitted on the training texts.
X_test  = vectorizer.transform(predict_data) 


clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(X_train, Y_train)
results=clf.predict(X_test)

# One predicted label per line, in input order.
for i in results:
    print(i)
Exemplo n.º 33
0
# Python 2 script section. Every classifier below is evaluated the same way:
# cross_val_score with cv=10 and n_jobs=4 on `feature_normal` / `labels`,
# printing the per-fold scores followed by their mean.
print "Accuracy", scores.mean()

print "\nUsing nearest centroid"
nc = NearestCentroid()
scores = cross_val_score(nc, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nnusvc"
nusvc = NuSVC()
scores = cross_val_score(nusvc, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nUsing Passive aggressive Classifier"
pac = PassiveAggressiveClassifier()
scores = cross_val_score(pac, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

print "\nUsing the perceptron"
# Perceptron without intercept, 10 iterations, no shuffling between epochs.
per = Perceptron(fit_intercept=False, n_iter=10, shuffle=False)
scores = cross_val_score(per, feature_normal, labels, cv=10, n_jobs=4)
print scores
print "Accuracy", scores.mean()

# This hangs my computer for some reason
#print "\n Using quadratic discriminant analysis"
#qda = QuadraticDiscriminantAnalysis(store_covariances=True)
#scores = cross_val_score(qda, feature_normal, labels, cv=10, n_jobs = 2)
#print scores
Exemplo n.º 34
0
# print (y_train)
# k_t, test_y, a_t = read_pan(pan_test)
print('Extracting asset')
train_x, f_names, chi, transformer = feature_extraction2(givenlabel, k)
# print ('Writing database ...')
# writeTrainToTxt(train_x,'feature.txt')
# # writeTrainToTxt(y_train,'y.txt')
# print ('written')

# Keep 80% of the extracted features for training; the rest is the test split.
train_X, test_X, train_y, test_y = train_test_split(train_x,
                                                    y_train,
                                                    train_size=0.80)
print('Mission successful')
# print(len(X_train))
# test_x,_,_,_=feature_extraction2(givenlabel,k_t)
# train_x=train_x.toarray().astype(np.float)
# print (train_x.dtype)

# train_x,f_names,chi,transformer=feature_extraction(givenlabel,k)
# print(simple_classify(RandomForestClassifier(),test_X,test_y,train_X,train_y))
# Run every classifier through the same helper with identical arguments.
print(simple_classify(RidgeClassifier(), test_X, test_y, train_X, train_y))
print(simple_classify(Perceptron(), test_X, test_y, train_X, train_y))
print(
    simple_classify(PassiveAggressiveClassifier(), test_X, test_y, train_X,
                    train_y))
print(
    simple_classify(RandomForestClassifier(), test_X, test_y, train_X,
                    train_y))
print(simple_classify(KNeighborsClassifier(), test_X, test_y, train_X,
                      train_y))
# The last argument is the training labels, as in every call above
# (previously `k`, the input handed to feature_extraction2, was passed).
print(simple_classify(MultinomialNB(), test_X, test_y, train_X, train_y))
Exemplo n.º 35
0
            if each_word not in stop_words
        ]
        text_lmtzr = [lmtzr.lemmatize(each_word) for each_word in text_clean]
        training_text.append(' '.join(text_lmtzr))

X_vector = vectorizer.fit_transform(training_text)

# Single-argument print call: same output on Python 2, valid syntax on Python 3.
print("  Actual number of tfidf features: %d" % X_vector.get_shape()[1])
# raw_input()

# Reduce the features to 100 SVD components, then normalise the result.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

X_train_lsa = lsa.fit_transform(X_vector)

# One classifier per feature representation.
passive_tfidf = PassiveAggressiveClassifier(n_iter=50)
passive_tfidf.fit(X_vector, training_class)

passive_lsa = PassiveAggressiveClassifier(n_iter=50)
passive_lsa.fit(X_train_lsa, training_class)

# Persist both models; `with` closes each file even if pickling fails.
file_Name = "global_intent_tfidf.p"
with open(file_Name, 'wb') as fileObject:
    pickle.dump(passive_tfidf, fileObject)

file_Name_lsa = "global_intent_lsa.p"
with open(file_Name_lsa, 'wb') as fileObject_lsa:
    pickle.dump(passive_lsa, fileObject_lsa)
Exemplo n.º 36
0
    #     parser.add_argument('--n_estimators', type=int, default=100)
    #     parser.add_argument('--min_samples_leaf', type=int, default=3)

    # args holds all passed-in arguments
    args = parser.parse_args()

    # Read in csv training file
    training_dir = args.data_dir
    train_data = pd.read_csv(os.path.join(training_dir, "train.csv"),
                             header=None,
                             names=None)

    # Labels are in the first column
    train_y = train_data.iloc[:, 0]
    train_x = train_data.iloc[:, 1:]

    ## --- Your code here --- ##

    ## TODO: Define a model
    model = PassiveAggressiveClassifier(max_iter=args.iter,
                                        random_state=0,
                                        tol=1e-3,
                                        validation_fraction=args.valid_frac)

    ## TODO: Train the model
    model.fit(train_x, train_y)

    ## --- End of your code  --- ##

    # Save the trained model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
Exemplo n.º 37
0
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the data
df = pd.read_csv("news.csv")
# Print the frame's shape and the rows returned by head()
shape = df.shape
print(shape)
head1 = df.head()
print(head1)
# The 'label' column holds the target values
labels = df.label
print(labels)

# Split the dataset: 20% held out for testing, fixed seed for repeatability
x_train, x_test, y_train, y_test = train_test_split(df['text'],
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=7)
# Initialize a TfidfVectorizer (English stop words removed, max_df=0.7)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
# Fit and transform the train set, then transform the test set with the same vocabulary
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)
# Initialize a PassiveAggressiveClassifier and fit it on the training features
pac = PassiveAggressiveClassifier(max_iter=100)
pac.fit(tfidf_train, y_train)
# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')
# Confusion matrix with rows/columns ordered FAKE, REAL
print(confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL']))
        stack[:, i], 6)
df_stack.to_csv(path +
                'feature/advertiser_id_tfidf_sgd_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
print('sgd特征已保存\n')

########################### pac(PassiveAggressiveClassifier) ################################
# Build stacking features from a PassiveAggressiveClassifier: for every split
# produced by `kfold`, a fresh model is fitted on the `tr` rows, its scores
# for the `va` rows are added into stack_train and its scores for the whole
# test set are accumulated in stack_test (divided by n_folds after the loop).
print('PAC stacking')
stack_train = np.zeros((train_feature.shape[0], number))
stack_test = np.zeros((test_feature.shape[0], number))
score_va = 0  # placeholder only; rebound on every fold below

for i, (tr, va) in enumerate(kfold.split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    pac = PassiveAggressiveClassifier(random_state=RANDOM_SEED)
    pac.fit(train_feature[tr], score[tr])
    # NOTE(review): _predict_proba_lr is an underscore-prefixed (private)
    # method of the estimator -- confirm it exists in the installed
    # scikit-learn version and returns `number` columns.
    score_va = pac._predict_proba_lr(train_feature[va])
    score_te = pac._predict_proba_lr(test_feature)
    print(score_va)
    # Reports the mean squared error between the true labels and the
    # predicted labels of the held-out rows.
    print('得分' +
          str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
    stack_train[va] += score_va
    stack_test += score_te
# Turn the summed per-fold test scores into their mean.
stack_test /= n_folds
# Train rows first, test rows after, one column per entry of the score matrix.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
    # One feature column per score column, rounded to 6 decimals.
    df_stack['advertiser_id_tfidf_pac_classfiy_{}'.format(i)] = np.around(
        stack[:, i], 6)
df_stack.to_csv(path +
def get_best_model(x_tweet_data,y_tweet_data, x_news_data, y_news_data):
    """Evaluate SVC, SGD and Passive-Aggressive candidates and return the best.

    Both text inputs go through ``prepare_data`` and are then transformed by
    the vectorizer returned from ``get_tf_idf_vectorizer`` (which is given the
    tweet data only). Labels are mapped through ``class_mapping`` using the
    first element of every label entry. Each candidate estimator is passed to
    ``best_model_selection``; the candidate with the highest returned F1 value
    is selected, ties going to the earliest candidate (SVC, SGD, PA).

    Args:
        x_tweet_data: tweet inputs accepted by ``prepare_data``.
        y_tweet_data: tweet labels; ``elem[0]`` of each entry is looked up in
            ``class_mapping``.
        x_news_data: news inputs accepted by ``prepare_data``.
        y_news_data: news labels, mapped the same way as the tweet labels.

    Returns:
        A ``(tf_idf_vectorizer, fitted_model)`` tuple, where ``fitted_model``
        is the model returned by ``best_model_selection`` for the winner.
    """
    x_tweet_data = prepare_data(x_tweet_data)
    x_news_data = prepare_data(x_news_data)

    tf_idf_vectorizer = get_tf_idf_vectorizer(x_tweet_data)

    x_tweet_data = tf_idf_vectorizer.transform(x_tweet_data)
    x_news_data = tf_idf_vectorizer.transform(x_news_data)

    y_tweet_data = [class_mapping.get(elem[0]) for elem in y_tweet_data]
    y_news_data = [class_mapping.get(elem[0]) for elem in y_news_data]

    # Parallel lists: f1_coll[i] is the score of model_coll[i].
    f1_coll = []
    model_coll = []

    def evaluate_candidate(model_name, curr_model):
        """Run model selection for one estimator, record and print its scores."""
        fitted_model, p, r, f, acc = best_model_selection(
            x_tweet_data, y_tweet_data, x_news_data, y_news_data,
            model_name=model_name, curr_model=curr_model,
            lower_lim=50, upper_lim=5000, step=50)

        # Each candidate is recorded exactly once (the SVC result used to be
        # appended twice by a copy-pasted pair of lines).
        f1_coll.append(f)
        model_coll.append(fitted_model)

        print('#################### ' + model_name + ' : ')
        print('precision : ', p)
        print('recall : ', r)
        print('max_f1: ', f)
        print('acc: ', acc)

    ########################### FOR SVC

    svc_model = SVC(C=1, kernel='linear', degree=4, gamma='auto', coef0=0.0, shrinking=True,
                probability=False, tol=0.001, cache_size=200, class_weight='balanced', verbose=False,
                max_iter=-1, decision_function_shape='ovr', random_state=None)
    evaluate_candidate('SVC', svc_model)

    ########################### FOR SGD

    sgd_model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True,
                              max_iter=None, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=1,
                              random_state=None,
                              learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                              average=False, n_iter=None)
    evaluate_candidate('SGD', sgd_model)

    ########################### FOR PA

    pa_model = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, max_iter=None, tol=0.001,
                                            shuffle=True, verbose=0, loss='hinge', n_jobs=1, random_state=None,
                                            warm_start=False,
                                            class_weight='balanced', average=True, n_iter=None)
    evaluate_candidate('PA', pa_model)

    # argmax returns the first maximum, so ties favour the earlier candidate.
    best_model_ind = np.argmax(f1_coll)

    return tf_idf_vectorizer, model_coll[best_model_ind]
Exemplo n.º 40
0
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Multinomial Naive Bayes fitted on count_train and evaluated on count_test:
# print the accuracy, then plot the FAKE/REAL confusion matrix.
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Testing
# Same evaluation for a Passive-Aggressive classifier on the TF-IDF features.
from sklearn.linear_model import PassiveAggressiveClassifier
# NOTE(review): `n_iter` is the older spelling of this argument (other
# snippets use `max_iter`) -- confirm it is accepted by the installed
# scikit-learn version.
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0, 1, .1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    if score > last_score:
 def __init__(self, cached_features):
     """Initialise the base model, then attach a Passive-Aggressive estimator."""
     BaseModel.__init__(self, cached_features)
     # Squared-hinge loss, C=1.0 and a fixed random_state.
     estimator = PassiveAggressiveClassifier(
         C=1.0,
         loss='squared_hinge',
         random_state=1,
     )
     self.model = estimator
def tweetread():
    """Train, report on and pickle a two-class tweet classifier.

    Texts read from ``collection_aa`` get label 0 (only the first 5000 are
    kept); texts read from ``collection_mapped`` get label 1 and are added
    until 10000 samples are held in total. A TF-IDF vectorizer and a
    PassiveAggressiveClassifier are fitted on that data, several metrics are
    printed, both objects are pickled to ``classifier.pickle`` and
    ``vector.pickle``, and finally every text of a wider ``collection_mapped``
    query that the classifier predicts as class 0 is printed.

    NOTE(review): X_test is a transform of the very texts the model was
    trained on, so all printed metrics are training-set scores.
    """
    data = []
    catagory = []

    # NOTE(review): the "$ne" operand is a plain string, not a compiled
    # pattern -- confirm it is meant as a literal value.
    results_traffic = collection_aa.find({"manualtype":{"$ne":"/^non*/"}})
    for item in results_traffic:
        # Fold accented characters to ASCII, then strip @mentions.
        text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
        text = re.sub(r"@([A-Za-z]+[A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
        print(text)
        data.append(str(text))
        catagory.append(0)
    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/18/09*"}})
    # Cap the class-0 samples at 5000.
    data = data[:5000]
    catagory = catagory[:5000]
    #docs = [{f["text"]:"TRAFFIC"} for f in results_traffic]
    print(len(data), " TRAFFIC SIZE ")

    for res in results_nontraffic:
        #print(len(res["item"]))
        for i in res["item"]:
            # Stop adding class-1 samples once 10000 samples are held.
            if len(data) < 10000:
                text = unicodedata.normalize('NFKD', i["text"]).encode('ascii','ignore').decode('utf-8')
                #if not check_in(['delays', 'crash', 'cleared'] , text):
                text = re.sub(r"@([A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
                print(text)
                data.append(text)
                catagory.append(1)
                #else:
                #    print(text)
    print(len(data), "SAMPLE SIZE ")
    vectorizer =  TfidfVectorizer(
        analyzer='word',  # features made of words
        token_pattern=r'[a-z]{3,}',
        use_idf=True,
        strip_accents='unicode',
        #ngram_range=(2,3),
        sublinear_tf=True, max_df=0.95, min_df=0.05,stop_words='english')
    #vectorizer =  DictVectorizer();

    X_train = vectorizer.fit_transform(data)
    X_test = vectorizer.transform(data)
    feature_names = vectorizer.get_feature_names()#np.vectorize(vectorizer.get_feature_names())
    print(feature_names)
    print(X_test)
    print(data[0])
    print(data[1])

    #BernoulliNB(alpha=.01)
    #nb_classifier = BernoulliNB(alpha=.01).fit(X_train, catagory)
    #nb_classifier = RidgeClassifier(tol=1e-2, solver="lsqr").fit(X_train, catagory)
    #nb_classifier = Perceptron(n_iter=50).fit(X_train, catagory)
    nb_classifier = PassiveAggressiveClassifier(n_iter=50).fit(X_train, catagory)
    #nb_classifier = MultinomialNB(alpha=.01).fit(X_train, catagory)
    y_nb_predicted = nb_classifier.predict(X_test)

    print("Dimensionality: %d" % nb_classifier.coef_.shape[0])
    show_most_informative_features(vectorizer, nb_classifier, n=50)
    # NOTE(review): the four names printed below are not assigned in this
    # function; presumably module-level values set elsewhere -- confirm.
    print("traffic     :"  + str(traffic_label))
    print("traffic score    #:"  + str(traffic_scores))
    print("non  :"  + str(nontraffic_label))
    print("non score #:"  + str(nontraffic_scores))

    # NOTE(review): the banner still names Naive Bayes although the fitted
    # model above is a PassiveAggressiveClassifier.
    print("MODEL: Multinomial Naive Bayes\n")

    print('The precision for this classifier is ' + str(metrics.precision_score(catagory, y_nb_predicted)))
    print('The recall for this classifier is ' + str(metrics.recall_score(catagory, y_nb_predicted)))
    print('The f1 for this classifier is ' + str(metrics.f1_score(catagory, y_nb_predicted)))
    print('The accuracy for this classifier is ' + str(metrics.accuracy_score(catagory, y_nb_predicted)))

    print('\nHere is the classification report:')
    print(classification_report(catagory, y_nb_predicted))
    print(metrics.confusion_matrix(catagory, y_nb_predicted, labels=[0,1]))

    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/*"}})
    data = data[:1000]
    catagory = catagory[:1000]
    #docs = [{f["text"]:"TRAFFIC"} for f in results_traffic]
    print(len(data), " TRAFFIC SIZE ")

    # Persist the model and the vectorizer. Both files are closed by the
    # `with` block; previously the vectorizer file was never closed.
    with open('classifier.pickle', 'wb') as f, open('vector.pickle', 'wb') as v:
        pickle.dump(nb_classifier, f)
        pickle.dump(vectorizer, v)

    # Classify every text of the wider query one by one and print the ones
    # predicted as class 0.
    for res in results_nontraffic:
        for item in res["item"]:
            text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
            X_test = vectorizer.transform([text])
            y_nb_predicted = nb_classifier.predict(X_test)
            #score = metrics.f1_score(X_test, y_nb_predicted)
            if y_nb_predicted == 0:
                #if check_in(['delays', 'crash', 'cleared'] , text):
                #print("PREDICTED", text)
                print("", text,"\\\\")
Exemplo n.º 43
0
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


# Benchmark each (label, estimator) pair in turn, printing a banner and the
# label before every run and collecting what benchmark() returns.
results = []
for name, clf in (
        ("Ridge Classifier", RidgeClassifier(tol=1e-2, solver="sag")),
        ("Perceptron", Perceptron(max_iter=50, tol=1e-3)),
        ("Passive-Aggressive",
         PassiveAggressiveClassifier(max_iter=50, tol=1e-3)),
        ("kNN", KNeighborsClassifier(n_neighbors=10)),
        ("Random forest", RandomForestClassifier(n_estimators=100)),
):
    print('=' * 80)
    print(name)
    outcome = benchmark(clf)
    results.append(outcome)

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
class DefaultConfig(object):
    """
    Parameter configuration.

    All settings are class attributes: file locations derived from
    ``project_path``, one default-constructed estimator per model family,
    the estimator selected for use, and a replacement switch. Every estimator
    listed below is instantiated when the class body is executed.
    """
    def __init__(self):
        pass

    # Count (NOTE(review): presumably the number of folds/rounds -- confirm)
    k = 5

    # Project path: the absolute path of this file with its last two
    # '/'-separated components removed
    project_path = '/'.join(os.path.abspath(__file__).split('/')[:-2])
    # Path of the stop-word file
    stopwords_path = project_path + '/data/stopwords/stopwords.txt'
    # Path of app_desc.dat
    app_desc_path = project_path + '/data/original/app_desc.dat'
    # Path of apptype_id_name.txt
    apptype_id_name_path = project_path + '/data/original/apptype_id_name.txt'
    # Path of apptype_train.dat
    apptype_train_path = project_path + '/data/original/apptype_train.dat'
    # Save path of the apptype_train_term_doc.h5 file
    apptype_train_term_doc_path = project_path + '/data/cache/apptype_train_term_doc.h5'
    # Path of the app_desc_term_doc.h5 file
    app_desc_term_doc_path = project_path + '/data/cache/app_desc_term_doc.h5'
    # app_desc_apptype: preliminary prediction for app_desc
    app_desc_apptype_path = project_path + '/data/cache/app_desc_apptype.h5'

    # Path of the apptype_train_classification.h5 file
    apptype_train_classification_path = project_path + '/data/cache/apptype_train_classification.h5'
    # Path of the app_desc_classification.h5 file
    app_desc_classification_path = project_path + '/data/cache/app_desc_classification.h5'

    # apptype_train_word_index.h5
    apptype_train_word_index_path = project_path + '/data/cache/apptype_train_word_index.h5'
    # app_desc_word_index.h5
    app_desc_word_index_path = project_path + '/data/cache/app_desc_word_index.h5'


    # Single models
    AdaBoostClassifier_model = AdaBoostClassifier()
    BaggingClassifier_model = BaggingClassifier()
    ExtraTreesClassifier_model = ExtraTreesClassifier()
    GradientBoostingClassifier_model = GradientBoostingClassifier()
    RandomForestClassifier_model = RandomForestClassifier()
    GaussianProcessClassifier_model = GaussianProcessClassifier()
    PassiveAggressiveClassifier_model = PassiveAggressiveClassifier()
    RidgeClassifier_model = RidgeClassifier(alpha=0.8, tol=0.1, solver="sag", normalize=True, max_iter=1000, random_state=2019)
    SGDClassifier_model = SGDClassifier()
    KNeighborsClassifier_model = KNeighborsClassifier()
    GaussianNB_model = GaussianNB()
    MLPClassifier_model = MLPClassifier()
    DecisionTreeClassifier_model = DecisionTreeClassifier()
    ExtraTreeClassifier_model = ExtraTreeClassifier()
    SVC_model = SVC()
    LinearSVC_model = LinearSVC()
    # XGBClassifier_model = XGBClassifier()
    # LGBMClassifier_model = LGBMClassifier()
    # NOTE(review): the names of LinearClassifierMixin and SparseCoefMixin
    # suggest mixin classes rather than trainable estimators -- confirm these
    # two attributes are actually wanted.
    LinearClassifierMixin_model = LinearClassifierMixin()
    RidgeClassifierCV_model = RidgeClassifierCV()
    SparseCoefMixin_model = SparseCoefMixin()

    # Selected model
    select_model = RidgeClassifier_model
    # select_model = 'lgb'
    # select_model = 'fast_text'

    # replace: whether to perform replacement
    not_replace = False
    if len(line)>2:
        targets.append(int(line[0]))
        docs.append(' '.join([i for i in line[1:] if not is_stopword(i)]))



# Bag of words over unigrams and bigrams, fitted on the collected documents.
count_vect = CountVectorizer(ngram_range=(1, 2), input='content')
X_train_counts = count_vect.fit_transform(docs)

# Re-weight the raw counts with TF-IDF.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Alternatives tried earlier and left disabled:
#svd = TruncatedSVD(n_components=55, random_state=7)
#X_train = svd.fit_transform(X_train_tf)
#clf = KNeighborsClassifier(n_neighbors=8).fit(X_train, targets)
#clf = BernoulliNB(alpha=.01)
#clf = LinearSVC()

# Passive-Aggressive classifier trained on the TF-IDF matrix
# (fit() returns the estimator itself).
clf = PassiveAggressiveClassifier(n_iter=9).fit(X_train_tf, targets)

def classify(content):
    """Return the classifier's predictions for the documents in ``content``.

    ``content`` is vectorised with the module-level ``count_vect``,
    re-weighted with the module-level ``tf_transformer`` and then passed to
    the module-level ``clf``.
    """
    # The module-level objects are only read here, never rebound, so no
    # `global` declarations are required.
    tfidf_features = tf_transformer.transform(count_vect.transform(content))
    return clf.predict(tfidf_features)

tc = int(raw_input())
inp = []
for tcc in range(tc):
Exemplo n.º 46
0
classifiers = {
    'keras_mlp': KerasClassifier(
        build_fn=create_mlp,
        nb_epoch=150,
        batch_size=64
    ),
    'svc_linear': LinearSVC(),
    'lr_lbfgs': LogisticRegression(
        C=2.02739770e+04,  # particle swarm optimised
        tol=6.65926091e-04,
        solver='lbfgs'
    ),
    'lr_lbfgs_default': LogisticRegression(solver='lbfgs'),
    'pa': PassiveAggressiveClassifier(
        C=0.01,
        fit_intercept=True,
        loss='hinge'
    ),
    'pa_default': PassiveAggressiveClassifier(),
    'gnb': GaussianNB(),
    'lda': LinearDiscriminantAnalysis(),
    'rf': RandomForestClassifier(
        n_estimators=200,
        criterion='gini',
        max_depth=4,
        min_samples_leaf=3,
        min_samples_split=3
    ),
    'xgb': XGBClassifier(
        n_estimators=200,
        max_depth=6,
Exemplo n.º 47
0
# Hold out 10% of the texts for testing.
X_train1, X_test1, y_train, y_test = train_test_split(
    train['text'], train['sentiment'], test_size=0.1)

# Fit the vectorizer on the TRAINING texts and only apply it to the test
# texts. The calls used to be the other way round, which built the
# vocabulary from the held-out split (leaking test data into the features
# and ignoring every word that only occurs in the training texts).
X_train = count_vect.fit_transform(X_train1)
X_test = count_vect.transform(X_test1)

# Individual models, all fitted on the same training matrix.
clf = MLPClassifier(alpha=1, random_state=65)
clf.fit(X_train, y_train)

clf2 = SVC(probability = True, gamma=2, C=1)
clf2.fit(X_train, y_train)

clf3 = DecisionTreeClassifier(random_state = 0)
clf3.fit(X_train, y_train)

clf4 = PassiveAggressiveClassifier()
clf4.fit(X_train, y_train)

clf5 = BaggingClassifier(random_state=54)
clf5.fit(X_train, y_train)

clf6 = ExtraTreesClassifier(random_state=0)
clf6.fit(X_train, y_train)

clf7 = GradientBoostingClassifier(random_state=32)
clf7.fit(X_train, y_train)

# Weighted soft-voting ensemble over five of the models above
# (clf2 and clf4 are trained but not part of the vote).
vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)
def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False):
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances,num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    if discreteHelpfulness:
        classlabels = encoder.fit_transform(labels)
    newData = False

    count = 0
    if useRST:
      print 'Getting RST data'
      nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)

      random = RandomFeatureExtractor()
      lengthBaseline = LenFeatureExtractor()
      fullRST = FullPickledRSTFeatureExtractor(nums)  if newData else FullTextRSTFeatureExtractor(nums)
      limitedRST = LimitedPickledRSTFeatureExtractor(nums)  if newData else LimitedTextRSTFeatureExtractor(nums)
      vectorizer =  FeatureUnion([('extra',limitedRST),('tfid',tfidvec)])

      print 'Fitting random features baseline'
      random.fit(texts)
      print 'Fitting text length baseline'
      lengthBaseline.fit(texts)
      print 'Fitting full RST features'
      fullRST.fit(texts)
      print 'Fitting limited RST features'
      limitedRST.fit(texts)
      print 'Fitting limited RST with tfidvec features'
      vectorizer.fit(texts)
      print 'Fitting tfidvec features'
      tfidvec.fit(texts)

      split = int(0.8*len(ilabels))
      trainData = (texts[:split],ilabels[:split])
      testData = (texts[split:],ilabels[split:])      

      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      dummy = DummyClassifier()
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      dummy.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
      print 'Dummy label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector)
      print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector)
      print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector)
      print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector)
      print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector)
      learner = learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector)
      print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))


    else:
      vectorizer = tfidvec
      testData = None
      vocabGotten = False
      instances = ([],[])
      numVocab = 50000
      numTest = 50000
      numTrain = 100000
      maxTrainStages = 20
      for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness):
          if label!='few' or useFew:
            instances[0].append(text)
            instances[1].append(label)
            if not vocabGotten and len(instances[0]) == numVocab:
                if printStages:
                    print 'Fitting vocabulary with %d instances'%numVocab
                vectorizer.fit(instances[0],None)
                if selector is not None:
                    X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None)
                    selector.fit(X,y)
                vocabGotten = True
                instances = ([],[])
            elif vocabGotten and testData is None and len(instances[0]) == numTest:
                if printStages:
                    print 'Getting test data with %d instances'%numTest
                testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                instances = ([],[])
            elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                if discreteHelpfulness:
                    learner = learner.partial_fit(X,y, classes = classlabels)
                else:
                    learner = learner.partial_fit(X,y)
                instances = ([],[])
                count = count + 1
                if printStages:
                    print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1]))
            elif count == maxTrainStages:
                break
      print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
Exemplo n.º 49
0

# Load the flagged tweets; the file's own header row is skipped and replaced
# by the column names given here.
tweets = pd.read_csv(
    'train_data_flag.csv',
    header=None,
    skiprows=1,
    names=["name","id","description","friends","followers","location","tweet","flag"],
)

# Replace the numeric flag with a readable class name.
tweets.loc[(tweets['flag'] == 1) , ['flag']] = 'DEPRESSED'
tweets.loc[(tweets['flag'] == 0) , ['flag']] = 'NORMAL'

features, labels = tweets['tweet'], tweets['flag']

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, random_state=42, test_size=0.2)

# TF-IDF features: fitted on the training tweets, applied to both splits.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

# Train the classifier (fit() returns the estimator itself).
pa_classifier = PassiveAggressiveClassifier(max_iter=50).fit(tfidf_train, y_train)

# Accuracy on the held-out tweets.
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# Persist the vectorizer first, then the classifier.
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(pa_classifier, "pa_classifier.pkl")
Exemplo n.º 50
0
### Training

batch_size = 1000
n_pairs = 516000
train_set_size = 510000
n_batches = int(train_set_size / batch_size)
classes = [0, 1]

# Linear models that are all trained incrementally, batch by batch.
classifiers = [
    ("clfP", Perceptron(penalty = None)),
    ("clfPL1", Perceptron(penalty = 'l1')),
    ("clfSGDCL1", SGDClassifier(learning_rate = 'constant', penalty = 'l1', loss = 'hinge', eta0 = 1)),
    ("clfSGDO", SGDClassifier(learning_rate = 'optimal', penalty = 'l2', loss = 'hinge')), # SGD()
    ("clfSGDOL1", SGDClassifier(learning_rate = 'optimal', penalty = 'l1', loss = 'hinge')),
    ("clfSGDOLOG", SGDClassifier(learning_rate = 'optimal', penalty = 'l2', loss = 'log', eta0 = 1)),
    ("clfPAH", PassiveAggressiveClassifier(loss = 'hinge')),
    ("clfPASH", PassiveAggressiveClassifier(loss = 'squared_hinge'))
    # Removed:
    #("clfPL2", Perceptron(penalty = 'l2')),
    #("clfSGDC", SGDClassifier(learning_rate = 'constant', penalty = 'l2', loss = 'hinge', eta0 = 1)),
    #("clfSGDCLOG", SGDClassifier(learning_rate = 'constant', penalty = 'l2', loss = 'log', eta0 = 1)),
]

log('n_pairs: ' + str(n_pairs) + ' train_set_size: ' + str(train_set_size) + ' n_batches: ' + str(n_batches) + ' batch_size: ' + str(batch_size))

for j in range(n_batches):
    start_batch = time.time()
    # Raw strings keep the backslashes literal without relying on invalid
    # escape sequences ('\B', '\X', '\y'); the resulting paths are unchanged.
    # The `with` blocks close each batch file right after it has been read
    # instead of leaving two open handles behind per iteration.
    with open(r'.\Batches\X.' + str(j), 'rb') as batch_file:
        X = pickle.load(batch_file)
    with open(r'.\Batches\y.' + str(j), 'rb') as batch_file:
        y = pickle.load(batch_file)
    # Every classifier sees the same batch; `classes` must be supplied so
    # partial_fit knows the full label set from the first call on.
    for name, clf in classifiers:
        clf.partial_fit(X, y, classes)
Exemplo n.º 51
0
def main():
    """Train several classifiers on the training set and score them on the test set.

    The test features are first multiplied column-wise by the weights read
    from ``./data/Affy2Weights.csv``; both sets are then standardised per
    sample and per feature, a two-component PCA scatter plot is saved to
    ``<name>.png``, and the list of classifiers is fitted and evaluated ten
    times. Each repetition writes two text reports (``<name>_<k>.txt`` and
    ``<name>_<k>_.txt``); the predictions and the test accuracies of all
    repetitions are finally saved through ``saveMatrix``.

    Returns:
        None.
    """
    X_training, X_test, y_training, y_test, featureNames, classValue, name = loadTrainingAndTest(
    )

    print("Pre-processing data...")

    dfFeaturesTest = read_csv("./data/Affy2Weights.csv", header=None)

    # First column of the weights file: one weight per feature column.
    values = dfFeaturesTest.values[:, 0]

    print(values)
    # Scale every test feature (column i) by its weight, in place.
    for i in range(0, len(X_test[0])):
        for j in range(0, len(X_test)):
            X_test[j, i] = X_test[j, i] * values[i]

    if True:
        # let's normalize the data by sample
        # (transposing makes each sample a column for StandardScaler)
        scaler_sample = StandardScaler()
        scaler_sample2 = StandardScaler()
        X_training = scaler_sample.fit_transform(X_training.T).T
        X_test = scaler_sample2.fit_transform(X_test.T).T

    if True:
        # also normalize by feature: fitted on training data, applied to test
        scaler = StandardScaler()
        X_training = scaler.fit_transform(X_training)
        X_test = scaler.transform(X_test)

    # time to plot a PCA
    if True:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        pca.fit(X_training)
        X_pca_train = pca.transform(X_training)
        X_pca_test = pca.transform(X_test)

        import matplotlib.pyplot as plt
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Boolean mask of the training samples belonging to classValue.
        class3 = [(y == classValue) for y in y_training]

        ax.plot(X_pca_train[:, 0], X_pca_train[:, 1], 'b.', label="tcga data")
        ax.plot(X_pca_train[class3, 0],
                X_pca_train[class3, 1],
                color='orange',
                marker='.',
                linestyle='None',
                label=("tcga data, class " + str(classValue)))
        ax.plot(X_pca_test[:, 0], X_pca_test[:, 1], 'r.', label=name + " data")

        ax.legend(loc='best')
        ax.set_title("TCGA vs " + name + " data")
        ax.set_xlabel("PCA 0")
        ax.set_ylabel("PCA 1")

        plt.savefig(name + ".png")

    # results: one column per repetition, the predictions of all classifiers
    # stacked row after row. results2: one test accuracy per classifier (row)
    # and repetition (column). Both are allocated with room for 9 classifiers.
    results = np.zeros(shape=(X_test.shape[0] * 9, 10))
    results2 = np.zeros(shape=(9, 10))
    j = 0
    for k in range(0, 10):

        ## FINALLY, WE CAN CLASSIFY AWAY!
        classifierList = [

            #[RandomForestClassifier(), "RandomForestClassifier()"],
            [LogisticRegression(), "LogisticRegression"],  # coef_
            [PassiveAggressiveClassifier(),
             "PassiveAggressiveClassifier"],  # coef_
            [SGDClassifier(), "SGDClassifier"],  # coef_
            [
                SVC(kernel='linear'), "SVC(linear)"
            ],  # coef_, but only if the kernel is linear...the default is 'rbf', which is NOT linear
            [RidgeClassifier(), "RidgeClassifier"],  # coef_
            [
                BaggingClassifier(n_estimators=300),
                "BaggingClassifier(n_estimators=300)"
            ],
            [
                GradientBoostingClassifier(n_estimators=300),
                "GradientBoostingClassifier(n_estimators=300)"
            ],
            [
                RandomForestClassifier(n_estimators=300),
                "RandomForestClassifier(n_estimators=300)"
            ],
        ]

        # The `with` block closes both report files even when a classifier
        # raises part-way through the repetition.
        with open(name + "_" + str(k) + "_.txt", 'w') as f2, \
                open(name + "_" + str(k) + ".txt", 'w') as f:
            l = 0
            ##for i in range(0, 10) :
            for originalClassifier, classifierName in classifierList:
                f.write("\nClassifier " + classifierName + "\n")
                f2.write("\nClassifier " + classifierName + "\n")
                print("\nClassifier " + classifierName)
                # Work on a copy so the listed estimator itself stays unfitted.
                classifier = copy.deepcopy(originalClassifier)
                classifier.fit(X_training, y_training)
                y_train_pred = classifier.predict(X_training)
                y_test_pred = classifier.predict(X_test)

                scoreTraining = accuracy_score(y_train_pred, y_training)
                scoreTest = accuracy_score(y_test_pred, y_test)

                f.write("Training accuracy: %.4f; Test accuracy: %.4f \n" %
                        (scoreTraining, scoreTest))
                f2.write("Training accuracy: %.4f; Test accuracy: %.4f \n" %
                         (scoreTraining, scoreTest))
                print("Training accuracy: %.4f; Test accuracy: %.4f" %
                      (scoreTraining, scoreTest))
                # Only the first report lists every single prediction.
                f.write("Complete classification on test: \n")
                for i in range(0, y_test.shape[0]):
                    f.write("%d \n" % (y_test_pred[i]))
                    results[j, k] = y_test_pred[i]
                    j = j + 1
                results2[l, k] = scoreTest
                l = l + 1
        # Restart the prediction row counter for the next repetition.
        j = 0
    saveMatrix(name + "_results.csv", results)
    saveMatrix(name + "results2.csv", results2)
    return
   #  "there there there"]
#y = [1,1,1,1]
trans = vectorizer.fit_transform(x)
#print  vectorizer.transform(["I am in a tree tree"]).toarray()
#print vectorizer.get_feature_names()
#print trans.toarray()
#print sorted(vectorizer.vocabulary_)
print len(vectorizer.vocabulary_)


K = 1
from sklearn import neighbors
#clf = neighbors.KNeighborsClassifier(K,weights = 'distance', leaf_size= 30)

from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(trans, y)

#f = open("testDatatextClassification.txt",'r')
f = open("input01.txt",'r')
f2 = open("output01.txt","r")
d = f.readlines()
d = d[1:]
ans = map(int,f2.readlines())
t0= time.clock()

summing = 0;
for j,i in enumerate(d):
    sol = int(clf.predict(vectorizer.transform([i]).toarray())[0])
    #print sol, ans[j]
    if (sol==ans[j]):
Exemplo n.º 53
0
                     default='squared_hinge',
                     help='loss function for model')
 parser.add_argument('-v',
                     '--validate',
                     action='store_true',
                     help='do validation')
 parser.add_argument(
     '--avg',
     type=int,
     help=
     'average this # of models together, trained on different data order')
 start = datetime.now()
 args = parser.parse_args()
 print args
 model = PassiveAggressiveClassifier(C=args.C,
                                     loss=args.loss,
                                     warm_start=True)
 print 'Using model:'
 print model
 if args.validate:
     if args.avg is None:
         validate(model, args.infile, args.passes, args.bits)
     else:
         avg_validate(args.avg, model, args.infile, args.passes, args.bits)
 else:
     if args.avg is None:
         run_all(model, args.infile, args.passes, args.bits, args.submit_id)
     else:
         avg_run_all(args.avg, model, args.infile, args.passes, args.bits,
                     args.submit_id)
 finish = datetime.now()
Exemplo n.º 54
0
# Hold out 20% of the samples as the test set (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)

# TF-IDF features: fit the vectorizer on the training texts, then apply the
# fitted vectorizer to the test texts
from sklearn.feature_extraction.text import TfidfVectorizer
TDfVector = TfidfVectorizer(max_df=0.7, stop_words='english')
TDfVector_train = TDfVector.fit_transform(X_train)
TDfVector_test = TDfVector.transform(X_test)

# Passive-Aggressive classifier trained on the TF-IDF matrix
# (fit() returns the estimator itself)
from sklearn.linear_model import PassiveAggressiveClassifier
PAClassifier = PassiveAggressiveClassifier(max_iter=50).fit(
    TDfVector_train, y_train)

# Predictions for the held-out texts
y_pred = PAClassifier.predict(TDfVector_test)

# Accuracy as a percentage, followed by the confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix
Score = accuracy_score(y_test, y_pred)
final = Score*100
print(f'Accuracy Score : {round(final)}%')
cm = confusion_matrix(y_test, y_pred)


Exemplo n.º 55
0
def model_selection(X, y, X_pred, donation_columns, cat_col,
                    no_donation_columns, skewed_target_value):
    """Write the data/glossary sections of the PDF report and benchmark classifiers.

    The function first appends several sections to the module-level ``pdf``
    (train/test split sizes, donation columns, categorical columns, a glossary
    of modeling terms), then fits every candidate classifier on an 80/20 split
    of ``X``/``y`` and collects its metrics and its predictions for ``X_pred``.

    Args:
        X: Feature matrix used for the train/test split.
        y: Target values aligned with ``X``.
        X_pred: Feature matrix every fitted model predicts on.
        donation_columns: Names of the donation columns listed in the report.
        cat_col: Names of the categorical columns listed in the report. When
            there are more than 26, a random sample of 26 is listed.
        no_donation_columns: Truthy when the input has no donation columns.
        skewed_target_value: Truthy when the target is too imbalanced to show
            donation columns.

    Returns:
        A 9-tuple of dicts keyed by classifier label: weighted F1 score,
        predictions for ``X_pred``, predicted probabilities for ``X_pred``,
        feature importances / coefficients, ROC false-positive rates, ROC
        true-positive rates, ROC AUC, the test targets and the test
        predictions.
    """
    # Estimators without predict_proba; they are wrapped in
    # CalibratedClassifierCV so probabilities can be produced.
    no_proba_labels = ('PassiveAggressiveClassifier', 'LinearSVC',
                       'RidgeClassifier')
    tree_labels = ('DecisionTreeClassifier', 'RandomForestClassifier')

    def write_line(txt, gap):
        """Emit one standard-height report line followed by a vertical gap."""
        pdf.multi_cell(h=5.0, w=0, txt=txt)
        pdf.ln(gap)

    models = [
        {'label': 'LogisticRegression', 'model': LogisticRegression()},
        {'label': 'RidgeClassifier', 'model': RidgeClassifier()},
        {'label': 'MultinomialNB', 'model': MultinomialNB()},
        {'label': 'ComplementNB', 'model': ComplementNB()},
        {'label': 'BernoulliNB', 'model': BernoulliNB()},
        {'label': 'DecisionTreeClassifier', 'model': DecisionTreeClassifier()},
        {'label': 'SGDClassifier', 'model': SGDClassifier(loss='log')},
        {
            'label': 'PassiveAggressiveClassifier',
            'model': PassiveAggressiveClassifier()
        },
        {'label': 'LinearSVC', 'model': LinearSVC()},
        {'label': 'RandomForestClassifier', 'model': RandomForestClassifier()},
    ]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # ---- Report: train/test split sizes -------------------------------
    if not no_donation_columns and not skewed_target_value:
        pdf.ln(1)
        write_line(
            "              a. 80% ({}) of data used for training the model"
            .format(convert_number_format(X_train.shape[0])), 0.5)
        write_line(
            "              b. 20% ({}) of data used for testing the model"
            .format(convert_number_format(X_test.shape[0])), 0.5)

    # Bullet letters 'a'..'z' used for the lettered lists below.
    test_list = [chr(x) for x in range(ord('a'), ord('z') + 1)]

    # ---- Report: donation columns -------------------------------------
    if no_donation_columns:
        write_line(
            "     2. Donation Columns: The uploaded donor file is missing donation "
            "information (amount) to show donation column(s).", 0.5)
    elif skewed_target_value:
        write_line(
            "     2. Donation Columns: The uploaded donor file has an imbalanced dataset to"
            " show donation column(s). More than", 0.25)
        write_line(
            "         98% of your sample belongs to one class (0 or 1 Target Value) that make up a "
            "large proportion of the data.", 0.5)
    else:
        write_line("     2. Donation Columns:", 0.5)
        # zip() stops after the last available letter, so more than 26
        # donation columns no longer raise IndexError; only the first 26
        # are listed.
        for letter, column in zip(test_list, donation_columns):
            write_line("              {}. {}".format(letter, column), 0.3)
        pdf.ln(0.5)

    # ---- Report: categorical columns ----------------------------------
    if len(cat_col) > len(test_list):
        # Only 26 bullet letters exist: list a random sample of the columns.
        cat_col = random.sample(cat_col, len(test_list))
    if len(cat_col) != 0:
        write_line("     3. Categorical Columns:", 0.5)
        for letter, column in zip(test_list, cat_col):
            write_line("              {}. {}".format(letter, column), 0.3)
    else:
        write_line(
            "     3. Categorical Columns: No categorical columns identified on the uploaded "
            "donor file.", 0.5)

    # ---- Report: glossary ---------------------------------------------
    print_steps_taken()
    pdf.set_font(font_style, 'BU', size=10)
    pdf.multi_cell(h=7.5,
                   w=0,
                   txt="C. Important Terms Used in Predictive Modeling")
    pdf.set_font(font_style, size=10)
    pdf.ln(1)

    # (text, vertical gap after the line). A 0.25 gap marks a line whose
    # sentence continues on the next entry; 0.5 ends the glossary item.
    glossary = (
        ("     1. F1-score: It is a harmonic mean of precision and recall.",
         0.5),
        ("     2. Precision: It is a fraction of correctly classified instances among all "
         "predicted instances.", 0.5),
        ("     3. Recall: It is a fraction of correctly classified instances among all "
         "actual/valid instances.", 0.5),
        ("     4. Support: Number of samples used for the experiment.", 0.5),
        ("     5. Confusion Matrix Plot: It is a plot of the true count (x-axis) versus "
         "predicted count (y-axis) for both the classes", 0.25),
        ("         (donor and non-donor). The top left box represents the count of true "
         "negatives, the top right box represents the", 0.25),
        ("         count of false negatives, bottom left box represents the count of false "
         "positives and bottom right box represents", 0.25),
        ("         the count of true positives.", 0.5),
        ("     6. Feature Importance Plot: Y-axis: feature present in input file and "
         "X-axis: relative % of feature importance.", 0.5),
        ("     7. Correlation Plot: Correlation explains how one or more variables are "
         "related to each other.", 0.5),
        ("     8. Probability Score: It is a probability (likelihood) of an individual to "
         "donate.", 0.5),
        ("     9. Threshold Value: It is the threshold (cut-off) value used on a probability "
         "score to separate a donor from a", 0.25),
        ("         non-donor.", 0.5),
        ("     10. Predicted Classification (0 and 1): Classification value 1 indicates an "
         "individual likely to donate and classification", 0.25),
        ("         value 0 indicates an individual less likely to donate. They follow the "
         "threshold (cut-off) value logic.", 0.5),
    )
    for txt, gap in glossary:
        write_line(txt, gap)
    pdf.ln(3)

    # ---- Model benchmarking -------------------------------------------
    plt.figure(figsize=(15, 10))
    model_f1_score = {}
    classification_full_pred = {}
    classification_full_pred_prob = {}
    feature_importance_dict = {}
    roc_fpr = {}
    roc_tpr = {}
    roc_auc = {}
    y_test_dict = {}
    y_pred_dict = {}
    for m in models:
        label = m['label']
        start_time = time.time()
        model = m['model']
        needs_calibration = label in no_proba_labels
        if needs_calibration:
            model = CalibratedClassifierCV(model)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        classification_full_pred[label] = model.predict(X_pred)
        classification_full_pred_prob[label] = model.predict_proba(X_pred)

        print("Classifier: {} and time(seconds): {}".format(
            label, round(time.time() - start_time, 3)))
        print()
        model_f1_score[label] = round(
            f1_score(y_test, y_pred, average='weighted'), 2)
        y_test_dict[label] = y_test
        y_pred_dict[label] = y_pred

        # The last importance/coefficient entry is dropped in every branch.
        # NOTE(review): presumably the last column of X is not a reportable
        # feature -- confirm against the caller.
        if label in tree_labels:
            feature_value = model.feature_importances_[:-1]
        elif needs_calibration:
            # The calibration wrapper exposes no coef_, so the bare estimator
            # is fitted separately to read its coefficients. `model` stays
            # bound to the bare estimator for the ROC computation below.
            model = m['model']
            model.fit(X_train, y_train)
            feature_value = model.coef_[0][:-1]
        elif label in ['GaussianNB']:
            # No coefficients to report; also skips the ROC bookkeeping.
            continue
        else:
            feature_value = model.coef_[0][:-1]

        feature_importance_dict[label] = feature_value

        fpr, tpr, auc = calculate_fpr_tpr(model, y_test, y_pred, X_test)
        roc_fpr[label] = fpr
        roc_tpr[label] = tpr
        roc_auc[label] = auc

    return (model_f1_score, classification_full_pred,
            classification_full_pred_prob, feature_importance_dict, roc_fpr,
            roc_tpr, roc_auc, y_test_dict, y_pred_dict)
        f = list(all_features - {'<BIAS>'})[0]
        flt_res = get_res(x, feature_filter=lambda name, _: name != f)
        flt_features = get_all(flt_res.targets[0].feature_weights)
        assert flt_features == (all_features - {f})
        return True
    return False


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')],
    [LogisticRegression(random_state=42, fit_intercept=False)],
    [LogisticRegressionCV(random_state=42)],
    [SGDClassifier(**SGD_KWARGS)],
    [SGDClassifier(loss='log', **SGD_KWARGS)],
    [PassiveAggressiveClassifier(random_state=42)],
    [Perceptron(random_state=42)],
    [RidgeClassifier(random_state=42)],
    [RidgeClassifierCV()],
    [LinearSVC(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
])
def test_explain_linear(newsgroups_train, clf):
    """Check prediction explanations for each parametrized linear classifier.

    Every classifier is checked through ``explain_prediction``; a
    ``OneVsRestClassifier`` is additionally checked through
    ``explain_prediction_sklearn``.
    """
    explainers = [explain_prediction]
    if isinstance(clf, OneVsRestClassifier):
        explainers.append(explain_prediction_sklearn)
    for explain in explainers:
        assert_multiclass_linear_classifier_explained(
            newsgroups_train, clf, explain)


@pytest.mark.parametrize(['clf'], [
Exemplo n.º 57
0
def get_base_model():
    """Return the base estimators keyed by name.

    The mapping holds a single entry: a passive-aggressive classifier limited
    to 1000 iterations with a stopping tolerance of 1e-3.
    """
    classifier = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
    return {'passive_aggressive_classifier': classifier}
Exemplo n.º 58
0
def create_models(headlines):
    """Preprocess headlines, train the active classifier pipeline(s) and plot scores.

    Steps: remap label -1 to 0, tokenize / remove stop words / lemmatize the
    headline text, split off 2% of the rows (half of which is used as the
    validation set), evaluate every active classifier inside a TF-IDF
    pipeline via ``train_test_and_evaluate`` and show the scores as a bar plot.

    Args:
        headlines: DataFrame with a ``headline`` (text) and a ``label``
            column. It is modified in place: labels equal to -1 become 0 and
            ``headline`` is replaced by the preprocessed text.

    Returns:
        dict mapping the classifier display name to the value returned by
        ``train_test_and_evaluate`` for that classifier.

    Notes carried over from earlier experiments (recorded by the author):
        * Searching for the "best" ``random_state`` (1..19) with a
          DummyClassifier and tuning K for k-fold were tried; the k-fold
          accuracy was worse than the plain split.
        * Balancing the data set with SMOTE did not improve accuracy.
        * Grid search over LogisticRegression / RidgeClassifier parameters
          was tried and found to be very time consuming.
        * Last recorded accuracies: LogisticRegression 0.77838,
          LinearSVC 0.7761956, AdaBoost 0.66687; RandomForest was aborted
          because it took too long.
        * Spell correction (ekphrasis ``SpellCorrector``) was tried and
          disabled because it needs too much RAM and time.
    """
    # Use 0/1 labels: the negative class arrives encoded as -1.
    headlines.loc[headlines['label'] == -1, 'label'] = 0

    print("Headlines", headlines['headline'].shape)
    print("Labels", headlines['label'].shape)

    neg = sum(headlines.label == 0)
    pos = sum(headlines.label == 1)
    print("Neg", neg)
    print("Pos", pos)
    print("Class difference: ", abs(pos - neg))

    run_stats = pd.DataFrame()
    print(headlines.head())

    # ---- Text preprocessing -------------------------------------------
    @lru_cache(maxsize=50000)
    def tokenization(text):
        """Split text on runs of non-word characters."""
        # The cached list is shared between identical headlines; the later
        # steps build new lists and never mutate it.
        return re.split(r'\W+', text)

    headlines['headline'] = headlines['headline'].apply(
        lambda text: tokenization(text.lower()))
    print(headlines.head())

    # A set gives O(1) membership tests instead of scanning a list per word.
    stopword = set(nltk.corpus.stopwords.words('english'))

    def remove_stopwords(words):
        """Drop English stop words from a token list."""
        return [word for word in words if word not in stopword]

    headlines['headline'] = headlines['headline'].apply(remove_stopwords)
    print(headlines.head())

    lm = nltk.WordNetLemmatizer()

    def lemmatizer(words):
        """Lemmatize every token of a token list."""
        return [lm.lemmatize(word) for word in words]

    print("Lemmatizer")
    headlines['headline'] = headlines['headline'].apply(lemmatizer)
    print(headlines.head(10))

    # Back to plain strings for the vectorizer.
    headlines['headline'] = headlines['headline'].str.join(" ")
    print(headlines.head())

    # ---- Train / validation split -------------------------------------
    # Read the columns only now: Series taken before the preprocessing above
    # would still hold the raw text (and possibly the -1 labels), so the
    # models would silently be trained on unprocessed data.
    headline_train, headline_test, label_train, label_test = train_test_split(
        headlines['headline'], headlines['label'], test_size=.02)
    # Half of the held-out rows form the validation set; the other half is
    # not used by this function.
    x_validation, _x_test, y_validation, _y_test = train_test_split(
        headline_test, label_test, test_size=.5)
    print(headline_train.shape)
    print(headline_test.shape)

    vect = TfidfVectorizer(ngram_range=(1, 3))

    # ---- Candidate classifiers ----------------------------------------
    # All configurations that were tried; only the names in `active` are run.
    candidates = {
        "Logistic Regression": LogisticRegression(C=1.0,
                                                  class_weight="balanced",
                                                  solver="liblinear",
                                                  multi_class="ovr",
                                                  verbose=100,
                                                  random_state=42),
        "Complement Naive Bayes": ComplementNB(),
        "Linear SVC": LinearSVC(C=0.1, verbose=100, random_state=42),
        "Passive Aggressive": PassiveAggressiveClassifier(),
        "Multinomial Bayes": MultinomialNB(alpha=10),
        "Naive Bayes": BernoulliNB(),
        "Ridge Classifier": RidgeClassifier(solver='lsqr', random_state=42),
        "Random Forest": RandomForestClassifier(max_depth=30,
                                                n_estimators=4000,
                                                verbose=100,
                                                n_jobs=2),
        "SVC": SVC(gamma=0.5, C=100, kernel="linear", verbose=100),
    }
    active = ["Ridge Classifier"]

    results = dict()
    for name in active:
        algo = candidates[name]
        ug_pipeline = Pipeline([('vectorizer', vect), ('classifier', algo)])
        print("Classifier : ", algo)
        results[name] = train_test_and_evaluate(ug_pipeline, headline_train,
                                                label_train, x_validation,
                                                y_validation)

    # ---- Reporting -----------------------------------------------------
    dframe = pd.DataFrame.from_dict(results, orient="index").reset_index()
    dframe.columns = ["classifier", "prediction"]
    # sort_values returns a new frame; keep it so the bars are ordered.
    dframe = dframe.sort_values(by=["prediction"], ascending=False)
    print(results)
    sns.barplot(x='classifier', y='prediction', data=dframe)
    plt.title("TFidf Vectorizer, n-gram=3")
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()

    pprint(run_stats)

    return results
Exemplo n.º 59
0
# Confusion matrix for the predictions (`pred`) computed earlier in the script.
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


# Multinomial naive Bayes on the count features.
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


# Testing
# Passive-aggressive linear classifier on the TF-IDF features.
from sklearn.linear_model import PassiveAggressiveClassifier
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, where
# passing it raises a TypeError. `max_iter=50` with `tol=None` keeps the
# original behaviour of exactly 50 passes without early stopping.
linear_clf = PassiveAggressiveClassifier(max_iter=50, tol=None)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])



clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
Exemplo n.º 60
0
# Fractions of the data held out for testing and for validation.
test_ratio = 0.2
val_ratio = 0.15

# Mini-batch size and reporting cadence.
batch_size = 256
validate_every = 50
print_every = 2 * batch_size

# Hyper-parameter candidates: loss function, regularisation strength and
# elastic-net mixing ratio.
loss = ['hinge', 'log']
alpha = [1e-6, 1e-5, 1e-4, 1e-3]
l1_ratio = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Classifiers keyed by display name, filled in a fixed insertion order.
# Every estimator except the naive Bayes one is seeded with the shared
# `random_state`.
partial_fit_classifiers = {}
partial_fit_classifiers['SGD-SVM'] = SGDClassifier(
    random_state=random_state, loss='hinge')
partial_fit_classifiers['SGD-Log'] = SGDClassifier(
    random_state=random_state, loss='log')
partial_fit_classifiers['Perceptron'] = Perceptron(random_state=random_state)
partial_fit_classifiers['NB Multinomial'] = MultinomialNB(alpha=0.01)
partial_fit_classifiers['Passive-Aggressive'] = PassiveAggressiveClassifier(
    random_state=random_state)


def get_batchnames(split_val=True):
    """
    shuffle train and test set then split train set further to train set and validation set
    :return: pickle filenames of train/validation/test batches
    """
    np.random.seed(random_state)
    test_batch_names = glob.glob(os.path.join('dataset/batches/test_batches', '*.pickle'))
    test_batch_names = np.random.choice(test_batch_names, len(test_batch_names), replace=False)
    train_batch_names = glob.glob(os.path.join('dataset/batches/train_batches', '*.pickle'))
    train_batch_names = np.random.choice(train_batch_names, len(train_batch_names), replace=False)

    if split_val: