def test_classifier_refit():
    # A second fit() with different features/labels must fully relearn classes_.
    model = PassiveAggressiveClassifier(max_iter=5).fit(X, y)
    assert_array_equal(model.classes_, np.unique(y))

    # Drop one feature column and switch to string labels.
    model.fit(X[:, :-1], iris.target_names[y])
    assert_array_equal(model.classes_, iris.target_names)
Exemplo n.º 2
0
def constructPickles(filename):
    """Train a PassiveAggressiveClassifier on a labelled CSV and pickle it.

    Parameters
    ----------
    filename : str
        Path to a CSV with at least 'text' and 'label' columns.

    Side effects: prints test accuracy and writes "testPickle" (the trained
    model) and "testPickleVector" (the fitted TF-IDF vectorizer) via joblib.
    """
    dataDF = pd.read_csv(filename)
    labels = dataDF.label

    # Hold out 20% of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        dataDF['text'], labels, test_size=0.2, random_state=7)

    # Fit TF-IDF on the training split only; transform the test split.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # Train the classifier.
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # FIX: the original reloaded stale pickles ("testPickle"/"testPickleVector")
    # at this point, silently discarding the model that was just trained, then
    # re-dumped the loaded objects. Evaluate and persist the fresh model instead.
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {round(score * 100, 2)}%')

    joblib.dump(pac, "testPickle")
    joblib.dump(tfidf_vectorizer, "testPickleVector")
def train_and_predict_m7(train, test, labels):
    """Model M7: stemming + word-level TF-IDF + Passive-Aggressive classifier.

    Returns predictions for `test`. Honors the module-level `gridSearch`
    flag: when truthy, hyper-parameters are tuned via perform_grid_search.
    """
    ## Apply basic concatenation + stemming.
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type='snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal.
    tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 5), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier.
    # FIX: `n_iter` was removed from scikit-learn (0.21+); `max_iter` is the
    # replacement and is what the rest of this module already uses.
    print("Fitting Passive-Aggressive Classifer...")
    clf = PassiveAggressiveClassifier(random_state=randomState, loss='squared_hinge',
                                      max_iter=100, C=0.01)

    ## Parameter grid for the optional search (keys must match estimator params).
    param_grid = {'C': [0.003, 0.01, 0.03, 0.1],
                  'loss': ['hinge', 'squared_hinge'],
                  'max_iter': [5, 10, 30, 100, 300]}

    ## Predict with best parameters optimized for quadratic_weighted_kappa.
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def test_classifier_refit():
    # Refitting the same estimator on new data must reset classes_.
    estimator = PassiveAggressiveClassifier(max_iter=5)
    estimator.fit(X, y)
    assert_array_equal(estimator.classes_, np.unique(y))

    estimator.fit(X[:, :-1], iris.target_names[y])
    assert_array_equal(estimator.classes_, iris.target_names)
Exemplo n.º 5
0
def train(tfidf_train, y_train, tfidf_test):
    """Fit a Passive-Aggressive classifier and return predictions for the test matrix."""
    classifier = PassiveAggressiveClassifier(max_iter=50)
    classifier.fit(tfidf_train, y_train)
    return classifier.predict(tfidf_test)
Exemplo n.º 6
0
def training():
    """End-to-end training run: split, vectorize, fit, report metrics, pickle."""
    X, y = get_data()

    # 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=7)

    tfidf_Xtrain, tfidf_Xtest = Vectorize(X_train, X_test)

    Pac = PassiveAggressiveClassifier(C=0.5, random_state=5)
    Pac.fit(tfidf_Xtrain, y_train)

    # Accuracy via score() ...
    print(Pac.score(tfidf_Xtest, y_test))

    # ... and again via accuracy_score on explicit predictions.
    y_pred = Pac.predict(tfidf_Xtest)
    print(accuracy_score(y_test, y_pred))

    print(confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL']))
    print(classification_report(y_test, y_pred))

    makePickleFile(Pac)
Exemplo n.º 7
0
def train_models(train_X, train_y, test_X=None, test_y=None,
                 model_prefix='./model/temp'):
    """Train NB, SVM and Passive-Aggressive models; optionally evaluate; save.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X, test_y : optional held-out set; when both are given, a
        classification report per model is printed to stderr.
    model_prefix : path prefix for the saved .nb/.svm/.pa model files.
    """
    # Train {{{.
    logging.info('Training NB model ...')
    nb_model = MultinomialNB(alpha=0.01)
    nb_model.fit(train_X, train_y)

    logging.info('Training SVM model ...')
    svm_model = LinearSVC(random_state=1)
    svm_model.fit(train_X, train_y)

    logging.info('Training PA model ...')
    pa_model = PassiveAggressiveClassifier()
    pa_model.fit(train_X, train_y)
    # }}}.

    # Test {{{.
    if test_X is not None and test_y is not None:
        logging.info('Evaluating on test set ...')
        # Strip fastText-style label prefixes before comparing.
        test_y = [l.replace('__label__', '') for l in test_y]
        for model, desp in zip([nb_model, pa_model, svm_model],
                               ['NB_Report', 'PA_Report', 'SVM_report']):
            # FIX: Python 2 `print >>sys.stderr` -> Python 3 print(file=...),
            # consistent with the Python 3 code elsewhere in this module.
            print('================== %s ==================\n' % desp,
                  file=sys.stderr)
            pred_y = model.predict(test_X)
            pred_y = [l.replace('__label__', '') for l in pred_y]
            print(classification_report(test_y, pred_y, digits=4),
                  file=sys.stderr)
    # }}}.

    # Save models {{{.
    joblib.dump(nb_model, model_prefix + '.nb', compress=True)
    joblib.dump(svm_model, model_prefix + '.svm', compress=True)
    joblib.dump(pa_model, model_prefix + '.pa', compress=True)
Exemplo n.º 8
0
def Predict():
    """Flask endpoint: classify the submitted text with a freshly trained
    PassiveAggressiveClassifier and render the result page."""
    # User input from the submitted form.
    user_input = request.form['text']
    # Read the training dataset.
    df = pd.read_csv('train.csv')
    # Map numeric quality classes to their text labels.
    conversion_dict = {0: 'HQ', 1: 'LQ_EDIT', 2: 'LQ_CLOSE'}
    # NOTE(review): this overwrites the question *body* with the label text,
    # so the model below is trained on label strings rather than question
    # text — looks unintended but is preserved as-is; confirm against the
    # dataset before changing.
    df['Body'] = df['Y'].replace(conversion_dict)

    # 75/25 train/test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(df['Body'],
                                                        df['Y'],
                                                        test_size=0.25,
                                                        random_state=7,
                                                        shuffle=True)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)

    # Vectorize; cast to unicode so non-string cells don't break TF-IDF.
    vec_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))

    # Train the model.
    # FIX: dropped an unused MultinomialNB that was trained here and never
    # used for prediction, and an unused transform of the test split.
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(vec_train, y_train)

    # Predict the class of the user's text.
    user_input_tranform = tfidf_vectorizer.transform([user_input])
    y_predict = pac.predict(user_input_tranform)

    return render_template("Predict.html", text=user_input, predict=y_predict)
Exemplo n.º 9
0
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    """Fit a PassiveAggressiveClassifier and evaluate it.

    Returns (score, mean-absolute-error, predictions, fitted model).
    `n_est` is accepted for signature parity with sibling model_* helpers
    but is not consumed by this estimator.
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_x, train_y)
    accuracy = clf.score(test_x, test_y)
    preds = clf.predict(test_x)
    return (accuracy, mean_absolute_error(test_y, preds), preds, clf)
Exemplo n.º 10
0
def linear_models(x_train, y_train):
    """Fit five sklearn linear classifiers and print their training accuracy.

    Returns the fitted estimators as a 5-tuple in the original order:
    (LogisticRegression, PassiveAggressiveClassifier, RidgeClassifierCV,
    SGDClassifier, Perceptron).
    """
    from sklearn.linear_model import (LogisticRegression,
                                      PassiveAggressiveClassifier,
                                      RidgeClassifierCV,
                                      SGDClassifier,
                                      Perceptron)

    # (display name, estimator) in the order callers expect them returned.
    models = [
        ('LogisticRegression', LogisticRegression(C=1.2, random_state=0, max_iter=1500)),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
        ('RidgeClassifierCV', RidgeClassifierCV()),
        ('SGDClassifier', SGDClassifier()),
        ('Perceptron', Perceptron()),
    ]

    # Fit everything first, then report — same order as the original code.
    for _, clf in models:
        clf.fit(x_train, y_train)
    for name, clf in models:
        print(name + ' training accuracy: ', clf.score(x_train, y_train))

    return tuple(clf for _, clf in models)
Exemplo n.º 11
0
 def test_main(self):
     """Smoke-test the term-doc-matrix -> TF-IDF -> classifier pipeline,
     including prediction on a document with an out-of-vocabulary word.
     """
     categories, documents = get_docs_categories()
     # Drop bracketed lines (e.g. stage directions) from the text.
     clean_function = lambda text: '' if text.startswith('[') else text
     entity_types = set(['GPE'])
     term_doc_mat = (TermDocMatrixFactory(
         category_text_iter=zip(categories, documents),
         clean_function=clean_function,
         nlp=_testing_nlp,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).build())
     # NOTE(review): `n_iter` is the pre-0.21 scikit-learn spelling; modern
     # versions use `max_iter` — this snippet targets an old sklearn.
     clf = PassiveAggressiveClassifier(n_iter=5,
                                       C=0.5,
                                       n_jobs=-1,
                                       random_state=0)
     # Feature extractor that mirrors the training-time configuration.
     fdc = FeatsFromDoc(
         term_doc_mat._term_idx_store,
         clean_function=clean_function,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
     tfidf = TfidfTransformer(norm='l1')
     X = tfidf.fit_transform(term_doc_mat._X)
     clf.fit(X, term_doc_mat._y)
     # 'UNKNOWNWORD' is out-of-vocabulary; the pipeline must still predict.
     X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
     pred = clf.predict(tfidf.transform(X_to_predict))
     dec = clf.decision_function(X_to_predict)
Exemplo n.º 12
0
def PassiveAggressiveClassifier_1(train_predictors, test_predictors, train_target, test_target):
    """Fit a PassiveAggressiveClassifier and report accuracy on the test set.

    Returns (accuracy, predicted_labels).
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors, train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    # FIX: Python 2 print statement -> Python 3 print() call (the rest of
    # this file is predominantly Python 3).
    print("Accuracy for Linear Model PassiveAggressiveClassifier: " + str(accuracy))
    return accuracy, predicted
Exemplo n.º 13
0
def get_delay():
    """Flask endpoint: classify the submitted article and gather related links.

    Trains a PassiveAggressiveClassifier on the module-level TF-IDF data,
    predicts the class of the submitted article, and looks up related
    articles via Google search (best-effort).
    """
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    query = get_all_query(query_title, query_text)
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    tfidf_test_input = tfidf_vectorizer.transform(query)

    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)

    # FIX: the original still called `search` after a failed import, raising
    # NameError. Degrade gracefully to an empty link list instead.
    links = []
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")
    else:
        for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
            links.append(j)

    return render_template('result.html', links=links, pred=pred[0])
Exemplo n.º 14
0
def passiveAgressive(train_x, train_y, test_x):
    """Fit a Passive-Aggressive classifier and return predictions for test_x."""
    from sklearn.linear_model import PassiveAggressiveClassifier
    classifier = PassiveAggressiveClassifier()
    classifier.fit(train_x, train_y)
    return classifier.predict(test_x)
def test_wrong_class_weight_label():
    # Fitting must fail when class_weight references a label absent from y.
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    model = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100)
    with pytest.raises(ValueError):
        model.fit(features, targets)
Exemplo n.º 16
0
 def train_passve_aggresive_classifier(self, tfidf_train, b_train,
                                       tfidf_test, b_test):
     """Fit a PassiveAggressiveClassifier, print its test accuracy, and
     persist it via save_model."""
     model = PassiveAggressiveClassifier(max_iter=60)
     model.fit(tfidf_train, b_train)
     predictions = model.predict(tfidf_test)
     factcheckscore = accuracy_score(b_test, predictions)
     print(f"Accuracy Is {round(factcheckscore*100,2)}%")
     return self.save_model(model)
Exemplo n.º 17
0
def passive_aggressive(sample_data, test_percentage):
    """Train a Passive-Aggressive classifier and return its test accuracy (%).

    FIX: the original docstring said "Naive Bayes" but the code fits a
    PassiveAggressiveClassifier.
    """
    X_train, X_test, y_train, y_test = data_models.split_test_train_data(sample_data, test_percentage)
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(X_train, y_train)
    pred = linear_clf.predict(X_test)
    # Percentage of exact label matches.
    return (y_test == pred).sum() * 100 / len(y_test)
Exemplo n.º 18
0
def passiveAggresive(train, test, Y_train, Y_test, column):
    """Fit a Passive-Aggressive classifier on one label column and return
    its accuracy on the test split."""
    model = PassiveAggressiveClassifier(C=.1, max_iter=1000,
                                        class_weight='balanced', tol=1e-3)
    model.fit(train, Y_train[column])
    model.predict(test)  # kept: the original also computed and discarded these
    return model.score(test, Y_test[column])
def paclassifier(train_X, train_Y, test_X):
    """Train a Passive-Aggressive classifier and predict labels for test_X."""
    print("Training model.....")
    model = PassiveAggressiveClassifier(random_state=0)
    model.fit(train_X, train_Y)
    return model.predict(test_X)
Exemplo n.º 20
0
def test_classifier_accuracy():
    # Accuracy must exceed 0.79 for dense and sparse inputs, with and
    # without an intercept term.
    for matrix in (X, X_csr):
        for use_intercept in (True, False):
            model = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                                random_state=0,
                                                fit_intercept=use_intercept)
            model.fit(matrix, y)
            assert_greater(model.score(matrix, y), 0.79)
Exemplo n.º 21
0
def featureSelection():
    """10-fold CV over SelectKBest feature counts (250/500/1000) with a
    Passive-Aggressive classifier; prints train/test accuracy per fold.
    """
    # FIX: the original line ended with a stray backslash continuation
    # (`loadDataset()\`) that glued the following comment onto the statement.
    X_0, y, biomarkerNames = loadDataset()

    kf = KFold(n_splits=10)
    kf.get_n_splits(X_0)

    for i in (250, 500, 1000):
        print("Number of Features " + str(i))
        fold = 0
        for train_index, test_index in kf.split(X_0):
            print("Fold " + str(fold))
            fold = fold + 1
            # Select the top-k features by ANOVA F-score.
            selector = SelectKBest(f_classif, k=i)
            # Standardize using statistics from the training fold only,
            # to avoid leaking test information.
            scaler = StandardScaler()
            X_train, X_test = X_0[train_index], X_0[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            X_train = selector.fit_transform(X_train, y_train)
            # Indices of the selected features (useful for reporting
            # biomarkerNames[selected]; kept from the original code).
            selected = selector.get_support(indices=True)
            X_test = selector.transform(X_test)
            # Train and report the classifier.
            clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
            clf.fit(X_train, y_train)
            accuracy_train = clf.score(X_train, y_train)
            accuracy_test = clf.score(X_test, y_test)
            print("Accuracy Train " + str(accuracy_train))
            print("Accuracy Test " + str(accuracy_test))

    return
def test_classifier_accuracy():
    # Both dense and CSR inputs, intercept on/off, must score above 0.79
    # on the training data itself.
    for data in (X, X_csr):
        for with_bias in (True, False):
            estimator = PassiveAggressiveClassifier(
                C=1.0, n_iter=30, fit_intercept=with_bias, random_state=0)
            estimator.fit(data, y)
            assert_greater(estimator.score(data, y), 0.79)
Exemplo n.º 23
0
def paClassify(X, Y, Xt, Yt, class_weight):
    """Fit a Passive-Aggressive classifier and print its accuracy on (Xt, Yt)."""
    title = "Passive Aggressive Classifier"
    model = PassiveAggressiveClassifier(n_iter=10, class_weight=class_weight)
    model.fit(X, Y)
    printAccuracy(model.predict(Xt), Yt, title)
Exemplo n.º 24
0
def _passiveaggressiveclassifier(*,
                                 train,
                                 test,
                                 x_predict=None,
                                 metrics,
                                 C=1.0,
                                 fit_intercept=True,
                                 max_iter=1000,
                                 tol=0.001,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 n_iter_no_change=5,
                                 shuffle=True,
                                 verbose=0,
                                 loss='hinge',
                                 n_jobs=None,
                                 random_state=None,
                                 warm_start=False,
                                 class_weight=None,
                                 average=False):
    """Fit sklearn's PassiveAggressiveClassifier and score it.

    Parameters
    ----------
    train, test : (X, y) pairs for fitting and evaluation.
    x_predict : optional features to predict after evaluation.
    metrics : one of 'f1_score', 'jaccard_score', 'accuracy_score'.
    Remaining keywords are forwarded verbatim to the estimator; see
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html

    Returns (model_name, accuracy, predictions-or-None).

    Raises ValueError for an unknown `metrics` value (the original fell
    through and raised a confusing NameError on `accuracy`).
    """
    model = PassiveAggressiveClassifier(
        C=C,
        fit_intercept=fit_intercept,
        max_iter=max_iter,
        tol=tol,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        shuffle=shuffle,
        verbose=verbose,
        loss=loss,
        n_jobs=n_jobs,
        random_state=random_state,
        warm_start=warm_start,
        class_weight=class_weight,
        average=average)
    model.fit(train[0], train[1])
    model_name = 'PassiveAggressiveClassifier'
    y_hat = model.predict(test[0])

    # FIX: fail fast on an unknown metric instead of a NameError below.
    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    elif metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'f1_score', 'jaccard_score' or "
                         "'accuracy_score', got %r" % (metrics,))

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Exemplo n.º 25
0
class DeployedClassifierFactory:
    def __init__(self,
                 term_doc_matrix,
                 term_doc_matrix_factory,
                 category,
                 nlp=None):
        '''Enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
            Must not carry its own nlp or category_text_iter (asserted below).
        category : str
            Category name
        nlp : spacy.en.English
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        # The factory must be "bare": the parser and text iterator are
        # supplied at deployment time, not baked into the factory.
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None    # set by passive_aggressive_train()
        self._proba = None  # distance -> pseudo-probability function

    def passive_aggressive_train(self):
        '''Trains a passive-aggressive classifier and builds an ECDF-based
        pseudo-probability function over its decision values.

        Returns self, for chaining.
        '''
        self._clf = PassiveAggressiveClassifier(n_iter=50,
                                                C=0.2,
                                                n_jobs=-1,
                                                random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])
        # NOTE(review): neg_ecdf is computed but never used; the negative
        # branch below evaluates pos_ecdf at negative distances (which is 0
        # for an ECDF over non-negative data). Possibly intended to be
        # neg_ecdf — confirm before changing.

        def proba_function(distance_from_hyperplane):
            # Map a signed hyperplane distance into [0, 1], anchored at 0.5.
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                return pos_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        self._proba = proba_function
        return self

    def build(self):
        '''Builds Depoyed Classifier; requires passive_aggressive_train()
        to have been called first.
        '''
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
Exemplo n.º 26
0
def train_online_model(xtr, ytr, model=None):
    """Fit a new PassiveAggressiveClassifier, or update `model` incrementally.

    Parameters
    ----------
    xtr, ytr : training features and labels.
    model : an existing estimator exposing partial_fit, or None to train fresh.

    Returns the fitted/updated model.
    """
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        # Incremental update keeps previously learned weights.
        model.partial_fit(xtr, ytr)
    # FIX: Python 2 print statement -> Python 3 print() call.
    print("Training took %.2f seconds" % (time.time() - t0))
    return model
Exemplo n.º 27
0
def train_online_model(xtr, ytr, model=None):
    """Train a fresh PassiveAggressiveClassifier, or partial_fit an existing one.

    Returns the model, logging how long training took.
    """
    t0 = time.time()
    if model is None:
        # No prior model: train from scratch.
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        # Existing model: online update without discarding learned state.
        model.partial_fit(xtr, ytr)
    # FIX: Python 2 print statement -> Python 3 print() call.
    print("Training took %.2f seconds" % (time.time() - t0))
    return model
Exemplo n.º 28
0
def classify():
    """Train on the module-level TF-IDF data, print accuracy, and return
    the test-set predictions."""
    # Initialize a PassiveAggressiveClassifier.
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # Predict on the test set and calculate accuracy.
    y_pred_data = pac.predict(tfidf_test)
    # FIX: the original scored `y_pred`, an undefined name (NameError at
    # runtime); score the predictions actually computed above.
    score = accuracy_score(y_test, y_pred_data)
    print(f'Accuracy: {round(score * 100, 2)}%')
    return y_pred_data
class BinaryClassifier(object):
    """Binary classifier wrapper with selectable backend (LR, PA or SVM)
    and optional global feature scaling by the training standard deviation.
    """

    def __init__(self, classifier_type, scale_features=True):
        # classifier_type selects the backend in clear(); see CLASSIFIER_TYPE.
        self.scale_features = scale_features
        self.classifier_type = classifier_type
        self.clear()
        self.train_std = 0
        # Fixed seed so the instance shuffle in train() is reproducible.
        self.random_gen = np.random.RandomState(136543785)

    def clear(self, remember_train_std_if_supported=False):
        """Reset accumulated instances and rebuild the underlying estimator.

        When remember_train_std_if_supported is True, the previously computed
        training std-dev is kept so predict() can reuse the old scale.
        """
        self.positive_instances = []
        self.negative_instances = []
        # self.classifier = svm.SVC(kernel='linear')
        if self.classifier_type == CLASSIFIER_TYPE.LR:
            #print (CLASSIFIER_TYPE.LR)
            self.classifier = LogisticRegression(C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.PA:
            #print (CLASSIFIER_TYPE.PA)
            self.classifier = PassiveAggressiveClassifier(loss='hinge', C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.SVM:
            self.classifier = svm.SVC(kernel='linear')
        if not remember_train_std_if_supported:
            # Forget the training scale unless the caller asked to keep it.
            self.train_std = 0

    def add_positive_instances(self, positive_instances):
        # Instances accumulate until train() is called.
        self.positive_instances.extend(positive_instances)

    def add_negative_instances(self, negative_instances):
        self.negative_instances.extend(negative_instances)

    def train(self):
        """Fit the underlying classifier on all accumulated instances
        (positives labelled 1, negatives labelled 0)."""
        X = self.positive_instances + self.negative_instances
        y = np.asarray([1] * len(self.positive_instances) +
                       [0] * len(self.negative_instances))

        # shuffling the train instances in case classifier is sensitive to this order
        Xy = list(zip(X, y))
        self.random_gen.shuffle(Xy)
        X[:], y[:] = zip(*Xy)

        X = mat_concat(X)

        if self.scale_features:
            if self.train_std == 0:
                # Global std-dev via sqrt(E[x^2] - E[x]^2); computed once and
                # reused until clear() resets it.
                self.train_std = (pointwise_mult(X, X).mean() -
                                  X.mean()**2)**0.5
            X = X / self.train_std
        # X = X/self.train_std
        self.classifier.fit(X, y)

    def predict(self, instances):
        """Predict 0/1 labels, applying the remembered training scale if set."""
        # scaled_instances = [inst/self.train_std for inst in instances]
        instances = mat_concat(instances)
        if self.scale_features and self.train_std > 0:
            instances = instances / self.train_std
        return self.classifier.predict(instances)
Exemplo n.º 30
0
def test_passive_aggressive_2():
    """Ensure that the TPOT PassiveAggressiveClassifier outputs the same as the sklearn classifier when C == 0.0"""
    tpot_obj = TPOT()
    tpot_result = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    testing_rows = tpot_result[tpot_result['group'] == 'testing']

    sk_clf = PassiveAggressiveClassifier(C=0.0001, loss='hinge',
                                         fit_intercept=True, random_state=42)
    sk_clf.fit(training_features, training_classes)

    assert np.array_equal(testing_rows['guess'].values,
                          sk_clf.predict(testing_features))
Exemplo n.º 31
0
def test_passive_aggressive_2():
    """Ensure that the TPOT PassiveAggressiveClassifier outputs the same as the sklearn classifier when C == 0.0"""
    tpot_obj = TPOT()
    frame = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    guesses = frame[frame['group'] == 'testing']['guess'].values

    reference = PassiveAggressiveClassifier(
        C=0.0001, loss='hinge', fit_intercept=True, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(guesses, expected)
Exemplo n.º 32
0
def train_model(article_path="/home/david/2019-ca400-taland2/src/dataset/test.txt",
                train_path="/home/david/2019-ca400-taland2/src/dataset/train.csv"):
    """Train a Passive-Aggressive fake-news model and print its test accuracy.

    Parameters (new, defaulting to the previously hard-coded paths, so
    existing zero-argument callers behave exactly as before):
    article_path : text file whose article is also classified.
    train_path   : CSV with 'text' and 'label' columns.
    """
    # Article to classify (dummy data from a text file).
    article = extract(article_path)
    dftrain = pd.read_csv(train_path)
    # Drop rows that have null values.
    dftrain = dftrain.dropna()
    df_x = dftrain['text']
    df_y = dftrain['label']

    # Split training data.
    x_train, x_test, y_train, y_test = train_test_split(df_x,
                                                        df_y,
                                                        test_size=0.33,
                                                        random_state=53)

    # TF-IDF features with a capped vocabulary.
    tfv = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
    x_traintf = tfv.fit_transform(x_train)
    article_testtf = tfv.transform(article)
    tfv_test = tfv.transform(x_test)

    pac = PassiveAggressiveClassifier(n_iter_no_change=5,
                                      max_iter=10,
                                      early_stopping=True)
    pac.fit(x_traintf, y_train)

    # FIX: accuracy must come from predictions on the held-out split; the
    # original scored the *article* predictions against y_test, whose
    # lengths do not even match.
    test_pred = pac.predict(tfv_test)
    accuracy = metrics.accuracy_score(y_test, test_pred)

    # Classify the article as well (result unused beyond this call, as before).
    pred = pac.predict(article_testtf)

    # FIX: the printed label said "MultinomialNB" but the fitted model is a
    # PassiveAggressiveClassifier.
    print("PassiveAggressiveClassifier accuracy:   %0.3f" % accuracy)
def get_baseline_pa(dataset, train_label_list, test_label_list, verbose=True):
    """Score a Passive-Aggressive baseline on the given dataset split and
    return its test accuracy."""
    (X_train, Y_train), (X_test, Y_test) = dataset
    model = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    model.fit(X_train, train_label_list)
    accuracy = model.score(X_test, test_label_list)

    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' %
              accuracy)

    return accuracy
def mainworker(limit1,limit2):
	"""Cross-validate a Passive-Aggressive classifier on pdata.txt (Python 2).

	Each CSV row of pdata.txt is parsed as ints; the last value is the
	class label, and feature columns limit1..limit2 (inclusive) are dropped
	from every row. Mean accuracy over k-fold CV is printed for k in 2..5.
	NOTE(review): uses Python 2 (`xrange`, print statements) and the
	long-removed sklearn `cross_validation` API; the fold count 11054 is
	hard-coded and presumably matches the dataset size — confirm.
	"""
	N=10
	l=[]
	w1=[] # +1 class
	w2=[]#-1 class
	temp=[]
	classlist=[]
	f=open("pdata.txt")
	for line in f:
        	x=(line.strip("\n")).split(",")
        	temp=[]
        	for i in xrange(len(x)):
			x[i]=int(x[i])
			temp.append(x[i])
        	clas=temp.pop()
		temp=temp[:limit1]+temp[limit2+1:]
        	l.append(temp)
       		classlist.append(clas)
       		"""if(temp[-1]==-1):
                	w2.append(temp)
       		else:
                	w1.append(temp)"""
	f.close()
	X=np.array(l)
	y=np.array(classlist)

	X=np.array(l)
	y=np.array(classlist)
	karray=[2,3,4,5]
	for k in karray:
		kf = cross_validation.KFold(11054, n_folds=k)
		averager=[]
		for train_index,test_index in kf:
		#print("TRAIN:", train_index, "TEST:", test_index)
	   		X_train, X_test = X[train_index], X[test_index]
	   		y_train, y_test = y[train_index], y[test_index]
		#print X_train, len(X_test), len(y_train), len(y_test)
			train_data=[]
	        	test_data=[]
        		train_label=[]
       			test_label=[]
			X1 = X_train#train_data
			Y1 = y_train#train_label	
			clf = PassiveAggressiveClassifier()
			#clf = svm.SVC(kernel='linear')
			clf.fit(X1,Y1)
			Z = X_test#test_data
			predicted = clf.predict(Z)
			accuracy = getAccuracy(predicted, y_test)#test_label)
			averager.append(accuracy)
		answer=np.mean(averager)
		print "The mean for",k,"fold is:"
		print answer
def test_wrong_class_weight_format():
    # class_weight must be a dict (or 'balanced'); a list or an arbitrary
    # string should raise ValueError at fit time.
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0],
                         [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    for bad_weight in ([0.5], "the larch"):
        model = PassiveAggressiveClassifier(class_weight=bad_weight,
                                            max_iter=100)
        with pytest.raises(ValueError):
            model.fit(features, targets)
Exemplo n.º 36
0
def PassiveAggressive_clf(training_set_np, validation_set_np, testing_set_np,
                          training_label, validation_label, testing_label):
    """Fit a Passive-Aggressive classifier and print its accuracy (as a
    percentage) on the training, validation and testing splits."""
    model = PassiveAggressiveClassifier(max_iter=50)
    model.fit(training_set_np, training_label)

    print("Passive Aggressive Classifier")
    splits = (("Training Set Accuracy  : ", training_set_np, training_label),
              ("Validation Set Accuracy: ", validation_set_np, validation_label),
              ("Testing Set Accuracy   : ", testing_set_np, testing_label))
    for prefix, data, target in splits:
        print(prefix + str(100 * model.score(data, target)))
    print("\n")
Exemplo n.º 37
0
def test_classifier_correctness(loss):
    # The sklearn implementation must match the reference implementation
    # after two epochs without shuffling, for dense and sparse input.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, y_bin)

    for data in (X, X_csr):
        sk_clf = PassiveAggressiveClassifier(loss=loss, max_iter=2,
                                             shuffle=False, tol=None)
        sk_clf.fit(data, y_bin)
        assert_array_almost_equal(reference.w, sk_clf.coef_.ravel(), decimal=2)
Exemplo n.º 38
0
def TrainSVM(data, labels, usealgo=1):
    """Fit one of three classifiers to (data, labels) and return it.

    Parameters
    ----------
    data, labels : training features and targets.
    usealgo : int, selects the estimator (new parameter; was a hard-coded
        local set to 1, so the default preserves the old behavior):
        0 = PassiveAggressiveClassifier, 1 = linear-kernel SVC with
        probability estimates, 2 = LinearSVC.
    """
    if usealgo == 0:
        from sklearn.linear_model import PassiveAggressiveClassifier
        # FIX: `n_iter` was removed from scikit-learn; `max_iter` is the
        # modern equivalent (this branch was unreachable before).
        clf = PassiveAggressiveClassifier(class_weight='balanced', n_jobs=-1,
                                          max_iter=15, fit_intercept=True)
    elif usealgo == 1:
        clf = SVC(probability=True, decision_function_shape='ovr',
                  random_state=np.random.randint(1000), kernel="linear")
    elif usealgo == 2:
        from sklearn.svm import LinearSVC
        clf = LinearSVC()

    clf.fit(data, labels)
    return clf
Exemplo n.º 39
0
class DeployedClassifierFactory:
	def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
		'''This is a class that enables one to train and save a classification model.

		Parameters
		----------
		term_doc_matrix : TermDocMatrix
		term_doc_matrix_factory : TermDocMatrixFactory
		category : str
			Category name
		nlp : spacy parser
		'''
		self._term_doc_matrix = term_doc_matrix
		self._term_doc_matrix_factory = term_doc_matrix_factory
		# The factory must not carry baked-in nlp/text state; both are
		# supplied at deploy time.
		assert term_doc_matrix_factory._nlp is None
		assert term_doc_matrix_factory.category_text_iter is None
		self._category = category
		self._clf = None    # fitted classifier, set by passive_aggressive_train()
		self._proba = None  # margin -> pseudo-probability mapper, set alongside _clf

	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier.

		Fits a PassiveAggressiveClassifier on the term-document matrix and
		builds an empirical-CDF mapping from decision-function margins to
		pseudo-probabilities in [0, 1].

		Returns
		-------
		self
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])

		def proba_function(distance_from_hyperplane):
			# Positive margins map into (0.5, 1], negative margins into
			# [0, 0.5), and a point exactly on the hyperplane maps to 0.5.
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# BUG FIX: the original called pos_ecdf here.  pos_ecdf is
				# fitted only on non-negative margins, so it evaluates to 0
				# everywhere below its support, collapsing every negative
				# margin to probability 0 -- and neg_ecdf was never used.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		self._proba = proba_function
		return self

	def build(self):
		'''Builds Deployed Classifier.

		Raises
		------
		NeedToTrainExceptionBeforeDeployingException
			If called before passive_aggressive_train().
		'''
		if self._clf is None:
			raise NeedToTrainExceptionBeforeDeployingException()
		return DeployedClassifier(self._category,
		                          self._term_doc_matrix._category_idx_store,
		                          self._term_doc_matrix._term_idx_store,
		                          self._term_doc_matrix_factory)
def test_classifier_correctness(loss):
    """sklearn and the pure-Python reference converge to the same weights."""
    # Binarize the target: class 1 stays +1, everything else becomes -1.
    binary_target = y.copy()
    binary_target[y != 1] = -1

    reference = MyPassiveAggressive(C=1.0, loss=loss, fit_intercept=True,
                                    n_iter=2)
    reference.fit(X, binary_target)

    # Check both the dense and the CSR-sparse representation of X.
    for features in (X, X_csr):
        candidate = PassiveAggressiveClassifier(C=1.0, loss=loss,
                                                fit_intercept=True,
                                                max_iter=2, shuffle=False,
                                                tol=None)
        candidate.fit(features, binary_target)
        assert_array_almost_equal(reference.w, candidate.coef_.ravel(),
                                  decimal=2)
def test_classifier_accuracy():
    """Classifier reaches reasonable accuracy for every option combination."""
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                model = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                model.fit(data, y)
                assert_greater(model.score(data, y), 0.79)
                if not average:
                    continue
                # Averaging must expose both averaged and standard params.
                for attr in ('average_coef_', 'average_intercept_',
                             'standard_intercept_', 'standard_coef_'):
                    assert hasattr(model, attr)
def test_classifier_correctness():
    """Old-API classifier (n_iter) agrees exactly with the reference."""
    # Reduce to a binary +1/-1 problem.
    binary_target = y.copy()
    binary_target[y != 1] = -1

    for loss_name in ("hinge", "squared_hinge"):
        reference = MyPassiveAggressive(C=1.0, loss=loss_name,
                                        fit_intercept=True, n_iter=2)
        reference.fit(X, binary_target)

        candidate = PassiveAggressiveClassifier(C=1.0, loss=loss_name,
                                                fit_intercept=True, n_iter=2)
        candidate.fit(X, binary_target)

        assert_array_almost_equal(reference.w, candidate.coef_.ravel())
def test_class_weights():
    """Down-weighting a class flips the prediction of a borderline point."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    target = [1, 1, 1, -1, -1]
    probe = [[0.2, -1.0]]

    unweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight=None,
                                             random_state=100)
    unweighted.fit(features, target)
    assert_array_equal(unweighted.predict(probe), np.array([1]))

    # With class 1 nearly ignored the hyperplane rotates clock-wise,
    # so the same probe point lands on the other side.
    reweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight={1: 0.001},
                                             random_state=100)
    reweighted.fit(features, target)
    assert_array_equal(reweighted.predict(probe), np.array([-1]))
Exemplo n.º 44
0
def main():
    #stemmer = SnowballStemmer('english')
    #stemmer = EnglishStemmer()

    training_data=open('trainingdata.txt', 'rU')
    n = int(training_data.readline().strip())    
    
    train_data = []
    class_data = []

    for i in range(n):
        line = training_data.readline().strip()
        train_data.append(line[1:].strip())
        class_data.append(int(line[0]))
        
    train_data = np.array(train_data)
    class_data = np.array(class_data)


    # 2) Vectorize bag of words
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True )
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)
        
  
    
    # Read test data from input
    X_test = np.array([raw_input().strip() for i in range(int(raw_input().strip()))])

    X_test = vectorizer.transform(X_test)

    clf = PassiveAggressiveClassifier(n_iter=9) 
    
    clf.fit(X_train, class_data)
    
    pred = clf.predict(X_test)
    for i in pred:
        print i
	def test_main(self):
		"""End-to-end: build a term-doc matrix, train, and score a new doc."""
		categories, documents = get_docs_categories()

		def clean(text):
			# Drop bracketed metadata lines entirely.
			return '' if text.startswith('[') else text

		censored_entities = set(['GPE'])
		term_doc_mat = TermDocMatrixFactory(
			category_text_iter=zip(categories, documents),
			clean_function=clean,
			nlp=_testing_nlp,
			feats_from_spacy_doc=FeatsFromSpacyDoc(
				entity_types_to_censor=censored_entities)
		).build()

		clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
		fdc = FeatsFromDoc(
			term_doc_mat._term_idx_store,
			clean_function=clean,
			feats_from_spacy_doc=FeatsFromSpacyDoc(
				entity_types_to_censor=censored_entities)
		).set_nlp(_testing_nlp)

		# Train on l1-normalised TF-IDF of the term-doc matrix.
		tfidf = TfidfTransformer(norm='l1')
		X = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(X, term_doc_mat._y)

		# Featurize and classify a previously unseen document.
		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		pred = clf.predict(tfidf.transform(X_to_predict))
		dec = clf.decision_function(X_to_predict)
class PassiveAgressiveClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            print "Fitting complete..."
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
def test_equal_class_weight():
    """Balanced / explicit-equal class weights match no weighting at all."""
    features = [[1, 0], [1, 0], [0, 1], [0, 1]]
    target = [0, 0, 1, 1]

    baseline = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                           class_weight=None)
    baseline.fit(features, target)

    # The data is already balanced, so neither "balanced" nor an explicit
    # 50/50 weighting should move the solution (up to learning-rate noise).
    for weighting in ("balanced", {0: 0.5, 1: 0.5}):
        reweighted = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                                 class_weight=weighting)
        reweighted.fit(features, target)
        assert_almost_equal(baseline.coef_, reweighted.coef_, decimal=2)
Exemplo n.º 48
0
# Confusion matrix for the previous (count-vector) model.
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


# Baseline: multinomial naive Bayes on raw token counts.
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


# Testing
# Passive-aggressive linear model on TF-IDF features.
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])



# Sweep the NB smoothing parameter alpha over [0, 1).
# NOTE(review): `clf` and `last_score` below are never used in this chunk --
# the loop neither compares scores nor keeps the best model; the example may
# have been truncated here.
clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
Exemplo n.º 49
0
    # Featurize the held-out tweets with the same vectorizers as training.
    testX = getFeatures(testTweets, countVecNGram, dictVec)

    # Average perceptron accuracy over 10 refits (fitting is seed/order
    # sensitive, so repeated runs smooth out variance).
    percScoresTrain = []
    percScoresDev = []
    for i in range(10):
        perceptron.fit(trainX, trainY)
        percScoresDev.append(perceptron.score(devX, devY))
        percScoresTrain.append(perceptron.score(trainX, trainY))

    print "Perceptron Train:", np.mean(percScoresTrain)
    print "Perceptron Dev:", np.mean(percScoresDev)

    # Same protocol for the passive-aggressive classifier.
    passAggScoresTrain = []
    passAggScoresDev = []
    for i in range(10):
        passAgg.fit(trainX, trainY) 
        passAggScoresDev.append( passAgg.score(devX, devY))
        passAggScoresTrain.append( passAgg.score(trainX, trainY))


    print "Passive Aggressive Train:", np.mean(passAggScoresTrain)
    print "Passive Aggressive Dev:", np.mean(passAggScoresDev)


    # NOTE(review): this "small" run still fits on the full trainX/trainY and
    # only *scores* on trainXSmall -- confirm whether fitting on the small
    # subset was intended.
    passAggScoresSmallTrain = []
    passAggScoresSmallDev = []
    for i in range(10):
        passAgg.fit(trainX, trainY) 
        passAggScoresSmallDev.append( passAgg.score(devX, devY))
        passAggScoresSmallTrain.append( passAgg.score(trainXSmall,trainYSmall))
        # NOTE(review): the two lines below reference `targets`/`docs`/`line`
        # from a different scope -- this chunk looks like two fragments
        # spliced together by the scraper.
        targets.append(int(line[0]))
        docs.append(' '.join([i for i in line[1:] if not is_stopword(i)]))



# Build a unigram+bigram count matrix over the documents, then TF-IDF
# weight it.  These module-level names are read later by classify().
count_vect = CountVectorizer(input='content',ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(docs)
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
# Earlier experiments kept for reference:
#svd = TruncatedSVD(n_components=55, random_state=7)
#X_train = svd.fit_transform(X_train_tf)
#clf = KNeighborsClassifier(n_neighbors=8).fit(X_train, targets)
#clf = BernoulliNB(alpha=.01)
#clf = LinearSVC()
# Final model: passive-aggressive classifier on the TF-IDF matrix.
clf=PassiveAggressiveClassifier(n_iter=9)
clf.fit(X_train_tf, targets)

def classify(content):
    """Predict class labels for an iterable of raw documents.

    Uses the module-level `count_vect`, `tf_transformer` and `clf` fitted at
    import time; returns the array of predicted labels.
    """
    # The original declared these names `global`, but they are only read
    # here, never rebound, so the declarations were redundant.
    counts = count_vect.transform(content)
    weighted = tf_transformer.transform(counts)
    return clf.predict(weighted)

# Read the number of test cases, then collect each raw input line (Python 2).
# NOTE(review): `x` is overwritten each iteration and `inp` never filled in
# this chunk -- the example appears to be truncated here.
tc = int(raw_input())
inp = []
for tcc in range(tc):
    x = raw_input()
Exemplo n.º 51
0
#print X_train_tfidf.shape

# Read the test documents (Python 2 I/O): a count, then one doc per line.
ntest = input()
testdoc = []
for t in range(0, ntest):
    doc = raw_input()
    testdoc.append(doc)

# Re-apply the training-time count and TF-IDF transforms to the test docs.
X_new_counts = count_vect.transform(testdoc)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
""""
#Naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

#test random forest

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)
"""
# Final model: passive-aggressive classifier; predict and print per test doc.
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

for t in range(0, ntest):
    print predicted[t]
Exemplo n.º 52
0
#https://www.hackerrank.com/challenges/document-classification/submissions/code/10577787
# Enter your code here. Read input from STDIN. Print output to STDOUT
documents=[]
target=[]
cnt=0
from sklearn.linear_model import PassiveAggressiveClassifier
# Parse "trainingdata.txt": the first line (a count) is skipped via `cnt`;
# each remaining line is "<label><text>".
# NOTE(review): int(line[0:2]) assumes labels fit in the first two bytes --
# verify against the data format.
with open("trainingdata.txt","rb") as infile:
    for line in infile:
        if cnt==0:
            cnt=1
            continue
        category=int(line[0:2])
        doc=line[2:]
        target.append(category)
        documents.append(doc)
from sklearn.feature_extraction.text import TfidfVectorizer
transformer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',stop_words='english')
X = transformer.fit_transform(documents)
# NOTE(review): MultinomialNB is imported but never used below.
from sklearn.naive_bayes import MultinomialNB
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(X, target)

# Classify each stdin document and print its predicted label (Python 2:
# `print(expr)[0]` parses as the print statement applied to `expr[0]`).
n=int(raw_input())
for i in range(0,n):
    X=transformer.transform([raw_input()])
    print(clf.predict(X))[0]

def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False):
    """Train and evaluate helpfulness predictors (Python 2 script code).

    printStages: print progress messages in the streaming branch.
    useSelector: apply chi2 SelectKBest feature selection.
    discreteHelpfulness: classification (PassiveAggressiveClassifier) if
        True, regression (PassiveAggressiveRegressor) otherwise.
    useRST: batch evaluation of several RST/baseline feature extractors;
        otherwise stream data through partial_fit.
    useFew: include instances labelled 'few' in the streaming branch.
    """
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances,num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    if discreteHelpfulness:
        # NOTE(review): `labels` is not defined in this function -- it is
        # presumably a module-level global; confirm.
        classlabels = encoder.fit_transform(labels)
    newData = False

    count = 0
    if useRST:
      print 'Getting RST data'
      nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)

      # Feature extractors: random baseline, length baseline, full/limited
      # RST features, and limited-RST unioned with tf-idf n-grams.
      random = RandomFeatureExtractor()
      lengthBaseline = LenFeatureExtractor()
      fullRST = FullPickledRSTFeatureExtractor(nums)  if newData else FullTextRSTFeatureExtractor(nums)
      limitedRST = LimitedPickledRSTFeatureExtractor(nums)  if newData else LimitedTextRSTFeatureExtractor(nums)
      vectorizer =  FeatureUnion([('extra',limitedRST),('tfid',tfidvec)])

      print 'Fitting random features baseline'
      random.fit(texts)
      print 'Fitting text length baseline'
      lengthBaseline.fit(texts)
      print 'Fitting full RST features'
      fullRST.fit(texts)
      print 'Fitting limited RST features'
      limitedRST.fit(texts)
      print 'Fitting limited RST with tfidvec features'
      vectorizer.fit(texts)
      print 'Fitting tfidvec features'
      tfidvec.fit(texts)

      # 80/20 sequential train/test split.
      split = int(0.8*len(ilabels))
      trainData = (texts[:split],ilabels[:split])
      testData = (texts[split:],ilabels[split:])

      # NOTE(review): both calls below use trainData, so the random baseline
      # is scored on its own training set -- the second call likely should
      # use testData like the other extractors.
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      # Label-distribution baseline.
      dummy = DummyClassifier()
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      dummy.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
      print 'Dummy label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector)
      print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector)
      print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector)
      print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector)
      print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector)
      learner = learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector)
      print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))


    else:
      # Streaming branch: fit vocabulary on the first numVocab instances,
      # hold out the next numTest as a test set, then partial_fit in chunks
      # of numTrain until maxTrainStages chunks have been consumed.
      vectorizer = tfidvec
      testData = None
      vocabGotten = False
      instances = ([],[])
      numVocab = 50000
      numTest = 50000
      numTrain = 100000
      maxTrainStages = 20
      for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness):
          if label!='few' or useFew:
            instances[0].append(text)
            instances[1].append(label)
            if not vocabGotten and len(instances[0]) == numVocab:
                if printStages:
                    print 'Fitting vocabulary with %d instances'%numVocab
                vectorizer.fit(instances[0],None)
                if selector is not None:
                    X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None)
                    selector.fit(X,y)
                vocabGotten = True
                instances = ([],[])
            elif vocabGotten and testData is None and len(instances[0]) == numTest:
                if printStages:
                    print 'Getting test data with %d instances'%numTest
                testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                instances = ([],[])
            elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                if discreteHelpfulness:
                    learner = learner.partial_fit(X,y, classes = classlabels)
                else:
                    learner = learner.partial_fit(X,y)
                instances = ([],[])
                count = count + 1
                if printStages:
                    print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1]))
            elif count == maxTrainStages:
                break
      print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
#y = [1,1,1,1]
# Fit the vectorizer on the training corpus `x` (defined earlier, outside
# this chunk) and report vocabulary size (Python 2 print statements).
trans = vectorizer.fit_transform(x)
#print  vectorizer.transform(["I am in a tree tree"]).toarray()
#print vectorizer.get_feature_names()
#print trans.toarray()
#print sorted(vectorizer.vocabulary_)
print len(vectorizer.vocabulary_)


# NOTE(review): K and the neighbors import feed only the commented-out KNN
# experiment below -- dead at runtime.
K = 1
from sklearn import neighbors
#clf = neighbors.KNeighborsClassifier(K,weights = 'distance', leaf_size= 30)

from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(trans, y)

# Score predictions against the expected outputs file.
# NOTE(review): the input files are never closed -- a `with` block would be
# safer, but the example may be truncated past this chunk.
#f = open("testDatatextClassification.txt",'r')
f = open("input01.txt",'r')
f2 = open("output01.txt","r")
d = f.readlines()
d = d[1:]
ans = map(int,f2.readlines())
t0= time.clock()

summing = 0;
for j,i in enumerate(d):
    sol = int(clf.predict(vectorizer.transform([i]).toarray())[0])
    #print sol, ans[j]
    if (sol==ans[j]):
        summing = summing + 1
Exemplo n.º 55
0
  def run(self, nFold=3, loss='hinge', iter=10, verbose=1):
    """Score the network with a passive-aggressive classifier and estimate
    AUROC by nFold cross-validation (Python 2 code).

    nFold: number of CV folds.  loss: PA loss function.
    iter: training iterations (NOTE: shadows the `iter` builtin; kept for
    interface compatibility).  verbose: sklearn verbosity level.
    Sets self._scores, self._auroc and the per-fold bookkeeping lists.
    """
    log.debug("PA: run")
    (numx, numy) = self._network.shape

    # Random permutation of sample indices used to carve out the folds.
    pp = permutation(numx)

    # Full-data fit: store the converted decision-function scores.
    model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
    model.fit(self._network, self._annotation.ravel())
    scores = model.decision_function(self._network)
    self._scores = self._convertScore(scores)

    fold = 0
    offset = 0
    meanroc = []
    labelIx = range(numx)
    while fold < nFold:
      log.debug("NV: ___ fold= %d ___" % fold)
      lastelem = int(min(numx, offset+floor(numx/nFold)))

      # Held-out indices for this fold.
      # NOTE(review): the slice starts at offset+1, so the element at
      # `offset` is never held out in any fold -- possible off-by-one.
      ix = []
      for index in pp[offset+1:lastelem]:
        ix.append(index)

      print lastelem

      offset = lastelem

      # Mask the held-out labels to 0 and retrain.
      labeltmp = []
      for value in self._annotation:
        labeltmp.append(float(value))

      for index in ix:
        labeltmp[index] = 0

      model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
      model.fit(self._network, labeltmp)
      scores = model.decision_function(self._network)
      scores = self._convertScore(scores)

      # Collect held-out scores/labels for this fold and accumulate the
      # across-fold lists used for the pooled AUROC below.
      score = []
      label = []
      protein = []
      for index in ix:
        score.append(float(scores[index]))
        label.append(int(self._annotation[index]))
        protein.append(int(self._proteinid[index]))

        self._foldlabels.append(int(self._annotation[index]))
        self._foldscores.append(float(scores[index]))
        self._foldproteins.append(int(self._proteinid[index]))

      auroc = self.AUROC(label, score)
      log.debug("AUROC= %.4f" % auroc)

      meanroc.append(auroc)

      fold += 1

    # Mean of per-fold AUROCs, then a pooled AUROC over all folds.
    self._auroc = reduce(lambda x, y: x + y / float(len(meanroc)), meanroc, 0)
    auroc = self.AUROC(self._foldlabels, self._foldscores)

    self._TPR_FPR(self._foldlabels, self._foldscores)
Exemplo n.º 56
0
# Project train/test features through the fitted feature-selection model.
X_train_select = sfm.transform(X_train)
X_test_select = sfm.transform(X_test)

# test with new clf
clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42)

benchmark(clf1, X_train_select, y_train, X_test_select, y_test)

# GridSearch for C
# Set the parameters by cross-validation
tuned_parameters = [{'C': np.logspace(-6, 0, 1000)}]

score = 'accuracy'

print("# Tuning hyper-parameters for %s" % score)
print()

# NOTE(review): the grid search fits on the unselected X_train, not on
# X_train_select used by the benchmark above -- confirm which was intended.
clf = GridSearchCV(clf1, tuned_parameters, cv=cv)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# NOTE(review): grid_scores_ is the pre-0.18 scikit-learn API (removed in
# 0.20); newer versions expose cv_results_ instead.
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))
print()
Exemplo n.º 57
0
import numpy as np
import pandas as pd

# NOTE(review): sklearn.cross_validation is the pre-0.18 module name; newer
# scikit-learn versions use sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
# Stratified 75/25 split of the row indices on the 'class' column.
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Perform classification with a passive aggressive classifier
pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge", fit_intercept=True, random_state=42)
pagr1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)

# Predict over ALL rows (train and test) and store the result as a column.
result1['pagr1-classification'] = pagr1.predict(result1.drop('class', axis=1).values)
ls=[]
for _ in range(n):  # change this to n in stead of 3
    x=f.readline()
    xs=x[2:]
    xn=x[0]
    ls.append(xs)
    ln.append(xn)
#stem the words
bag_of_words=vectorizer.fit(ls)
bag_of_words=vectorizer.transform(ls)
cmax=0
for cc in range(1,100):
    #sw=stopwords.words() #stopwords are not supported, requires download
    clf = PassiveAggressiveClassifier(n_iter=9,C=cc/10)
#    svm=LinearSVC(C=cc/10.0)
    clf.fit(bag_of_words,ln)
    
    #Now get input (test) data
    lt=[]
    filename=open("testdata.txt")
    line = filename.readline()
    ntests=int(line)
    for _ in range(ntests):
        lt.append(filename.readline())
    
    bag_of_test_words=vectorizer.transform(lt)
    result=clf.predict(bag_of_test_words)
    actuals=[]
    filename=open("testresults.txt")
    z=0
    for x in range(len(result)):