Example No. 1
    # assumed aliases: BNB = sklearn.naive_bayes.BernoulliNB, linfo = logging.info
    def __init__(self, **config):
        linfo('config: %s' % config)
        self._path = config['train_path']
        self._feature_extract_config = config['feature']
        self._emoticon = config['emoticon']
        self.test_path = '../../test_data/bi_test_data'
        self.clf = BNB(fit_prior=True)
    def train(self):
        logging.info('-' * 20)
        logging.info('Start training the %s model', self.model)
        train_data = self.feature_extractor.extract_feature(
            self.data_loader.get_trainset())
        if self.model == 'GNB':
            # Gaussian naive bayes
            self.classifier = GNB()
        elif self.model == 'BNB':
            # Bernoulli naive bayes
            self.classifier = BNB()
            # self.tok = RT(r'\w+')
            # vectorizer = Vectorizer(tokenizer=self.tok.tokenize)
            # train_data = self.data_loader.get_trainset()
            # train_data = [vectorizer.fit_transform(train_data[0]).toarray(), train_data[1]]
            # self.vocabulary = vectorizer.get_feature_names()
        elif self.model == 'MNB':
            # Multinomial naive bayes
            self.classifier = MNB()
        elif self.model == 'LR':
            # Logistic regression
            param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
            self.classifier = GS(cv=5,
                                 estimator=LR(penalty=self.penalty,
                                              max_iter=self.epoch,
                                              solver='liblinear'),
                                 param_grid=param)
        elif self.model == 'SVM':
            # Support vector machine (assumed to be LinearSVC, given the penalty/dual args)
            self.penalty = self.penalty if self.penalty in ['l1', 'l2'] else 'l2'
            # LinearSVC requires dual=False with the l1 penalty
            dual = self.penalty == 'l2'
            # self.classifier = SVM(penalty=self.penalty, C=self.c, max_iter=self.epoch, dual=dual)
            param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
            self.classifier = GS(cv=5,
                                 estimator=SVM(penalty=self.penalty,
                                               dual=dual,
                                               max_iter=self.epoch),
                                 param_grid=param)

        elif self.model == 'R':
            # RandomGuess
            self.classifier = DC(strategy='stratified')
        else:
            logging.error('Unsupported model: %s', self.model)
            exit(1)  # non-zero exit code: an unsupported model is an error

        self.classifier.fit(train_data[0], train_data[1])
        predictions = self.classifier.predict(train_data[0])
        acc = evaluator.accuracy_score(train_data[1], predictions)  # evaluator is assumed to alias sklearn.metrics
        return acc
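
A minimal usage sketch for the trainer above; the config keys come from __init__, but the class name Trainer, the feature settings, and the self.model attribute assignment are hypothetical:

config = {
    'train_path': 'path/to/train_data',  # hypothetical path
    'feature': {'ngram': 1},             # hypothetical feature settings
    'emoticon': False,
}
trainer = Trainer(**config)              # Trainer is an assumed class name
trainer.model = 'BNB'                    # pick the Bernoulli naive Bayes branch
train_acc = trainer.train()              # returns training-set accuracy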
Example No. 3
  def __init__(self):
    """Create a persistent model file if there isn't one. If one exists, use it."""

    self.file_path = 'models/decision_tree.joblib'
    self.include_hunger = False
    self.accuracy_metric_float = 0.0
    self.accuracy = metrics.Accuracy()

    if path.exists(self.file_path):
      self.model = load(self.file_path)
    else:
      # self.model = compat.convert_sklearn_to_river(
      #     estimator=MLPClassifier(random_state=1, max_iter=300),
      #     classes=[0, 1]
      # )

      self.model = compat.convert_sklearn_to_river(
          estimator=BNB(binarize=.1),
          classes=[0, 1]
      )

      self.save_model()
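
A sketch of how this wrapper is typically used online, plus a plausible save_model body (the method is called in __init__ but not shown). river's learn_one/predict_one and metrics.Accuracy.update are real APIs; the progressive-validation method and the joblib dump import are assumptions:

  def predict_and_learn(self, x, y):
    """Progressive validation: score one sample, then learn from it."""
    try:
      y_pred = self.model.predict_one(x)
      self.accuracy.update(y_true=y, y_pred=y_pred)
      self.accuracy_metric_float = self.accuracy.get()
    except Exception:
      pass  # the wrapped sklearn model raises until it has seen at least one sample
    self.model.learn_one(x, y)

  def save_model(self):
    """Persist the model to the joblib path chosen in __init__."""
    dump(self.model, self.file_path)  # joblib.dump, assumed imported alongside load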
Example No. 4
x_train, y_train = quest.readFileAndCut(quest.train_filename)
x_test, y_test = quest.readFileAndCut(quest.test_filename)

train_tfidf, test_tfidf = quest.train_tf(x_train, x_test)

def execClassify(name, model):
    classifier = Classifier(name, model)
    classifier.fit(train_tfidf, y_train)
    classifier.test(y_test, test_tfidf)

if args.m == 'mnb':
    from sklearn.naive_bayes import MultinomialNB as MNB
    execClassify(name='Multinomial naive Bayes classifier', model=MNB())
elif args.m == 'bnb':
    from sklearn.naive_bayes import BernoulliNB as BNB
    execClassify(name='Bernoulli naive Bayes classifier', model=BNB())
elif args.m == 'linearSVC':
    from sklearn.svm import LinearSVC
    execClassify(name='Linear SVM classifier', model=LinearSVC())
elif args.m == 'dt':
    from sklearn.tree import DecisionTreeClassifier
    execClassify(name='Decision tree classifier', model=DecisionTreeClassifier())
elif args.m == 'knn':
    from sklearn.neighbors import KNeighborsClassifier
    execClassify(name='KNN classifier', model=KNeighborsClassifier())
elif args.m == 'xgb':
    import xgboost as xgb
    dtrain = xgb.DMatrix(train_tfidf, label=y_train)
    dtest = xgb.DMatrix(test_tfidf, label=y_test)  # the label is optional here; it is kept so results can be evaluated
    param = {'max_depth': 6, 'eta': 0.5, 'eval_metric': 'merror', 'silent': 1, 'objective': 'multi:softmax',
             'num_class': 10}  # training parameters ('silent' was replaced by 'verbosity' in newer xgboost)
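
The snippet stops before this branch actually trains; a hedged completion using the standard xgb.train/predict API (the boosting-round count is an assumption):

    import numpy as np                 # assumed available
    num_round = 100                    # assumed number of boosting rounds
    bst = xgb.train(param, dtrain, num_round)
    preds = bst.predict(dtest)         # multi:softmax returns class ids
    acc = np.mean(np.asarray(preds) == np.asarray(y_test))
    print('xgboost accuracy: %.4f' % acc)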
Example No. 5
def naive_bayes(corpus_CV, classes):
    bnb = BNB()
    classes = np.ravel(classes)
    bnb.fit(corpus_CV, classes)
    return bnb
'''######'''
'''###############################################################################'''
'''#################################Step 7########################################'''
'''Training using Bernoulli naive Bayes model to learn a classifier on vital_seg'''
'''###############################################################################'''
'''
vital_seg = 2*number of vital_seg*1500
train = 2*number of vital_seg*1500
'''
train = []
labels = []
for cluster_n in range(number_books):
    for seg in vital_seg[cluster_n]:
        train.append(seg.tolist())
        labels.append(cluster_n)
model3 = BNB(fit_prior=True)
model3 = model3.fit(train, labels)
print "STEP 7 done"
'''######'''
'''Step 7'''
'''######'''
'''################################################################'''
'''##########################Step 8################################'''
'''classifying sentences with the trained classifier and calculating scores'''
'''################################################################'''
'''
auth_proba = [[.22, .05, .12, ... number of authors],
              [.22, .05, .12, ... number of authors],
              ...
              test_size rows
             ]    probability that each sentence was written by each author
'''
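
A hedged sketch of Step 8, assuming test_seg holds the test sentences' feature vectors (the name is hypothetical); predict_proba is the standard scikit-learn call that yields the auth_proba matrix described above:

auth_proba = model3.predict_proba(test_seg)  # shape: (test_size, number_books)
predicted = auth_proba.argmax(axis=1)        # most probable author per sentence
print("STEP 8 done")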
Example No. 7
folder = "C:/Users/hanfs/Desktop/data-mining-project/training_data_file.TF.txt"

# we could hard-code each path, since there are only three input files
feature_vectors, targets = svmlight(folder)  # svmlight is assumed to alias sklearn.datasets.load_svmlight_file

### Let's generate the classifiers ###
print("TF Data")
clf = MNB()
scores = cross_val_score(clf,
                         feature_vectors,
                         targets,
                         cv=5,
                         scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

clf = BNB()
scores = cross_val_score(clf,
                         feature_vectors,
                         targets,
                         cv=5,
                         scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

clf = KNeighborsClassifier()
scores = cross_val_score(clf,
                         feature_vectors,
                         targets,
                         cv=5,
                         scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
import utils
import pickle

from os.path import isfile

from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

filename = '/usr/src/app/sentiment/models/pickles/BernoulliNB.pickle'

if not isfile(filename):

    train, test = train_test_split(utils.read_data(), test_size=0.2)

    train_embeddings = utils.combined_embeddings(train['text'].tolist())
    test_embeddings = utils.combined_embeddings(test['text'].tolist())

    clf = BNB(alpha=50)
    clf.fit(train_embeddings, train['sentiment'])

    prediction = clf.predict(test_embeddings)
    report = classification_report(test['sentiment'], prediction)
    print(report)

    with open(filename, 'wb') as f:
        pickle.dump(clf, f)

else:
    print('Already Trained!')
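
A matching load-and-predict sketch for the pickle written above (the sample text is a hypothetical input):

with open(filename, 'rb') as f:
    clf = pickle.load(f)
sample = utils.combined_embeddings(['great product, would buy again'])
print(clf.predict(sample))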
folderpath = r"D:\my_data"
folder = "C:/Users/khada/Desktop/data-mining-project/training_data_file.TF.txt"

feature_vectors, targets = load_svmlight_file(folder)
X = feature_vectors
X = X.astype(int)
y = targets
y = y.astype(int)

#print(y)

##X_new1 = SelectKBest(chi2, k=100).fit(X, y)#returns 100 of the best of feature vectors according to the chi squared test
##X_new2 = SelectKBest(mutual_info_classif, k=100).fit_transform(X, y)
##
clfm = MNB()
clfb = BNB()
clfk = KNeighborsClassifier()
clfs = SVC()
##scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro')
vals = [10, 25, 50, 100]

# plot: x-axis = K (number of selected features); y-axis = f1_macro
## K selects the reduced feature sets (X_new1/X_new2); f1_macro comes from cross_val_score
for K in vals:
    # select the K best features according to the chi-squared test
    X_new1 = SelectKBest(chi2, k=K).fit_transform(X, y)
    # select the K best features according to mutual information
    X_new2 = SelectKBest(mutual_info_classif, k=K).fit_transform(X, y)

    # cv=K in the original was a bug: K counts selected features, not folds
    scoresm1 = cross_val_score(clfm, X_new1, targets, cv=5, scoring='f1_macro')
    scoresb1 = cross_val_score(clfb, X_new1, targets, cv=5, scoring='f1_macro')
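
The plot the comments call for is not shown; a hedged sketch of collecting and plotting the scores (matplotlib only, the list name is an assumption):

import matplotlib.pyplot as plt

mnb_means = [cross_val_score(clfm, SelectKBest(chi2, k=K).fit_transform(X, y),
                             targets, cv=5, scoring='f1_macro').mean() for K in vals]
plt.plot(vals, mnb_means, marker='o', label='MNB + chi2')
plt.xlabel('K (selected features)')
plt.ylabel('f1_macro')
plt.legend()
plt.show()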
Example No. 10
# ETsC : sklearn.ensemble.ExtraTreesClassifier
# (sklearn.ensemble.forest was a private module, removed in scikit-learn 0.24)
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETsC
_Models = {
    "LR": LR(),
    "LRCV": LRCV(),
    "LDA": LDA(),
    "QDA": QDA(),
    "KNC": KNC(),
    # "RNC" : RNC(),
    "DTC": DTC(),
    "ETC": ETC(),
    "GNB": GNB(),
    # "BDNB" : BDNB(),
    "MNB": MNB(),
    "BNB": BNB(),
    "LSVC": LSVC(),
    "SVC": SVC(),
    "NSVC": NSVC(),
    # "OCSVM" : OCSVM()
}
# compare the evaluation results
print("Comparing model results and visualizing...:")
_Algorithm_CMP_Results = []
_Algorithm_CMP_Result_List = []
_Result_File.write("Model name" + " " * 6 + "MEAN (accuracy)" + " " * 12 + "STD (standard deviation)\n")
print("Model name" + " " * 6 + "MEAN (accuracy)" + " " * 12 + "STD (standard deviation)")
for _Each in _Models:
    cv_results = model_selection.cross_val_score(_Models[_Each],
                                                 X=_X_Train,
                                                 y=_Y_Train,
                                                 cv=10,  # assumed settings; the snippet was cut off mid-call
                                                 scoring='accuracy')
    _Algorithm_CMP_Results.append(cv_results)
    _Algorithm_CMP_Result_List.append(_Each)
# ### Not good

# In[ ]:

from sklearn.naive_bayes import MultinomialNB as MNB
mnb = MNB()
mnb.fit(df_train, out_train)
mnb.score(df_test, out_test)

# ### Not good yet

# In[ ]:

from sklearn.naive_bayes import BernoulliNB as BNB
bnb = BNB()
bnb.fit(df_train, out_train)
bnb.score(df_test, out_test)

# ### Better, but still not close to 90%

# #### So the naive Bayes classifier also does not do a good job on this task.
# #### Now I will try and experiment with Tree based classifiers.

# ## Decision Trees

# In[ ]:

# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import cross_val_score as cvs
from sklearn.tree import DecisionTreeClassifier as dtree
tr = dtree()
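
# #### A hedged continuation (not in the original notebook): score the tree with the cross_val_score imported above; cv=5 is an assumption.

# In[ ]:

scores = cvs(tr, df_train, out_train, cv=5)
print("mean=%.4f std=%.4f" % (scores.mean(), scores.std()))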
# assumed imports for this snippet
import os
import copy
from pandas import read_csv
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.naive_bayes import MultinomialNB as MNB, BernoulliNB as BNB

path = '...'
os.chdir(path)

d = {}
test_size = 0.1

datasets = []
for i in os.listdir(os.getcwd()):
    datasets.append(i)

# each id seeds KFold's random_state, giving len(folds) separate 10-fold cross-validation runs
folds = [1, 2, 3, 4, 5, 7, 23, 66, 123, 2018]
# or keep a single seed for just one 10-fold cross-validation run
folds = [23]

# note: loss='log' was renamed to 'log_loss' in scikit-learn 1.1
learners = [SGD(loss='log'), SGD(loss='modified_huber'),
            SGD(loss='log', penalty='l1'), SGD(loss='log', penalty='elasticnet'),
            SGD(loss='modified_huber', penalty='l1'), SGD(loss='modified_huber', penalty='elasticnet'),
            MNB(), BNB()]

for t in learners:
    
    l = []
    print('####\t', t, '\t####')
    for x in range(0, len(datasets)):

        lea = copy.deepcopy(t)
        acc = []
        stdev = []
        
        dataframe = read_csv(datasets[x], skiprows=1, header=None)
        dataframe = dataframe.dropna()
        dataset = dataframe.values
        print()
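
A hedged sketch of the evaluation this loop appears to build toward, following the comment about seeding KFold (the feature/label column split and the acc/stdev aggregation are assumptions):

from sklearn.model_selection import KFold, cross_val_score

for seed in folds:
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    scores = cross_val_score(lea, dataset[:, :-1], dataset[:, -1], cv=kf)
    acc.append(scores.mean())
    stdev.append(scores.std())
print('mean accuracy: %.4f (+/- %.4f)' % (sum(acc) / len(acc), sum(stdev) / len(stdev)))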