Example #1
import nltk
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC


def funcn():
    f = open("amazon_data.txt")
    pos_tweets = list()
    neg_tweets = list()
    for line in f:
        words = line.split("\t")
        if words[1] == '0\n' or words[1] == '0':
            neg_tweets.append(words)
        else:
            pos_tweets.append(words)
    f.close()

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = nltk.classify.apply_features(extract_features, tweets)
    nb_classifier = nltk.NaiveBayesClassifier.train(training_set)

    bernoulli_classifier = SklearnClassifier(BernoulliNB()).train(training_set)

    tweet = 'it is not bad'
    print(nb_classifier.classify(extract_features(tweet.split())))
    print(bernoulli_classifier.classify(extract_features(tweet.split())))

    svc_classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)
    print(svc_classifier.classify(extract_features(tweet.split())))
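For reference, the SklearnClassifier plumbing that all of these examples share reduces to a small API: train() takes a list of (feature-dict, label) pairs, while classify() and classify_many() take feature dicts. A minimal self-contained sketch (the toy data is invented for illustration):

from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

# Toy (feature-dict, label) pairs in the format train() expects.
toy_train = [({'contains(good)': True, 'contains(bad)': False}, 'pos'),
             ({'contains(good)': False, 'contains(bad)': True}, 'neg')]

clf = SklearnClassifier(BernoulliNB()).train(toy_train)
print(clf.classify({'contains(good)': True, 'contains(bad)': False}))  # one featureset
print(clf.classify_many([{'contains(bad)': True, 'contains(good)': False}]))  # a list of them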
Example #2
def treina_classificadores():
    posdados = []  # positive examples
    with open('./dadostreino/train_EPTC_POA_v3nbal_1.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            posdados.append(val[0])
    negdados = []  # negative examples
    with open('./dadostreino/train_EPTC_POA_v3nbal_0.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            negdados.append(val[0])
    neudados = []  # neutral examples
    with open('./dadostreino/train_EPTC_POA_v3nbal_2.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            neudados.append(val[0])
    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats
    # Maximum Entropy
    classificadorME = MaxentClassifier.train(treino,
                                             'GIS',
                                             trace=0,
                                             encoding=None,
                                             labels=None,
                                             gaussian_prior_sigma=0,
                                             max_iter=1)
    #SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return ([classificadorME, classificadorSVM, classificadorNB])
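The three trained classifiers come back as a list, so a hypothetical caller could apply them side by side; this sketch assumes the project's own bag_of_words() helper is in scope and uses invented tokens:

# Hypothetical usage of treina_classificadores(); for illustration only.
classificadores = treina_classificadores()
feats = bag_of_words(['transito', 'lento'])
for clf in classificadores:
    print(clf.classify(feats))  # every NLTK classifier exposes classify()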
Example #3
def m_train():
    train = []
    with codecs.open('data/train_chunked_double.data',
                     mode='r',
                     encoding='UTF-8') as file:
        for line in file.readlines():
            line = line.rstrip('\r\n')
            pair = line.split(',')

            e = pair[0]
            z = pair[1]

            for j in range(len(z)):
                x = gen_x(e, z, j)
                y = z[j]
                train.append((x, y))

    try:
        clas = SklearnClassifier(
            LogisticRegression(solver='lbfgs', n_jobs=-1,
                               max_iter=200)).train(train)
        save_model(clas)
        return clas

    except Exception as e:
        print('Error: %r' % e)
        return None
Example #4
 def test_bernollinb_returns_correct_result(self):
     train_data = [
         ({"a": 4, "b": 1, "c": 0}, "ham"),
         ({"a": 5, "b": 2, "c": 1}, "ham"),
         ({"a": 0, "b": 3, "c": 4}, "spam"),
         ({"a": 5, "b": 1, "c": 1}, "ham"),
         ({"a": 1, "b": 4, "c": 3}, "spam"),
     ]
     classif = SklearnClassifier(BernoulliNB()).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
Example #5
def trainClassifier(trainData):
    # Wrap a scikit-learn LinearSVC (with its hyperparameters spelled out)
    # behind NLTK's classifier interface via a single-step Pipeline.
    pipeline = Pipeline([
        ('svc', LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
                          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
                          verbose=0)),
    ])
    return SklearnClassifier(pipeline).train(trainData)
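A hypothetical call, to make the expected input format concrete (the feature dicts are invented; the imports used above are assumed to be in scope):

# Toy usage of trainClassifier(); any (feature-dict, label) pairs work.
toy_data = [({'fast': 1, 'reliable': 1}, 'pos'),
            ({'slow': 1, 'buggy': 1}, 'neg')]
clf = trainClassifier(toy_data)
print(clf.classify({'fast': 1}))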
Example #6
def searchLinearSVC(title, train_departments):
    """
    Linear SVC: train an SVC with a linear kernel on train_departments,
    then classify the featureset built from title.
    :param title: text to classify
    :param train_departments: labelled training featuresets
    :return: [label, probability, accuracy, classify time, training time]
    """
    timeTraining = time.time()
    # sklearn's LinearSVC has no probability option, hence SVC(kernel='linear'):
    #classifier = SklearnClassifier(LinearSVC(probability=True))
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]
Example #7
def searchSGDClassifier_classifier(title, train_departments):
    """
    SGD classifier: train a stochastic-gradient-descent model with logistic
    loss on train_departments, then classify the featureset built from title.
    :param title: text to classify
    :param train_departments: labelled training featuresets
    :return: [label, probability, accuracy, classify time, training time]
    """
    timeTraining = time.time()
    # loss='log' makes this a probabilistic model, enabling prob_classify below.
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]
Example #8
File: test_svc.py Project: Lingwars/GAPLEN
 def test_svc_returns_correct_result(self):
     train_data = [
         ({"a": 4, "b": 1, "c": 0}, "ham"),
         ({"a": 5, "b": 2, "c": 1}, "ham"),
         ({"a": 0, "b": 3, "c": 4}, "spam"),
         ({"a": 5, "b": 1, "c": 1}, "ham"),
         ({"a": 1, "b": 4, "c": 3}, "spam"),
     ]
     classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
Example #9
def run_program(is_testing, mode):
    """########## CHECKING WHAT THE PROGRAM IS GOING TO EXECUTE ##########"""
    print(" ")
    print(print_vals(is_testing, mode))
    """###################################################################"""

    iteration = 0

    file_path = ''
    if is_testing:
        file_path = 'Data/datasets/test.csv'
    else:
        file_path = 'Data/datasets/training.csv'

    load_csv(file_path, mode)
    features = feature_choices()
    number_of_labels = len(labels)
    weighted_data = select_features(features)
    print("Training Classifier: ")
    classifier = SklearnClassifier(
        LinearSVC(loss='squared_hinge', max_iter=999999)).train(weighted_data)

    # make_predictions()
    return None
Example #10
def train_and_save_model(data_set_name="NB_Model_Tatoeba_", n=2):
    trainingset = []
    for i, label in enumerate(targets):
        features = text_features(data[i], n)
        trainingset.append((features, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    save(data_set_name + str(n) + "n", classifier)
    return classifier
Example #11
 def train_using_SklearnClassifier(self, training_data, test_data):
     #   Giving bad results. Don't use.
     classifier = SklearnClassifier(BernoulliNB()).train(training_data)
     classifier2 = SklearnClassifier(SVC(),
                                     sparse=False).train(training_data)
     print(classifier)
     classifier_name = type(classifier).__name__
     training_set_accuracy = nltk.classify.accuracy(classifier,
                                                    training_data)
     training_set_accuracy2 = nltk.classify.accuracy(
         classifier2, training_data)
     test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
     test_set_accuracy2 = nltk.classify.accuracy(classifier2, test_data)
     print(">>>>>>>>")
     print(training_set_accuracy, test_set_accuracy)
     print(training_set_accuracy2, test_set_accuracy2)
     return classifier, classifier_name, test_set_accuracy, training_set_accuracy
Example #12
def LG_gender(train_set, test_set):
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    print(sklearn_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(sklearn_classifier, test_set))
Example #13
def main():
    """Main."""
    from sklearn.svm import SVC
    from nltk.classify import SklearnClassifier

    classifier = SklearnClassifier(SVC(kernel="rbf"), sparse=False)

    _train(classifier)
    _test(classifier)
Example #14
def LG_gender(train_set):
    print('== SkLearn MaxEnt ==')

    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    return sklearn_classifier
Example #15
def searchNuSVC_classifier(title, train_departments):
    """
    Nu-Support Vector Classification.
    :param title:
    :param train_departments:
    :return:
    """
    classifier = SklearnClassifier(NuSVC())
    classifier.train(train_departments)
    test_sent_features = word_feats(title)
    return classifier.classify(test_sent_features)
Example #16
File: classify.py Project: ksuhr1/CMPS143
def train_scikit_model(best_features, feature_set, split_name,
                       classifier_name):

    #train on the training data of word_features

    #find which classifier model to use
    if classifier_name == "nb":
        cls = nltk.classify.NaiveBayesClassifier.train(best_features)
    elif classifier_name == "nb_sk":
        cls = SklearnClassifier(BernoulliNB()).train(best_features)
    elif classifier_name == "dt":
        cls = nltk.classify.DecisionTreeClassifier.train(best_features)
    elif classifier_name == "dt_sk":
        cls = SklearnClassifier(
            tree.DecisionTreeClassifier()).train(best_features)
    elif classifier_name == "svm_sk" or classifier_name == "svm":
        cls = SklearnClassifier(svm.SVC()).train(best_features)
    else:
        assert False, "unknown classifier name:{}; known names: nb, dt, svm, nb_sk, dt_sk, svm_sk".format(
            classifier_name)
    return cls
Example #17
def read(filename):
    fp = open(filename, "r")
    f = fp.readlines()
    vocab = [s.split() for s in f]
    #print vocab
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)
    #print voc_vec.syn0.shape
    #print type(voc_vec['yav'])
    #Openning data file
    fp.close()
    fp = open("test_data.txt", "r")
    f = fp.read()
    tokens = nltk.word_tokenize(f)
    D = OrderedDict()
    sentences = []
    #print len(tokens)
    for word in tokens[0:200]:
        D[word.split("|")[0]] = word.split("|")[1]
        sentences.append(word.split("|")[0])
    #print D

    train_data = []

    for key in D:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        train_data.append((x, D[key]))
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    #print train_data

    test_data = []
    D2 = OrderedDict()
    for word in tokens[200:300]:
        D2[word.split("|")[0]] = word.split("|")[1]
    expected_list = []
    for key in D2:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        test_data.append(x)
        expected_list.append(D2[key])
    predicted = classif.classify_many(test_data)
    print(len(predicted))
    print(len(expected_list))
    print(accuracy_score(expected_list, predicted, normalize=False))
Example #18
def predict_nltk(in_text='', n=2):
    ''' Text language classification:
        use scikit-learn classifiers from within NLTK
        to classify new text based on the training set.
    '''
    trainingset = []
    for label in text:
        # use the same n-gram size for training and prediction
        features = text_features(text[label], n=n)
        trainingset.append((features, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    print('Language:', lang)
Example #19
def create_classifier(featx):

    pos_data = pickle.load(
        open(os.path.join(config.pkl_path, 'pos_reviews.pkl'), 'rb'))
    neg_data = pickle.load(
        open(os.path.join(config.pkl_path, 'neg_reviews.pkl'), 'rb'))

    pos_test_data = pickle.load(
        open(os.path.join(config.pkl_path, 'test_pos_reviews.pkl'), 'rb'))
    neg_test_data = pickle.load(
        open(os.path.join(config.pkl_path, 'test_neg_reviews.pkl'), 'rb'))

    print(len(pos_data), '---++---', len(neg_data))
    pos_features = [(featx(w_lst), 'pos') for w_lst in pos_data]
    neg_features = [(featx(w_lst), 'neg') for w_lst in neg_data]

    pos_test_features = [(featx(w_lst), 'pos') for w_lst in pos_test_data]
    neg_test_features = [(featx(w_lst), 'neg') for w_lst in neg_test_data]

    pos_features.extend(neg_features)
    train_set = pos_features

    pos_test_features.extend(neg_test_features)
    test_set = pos_test_features

    print(train_set is None, '---train_set----', len(train_set))
    print(test_set is None, '-----test_set--', len(test_set))
    """
    Train two classifiers.
    """
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    nba = nltk.classify.accuracy(nb_classifier, test_set)
    print "NBayes accuracy is %.7f" % nba  # 86.78%

    svm_classifier = SklearnClassifier(LinearSVC()).train(train_set)
    svmm = nltk.classify.accuracy(svm_classifier, test_set)
    print "svm_classifier accuracy is %.7f" % svmm  # 89.124%
    """
    保存准确率更大的那个模型
    """
    classifier_pkl = os.path.join(config.pkl_path, 'my_classifier.pkl')  # 消极语料
    with open(classifier_pkl, 'wb') as f:
        if nba > svmm:
            pickle.dump(nb_classifier, f)
            print('NBayes')
        else:
            pickle.dump(svm_classifier, f)
            print('SVM')

    print('done!')
Example #20
 def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
     # "Thumbs up? Sentiment Classification using Machine Learning Techniques
     classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
     if classifier_type in classifier_list:
         self.classifier_type = classifier_type
     else:
         print("Classifier Type is not implemented: " + classifier_type)
     if self.classifier_type == 'MaximumEntropy':
         self.classifier = MaxentClassifier
     elif self.classifier_type == 'SVM':
         self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
     elif self.classifier_type == 'NaiveBayes':
         self.classifier = NaiveBayesClassifier
     self.feats = feats
Example #21
def classifyUsingSVM(feature):
    # Define folds for cross validation
    kf = cross_validation.KFold(len(feature), n_folds=5, shuffle=False)
    features = numpy.array(feature)
    max2 = 0
    SVMmodel = SklearnClassifier(SVC())
    #logreg = linear_model.LogisticRegression(C=1e5)

    for x, y in kf:
        train_set = list(features[x])
        test_set = list(features[y])
        # SV Classifier
        classifier2 = SklearnClassifier(SVC()).train(train_set)
        #classifier3 = MultinomialNB()
        #classifier3.train(train_set)
        accuracy2 = nltk.classify.accuracy(classifier2, test_set) * 100
        #accuracy3 = nltk.classify.accuracy(classifier3, test_set) * 100
        # Keep the best model across folds
        if accuracy2 > max2:
            max2 = accuracy2
            SVMmodel = classifier2
    return SVMmodel
Example #22
def build_classifier(classifier_name):
    """
    Accepted names: nb, dt, svm, sk_nb, sk_dt, sk_svm

    svm and sk_svm will return the same type of classifier.

    :param classifier_name:
    :return:
    """
    if classifier_name == "nb":
        cls = nltk.classify.NaiveBayesClassifier
    elif classifier_name == "nb_sk":
        cls = SklearnClassifier(BernoulliNB())
    elif classifier_name == "dt":
        cls = nltk.classify.DecisionTreeClassifier
    elif classifier_name == "dt_sk":
        cls = SklearnClassifier(tree.DecisionTreeClassifier())
    elif classifier_name == "svm_sk" or classifier_name == "svm":
        cls = SklearnClassifier(svm.SVC())
    else:
        assert False, "unknown classifier name:{}; known names: nb, nb_sk, dt, dt_sk, svm, svm_sk".format(
            classifier_name)
    return cls
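Note that build_classifier returns an untrained classifier (for "nb" and "dt" it is the NLTK class itself rather than an instance), so the caller still invokes train(). A sketch with invented toy data:

# Both the NLTK classes and the SklearnClassifier wrappers expose train(),
# so the result can be used uniformly; the data below is invented.
toy_data = [({'a': 1, 'b': 0}, 'x'), ({'a': 0, 'b': 1}, 'y')]
cls = build_classifier('nb_sk')
trained = cls.train(toy_data)  # for "nb"/"dt", train() returns a new instance
print(trained.classify({'a': 1, 'b': 0}))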
Example #23
def train(records):
    global CUR_CL
    train_data = []
    for record in records:
        text = record[1]
        class_label = record[0]
        feats = features_from_text(text, class_label, stopwords=sw)
        train_data.append(feats)
    if CUR_CL is None:
        if CLASSIFIER == 'NaiveBayesClassifier':
            classifier = NaiveBayesClassifier.train(train_data)
        elif CLASSIFIER == 'sklearnLinSVC':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('svm', LinearSVC(multi_class='ovr'))])
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == 'BernoulliNB':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', BernoulliNB())])
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == 'MultinomialNB':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print('Partial fitting.. \n\n')
        CUR_CL.train(train_data)
    f = open("%s/%s.pickle" % (pickles_dir, 'news_based_' + CLASSIFIER), 'wb')
    pickle.dump(CUR_CL, f)
    f.close()
    print"%s/%s.pickle saved" % (pickles_dir, 'news_based_' + CLASSIFIER)

    gc.collect()
Example #24
    def trainModel(self, size):
        neg_training = self.extract_features(self.mr, self.data['neg'][:size], 'neg',
                                             feature_extractor=self.unigram_features)

        pos_training = self.extract_features(self.mr, self.data['pos'][:size], 'pos',
                                             feature_extractor=self.unigram_features)

        train_set = pos_training + neg_training

        classif = SklearnClassifier(SVC(), sparse=False).train(train_set)

        self.classif = classif
Example #25
    def __init__(self, load_clf=False, load_tr_data=False):
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training Data loading
        if load_tr_data:
            self.__load_training_data()
Example #26
 def SKClassifierSVM(self, dati):

     try:
         train, test = self.__CreaDatasetTrainTest(dati)
         classifier = SklearnClassifier(LinearSVC()).train(train)

         print("ACCURACY SVM:", nltk.classify.accuracy(classifier, test))

         return classifier

     except Exception as e:
         print('error in SVM:', e)
Example #27
    def __init__(self, kernel: str = "") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline. For a full
        # description check out the model_naive_bayes_baselines source file.
        self.tokenizer = TweetTokenizer(preserve_case=False,
                                        reduce_len=True,
                                        strip_handles=True).tokenize

        # Here we create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline. For a full description check
        # out the model_naive_bayes_baselines source file.
        # The SVC sets up a Support Vector Machine classifier with the configured kernel.
        # In this case it is either a linear or a radial basis function kernel.
        # The details for the above items are discussed in the model's readme.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('{}svc'.format(kernel), SVC(kernel=kernel))])
        self.classif = SklearnClassifier(pipeline)
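Training happens later through self.classif.train(...); a minimal sketch of hypothetical companion methods, assuming (raw_tweet, label) pairs as input (the fit/predict names are invented, the SklearnClassifier calls are not):

    # Hypothetical companion methods for the class above, for illustration only.
    def fit(self, labelled_tweets):
        # labelled_tweets: list of (raw_tweet, label) pairs (assumed format)
        feats = [(dict.fromkeys(self.tokenizer(t), True), label)
                 for t, label in labelled_tweets]
        self.classif.train(feats)

    def predict(self, tweets):
        feats = [dict.fromkeys(self.tokenizer(t), True) for t in tweets]
        return self.classif.classify_many(feats)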
Example #28
    def _train(self):
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            gbc_clf = GradientBoostingClassifier(n_estimators=1000)
            self._classifier = SklearnClassifier(gbc_clf,
                                                 sparse=False).train(train_set)

            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)
Example #29
    def _train(self):
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('svc', SVC(kernel='linear', probability=True))])
            self._classifier = SklearnClassifier(pipeline, sparse=False).train(train_set)

            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)
Example #30
    def __init__(self):
        self.pre_pro = TweetPreprocessor()
        self.classifier = SklearnClassifier(MultinomialNB(alpha=1.375))

        neg_twts = [(self.process_tweet(twt), "negative")
                    for twt in twitter_samples.strings('negative_tweets.json')]

        pos_twts = [(self.process_tweet(twt), "positive")
                    for twt in twitter_samples.strings('positive_tweets.json')]

        all_twts = neg_twts + pos_twts

        acc_scores, confusion_matrix = self.cross_validate(self.classifier, all_twts, 10)
        self.classifier.train(all_twts)
        print("Initialised classifier with an accuracy of {:.2f}%, +/- {:.2f}%"
              .format(mean(acc_scores) * 100, stdev(acc_scores) * 2 * 100))
        print("Confusion matrix: \n{}".format(confusion_matrix))