Example #1
def treina_classificadores():
    # Load positive / negative / neutral training samples (text mode for csv).
    posdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_1.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            posdados.append(val[0])
    negdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_0.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            negdados.append(val[0])
    neudados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_2.data', 'r') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            neudados.append(val[0])
    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats
    #'Maximum Entropy'
    classificadorME = MaxentClassifier.train(treino,
                                             'GIS',
                                             trace=0,
                                             encoding=None,
                                             labels=None,
                                             gaussian_prior_sigma=0,
                                             max_iter=1)
    #SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return ([classificadorME, classificadorSVM, classificadorNB])
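Note: Example #1 calls bag_of_words and divide, which are defined elsewhere in that project. A minimal sketch of what they would typically look like (both function bodies below are assumptions, not the original code):

def divide(linhas):
    # Assumed helper: split each raw training line into tokens.
    return [linha.split() for linha in linhas]

def bag_of_words(palavras):
    # Assumed helper: the {token: True} featureset format NLTK classifiers expect.
    return {palavra: True for palavra in palavras}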
Example #2
File: test_svc.py Project: Lingwars/GAPLEN
 def test_svc_returns_correct_result(self):
     train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                   ({"a": 5, "b": 2, "c": 1}, "ham"),
                   ({"a": 0, "b": 3, "c": 4}, "spam"),
                   ({"a": 5, "b": 1, "c": 1}, "ham"),
                   ({"a": 1, "b": 4, "c": 3}, "spam")]
     classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
Example #3
 def test_bernollinb_returns_correct_result(self):
     train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                   ({"a": 5, "b": 2, "c": 1}, "ham"),
                   ({"a": 0, "b": 3, "c": 4}, "spam"),
                   ({"a": 5, "b": 1, "c": 1}, "ham"),
                   ({"a": 1, "b": 4, "c": 3}, "spam")]
     classif = SklearnClassifier(BernoulliNB()).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
Example #4
def LG_gender(train_set, test_set):
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    print(sklearn_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(sklearn_classifier, test_set))
Example #5
class MachineLearningNLP:
    def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
        # "Thumbs up? Sentiment Classification using Machine Learning Techniques
        classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
        if classifier_type in classifier_list:
            self.classifier_type = classifier_type
        else:
            print("Classifier Type is not implemented: " + classifier_type)
        if self.classifier_type == 'MaximumEntropy':
            self.classifier = MaxentClassifier
        elif self.classifier_type == 'SVM':
            self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
        elif self.classifier_type == 'NaiveBayes':
            self.classifier = NaiveBayesClassifier
        self.feats = feats

    def convert_txt(self, file_neg, file_pos):
        negfeats = list(map(self.feats, word_preprocess(file_neg)))
        posfeats = list(map(self.feats, word_preprocess(file_pos)))

        negfeats = list(zip(negfeats, ['neg'] * len(negfeats)))
        posfeats = list(zip(posfeats, ['pos'] * len(posfeats)))
        #        negfeats = [(self.feats(f), 'neg') for f in word_preprocess(file_neg)]
        #        posfeats = [(self.feats(f), 'pos') for f in word_preprocess(file_pos)]
        return (negfeats, posfeats)

    def train(self, train_data, **kwargs):
        self.classifier = self.classifier.train(train_data, **kwargs)

    def predict(self, test_data):
        return [self.classifier.classify(feats) for feats, label in test_data]

    def annotate(self, text):
        assert isinstance(text, str)
        text_Encoded = self.feats(text.split())
        return self.classifier.classify(text_Encoded)

    def performance(self, test_data):
        prediction = self.predict(test_data)
        pos_loc = set(
            [i for i in range(len(prediction)) if prediction[i] == 'pos'])
        neg_loc = set(range(len(prediction))) - pos_loc
        pos_ref = set(
            [i for i in range(len(prediction)) if test_data[i][1] == 'pos'])
        neg_ref = set(range(len(prediction))) - pos_ref
        print('===============================\n')
        print('Model Summary:\n')
        print(self.classifier_type + ' with features ' + self.feats.__name__ +
              '\n')
        print('Overall Accuracy: %.3f\n' %
              (nltk.classify.util.accuracy(self.classifier, test_data)))
        print('Positive Precision: %.3f\n' %
              (nltk.precision(pos_ref, pos_loc)))
        print('Positive Recall: %.3f\n' % (nltk.recall(pos_ref, pos_loc)))
        print('Negative Precision: %.3f\n' %
              (nltk.precision(neg_ref, neg_loc)))
        print('Negative Recall: %.3f\n' % (nltk.recall(neg_ref, neg_loc)))
Example #6
class ModelGenerator(object):

    def __init__(self):
        self.pre_pro = TweetPreprocessor()
        self.classifier = SklearnClassifier(MultinomialNB(alpha=1.375))

        neg_twts = [(self.process_tweet(twt), "negative")
                    for twt in twitter_samples.strings('negative_tweets.json')]

        pos_twts = [(self.process_tweet(twt), "positive")
                    for twt in twitter_samples.strings('positive_tweets.json')]

        all_twts = neg_twts + pos_twts

        acc_scores, confusion_matrix = self.cross_validate(self.classifier, all_twts, 10)
        self.classifier.train(all_twts)
        print("Initialised classifier with an accuracy of {:.2f}%, +/- {:.2f}%"
              .format(mean(acc_scores) * 100, stdev(acc_scores) * 2 * 100))
        print("Confusion matrix: \n{}".format(confusion_matrix))

    def process_tweet(self, tweet):
        words = self.pre_pro.tokenise_tweet(tweet)
        words_wo_htgs = [self.pre_pro.strip_hash(word) for word in words]
        useful_words = [w for w in words_wo_htgs if self.pre_pro.is_useful_word(w)]

        stemmed_words = [self.pre_pro.stem(word) for word in useful_words]
        return self.pre_pro.create_word_features(stemmed_words)

    def persist(self):
        pickle.dump(self.classifier, open("model.p", "wb"))

    @staticmethod
    def cross_validate(algo, data, num_folds):
        acc_scores = []
        predicted_results = []
        actual_results = []

        for i in range(0, num_folds):
            train_data = copy(data)
            test_data = train_data[i::num_folds]
            # stratifies the data by picking out every nth element, with increasing offset
            del train_data[i::num_folds]
            # removes the test data from the training dataset

            trained_algo = algo.train(train_data)
            accuracy = nltk.classify.util.accuracy(trained_algo, test_data)
            acc_scores.append(accuracy)

            for td in test_data:
                predicted_results.append(trained_algo.classify(td[0]))
                actual_results.append(td[1])

        confusion_mat = nltk.ConfusionMatrix(actual_results, predicted_results)
        return acc_scores, confusion_mat
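Note: the fold construction in cross_validate relies on Python's extended slicing: data[i::num_folds] picks every num_folds-th element starting at offset i, so each pass holds out a different interleaved slice. A quick standalone illustration:

from copy import copy

data = list(range(9))
for i in range(3):
    train = copy(data)
    test = train[i::3]   # i=0 -> [0, 3, 6]; i=1 -> [1, 4, 7]; i=2 -> [2, 5, 8]
    del train[i::3]      # the remaining six elements form the training fold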
Example #7
def searchNuSVC_classifier(title, train_departments):
    """
    Nu-Support Vector Classification.
    :param title:
    :param train_departments:
    :return:
    """
    classifier = SklearnClassifier(NuSVC())
    classifier.train(train_departments)
    test_sent_features = word_feats(title)
    return classifier.classify(test_sent_features)
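Note: word_feats is not defined in this snippet. A common minimal version (an assumption here, mirroring typical NLTK feature extractors) is:

def word_feats(text):
    # Assumed helper: one boolean feature per lowercased token.
    return {word: True for word in text.lower().split()}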
Example #8
def read(filename):
    fp = open(filename, "r")
    f = fp.readlines()
    vocab = [s.split() for s in f]
    #print vocab
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)
    #print voc_vec.syn0.shape
    #print type(voc_vec['yav'])
    #Openning data file
    fp.close()
    fp = open("test_data.txt", "r")
    f = fp.read()
    tokens = nltk.word_tokenize(f)
    D = OrderedDict()
    sentences = []
    #print len(tokens)
    for word in tokens[0:200]:
        D[word.split("|")[0]] = word.split("|")[1]
        sentences.append(word.split("|")[0])
    #print D

    train_data = []

    for key in D:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        train_data.append((x, D[key]))
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    #print train_data

    test_data = []
    D2 = OrderedDict()
    for word in tokens[200:300]:
        D2[word.split("|")[0]] = word.split("|")[1]
    expected_list = []
    for key in D2:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        test_data.append(x)
        expected_list.append(D2[key])
    predicted = classif.classify_many(test_data)
    print(len(predicted))
    print(len(expected_list))
    print(accuracy_score(expected_list, predicted, normalize=False))
Example #9
def predict_nltk(in_text='', n=2): 
    ''' Text language classification.
        Uses scikit-learn classifiers from within NLTK
        to classify new text based on the training set.
    '''
    trainingset = []  
    for label in text:
        featurs = text_features(text[label])
        trainingset.append((featurs, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    print('Language:', lang)
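Note: text_features and the text corpus are defined elsewhere in that project. Given the n parameter, a plausible character n-gram version (hypothetical sketch) is:

from collections import Counter

def text_features(txt, n=2):
    # Assumed helper: counts of character n-grams, a standard feature set
    # for language identification.
    return dict(Counter(txt[i:i + n] for i in range(len(txt) - n + 1)))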
Example #11
class LinearSVC2Model(SKLearnModel):
    """This model classifies tweets into any one of twenty classes
    using SVM classification.
    """

    def __init__(self, balanced=False, C=1.0, dual=True, tol=1e-4, max_iter=1000, loss="squared_hinge") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline. For a full
        # description, check out the model_naive_bayes_baselines source file.
        self.tokenizer = TweetTokenizer(preserve_case=False,
                                        reduce_len=True,
                                        strip_handles=True).tokenize

        # set class_weight to None unless the 'balanced' has been set to true in the config
        class_weight = None  # type: Optional[str]
        if balanced:
            class_weight = "balanced"

        # Here we create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline. For a full description,
        # check out the model_naive_bayes_baselines source file.
        # The LinearSVC sets up a linear Support Vector Machine classifier. This differs from
        # using SVC with a linear kernel because it uses liblinear as a backend instead of
        # libsvm, which makes it run a lot faster.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('linearsvc', LinearSVC(class_weight=class_weight, C=C,
                                                     dual=dual, tol=tol, max_iter=max_iter,
                                                     loss=loss))])
        self.classif = SklearnClassifier(pipeline)

    @staticmethod
    def get_extra_configs():
        configs = [{"name": "balanced", "default": False},
                   {"name": "C", "default": 1.0},
                   {"name": "dual", "default": True},
                   {"name": "tol", "default": 1e-4},
                   {"name": "max_iter", "default": 1000},
                   {"name": "loss", "default": "squared_hinge"}]  # add config for balanced.
        return super(LinearSVC2Model, LinearSVC2Model).get_extra_configs() + configs

    def train(self, tweets: List[Tweet]) -> None:
        def tweet_to_tuple(x):
            return (FreqDist(self.tokenizer(x.text)), x.emoji)

        # Generates tuples of all the tweets to form the corpus
        corpus = map(tweet_to_tuple, tweets)

        # Train this model!
        self.classif.train(corpus)

    def predict(self, text):
        return self.classif.classify(FreqDist(self.tokenizer(text)))
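Note: for reference, the FreqDist featuresets that train() builds from each tweet look like this (stand-in tweet text; the Tweet class itself lives elsewhere):

from nltk import FreqDist
from nltk.tokenize import TweetTokenizer

tokenize = TweetTokenizer(preserve_case=False, reduce_len=True,
                          strip_handles=True).tokenize
features = FreqDist(tokenize("@user Loving this! #happy"))
# strip_handles drops @user, leaving {'loving': 1, 'this': 1, '!': 1, '#happy': 1}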
Example #12
 def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
     # "Thumbs up? Sentiment Classification using Machine Learning Techniques"
     classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
     if classifier_type in classifier_list:
         self.classifier_type = classifier_type
     else:
         raise ValueError("Classifier type is not implemented: " + classifier_type)
     if self.classifier_type == 'MaximumEntropy':
         self.classifier = MaxentClassifier
     elif self.classifier_type == 'SVM':
         self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
     elif self.classifier_type == 'NaiveBayes':
         self.classifier = NaiveBayesClassifier
     self.feats = feats
Example #13
    def __init__(self, load_clf=False, load_tr_data=False):
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training Data loading
        if load_tr_data:
            self.__load_training_data()
Example #14
def run_program(is_testing, mode):
    """########## CHECKING WHAT THE PROGRAM IS GOING TO EXECUTE ##########"""
    print(" ")
    print(print_vals(is_testing, mode))
    """###################################################################"""


    file_path = ''
    if is_testing:
        file_path = 'Data/datasets/test.csv'
    else:
        file_path = 'Data/datasets/training.csv'

    load_csv(file_path, mode)
    features = feature_choices()
    number_of_labels = len(labels)
    weighted_data = select_features(features)
    print("Training Classifier: ")
    classifier = SklearnClassifier(
        LinearSVC(loss='squared_hinge', max_iter=999999)).train(weighted_data)

    # make_predictions()
    return None
Example #15
def trainClassifier(trainData):
    
    pipeline = Pipeline([('svc', LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))])
    return SklearnClassifier(pipeline).train(trainData)
Example #16
    def __init__(self, kernel: str = "") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline. For a full
        # description, check out the model_naive_bayes_baselines source file.
        self.tokenizer = TweetTokenizer(preserve_case=False,
                                        reduce_len=True,
                                        strip_handles=True).tokenize

        # Here we create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline. For a full description,
        # check out the model_naive_bayes_baselines source file.
        # The SVC sets up a Support Vector Machine classifier with the configured kernel.
        # In this case it is either a linear or a radial basis function kernel.
        # The details for the above items are discussed in the model's readme.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('{}svc'.format(kernel), SVC(kernel=kernel))])
        self.classif = SklearnClassifier(pipeline)
Example #17
def m_train():
    train = []
    with codecs.open('data/train_chunked_double.data',
                     mode='r',
                     encoding='UTF-8') as file:
        for line in file.readlines():
            line = line.strip('\n')
            line = line.strip('\r')
            pair = line.split(',')

            e = pair[0]
            z = pair[1]

            for j in range(len(z)):
                x = gen_x(e, z, j)
                y = z[j]
                train.append((x, y))

    try:
        clas = SklearnClassifier(
            LogisticRegression(solver='lbfgs', n_jobs=-1,
                               max_iter=200)).train(train)
        save_model(clas)
        return clas

    except Exception as e:
        print('Error: %r' % e)
        return None
Example #18
def train_and_save_model(data_set_name="NB_Model_Tatoeba_", n=2):
    trainingset = []
    for i, label in enumerate(targets):
        featurs = text_features(data[i], n)
        trainingset.append((featurs, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    save(data_set_name + str(n) + "n", classifier)
    return classifier
Example #19
    def __init__(self):
        self.pre_pro = TweetPreprocessor()
        self.classifier = SklearnClassifier(MultinomialNB(alpha=1.375))

        neg_twts = [(self.process_tweet(twt), "negative")
                    for twt in twitter_samples.strings('negative_tweets.json')]

        pos_twts = [(self.process_tweet(twt), "positive")
                    for twt in twitter_samples.strings('positive_tweets.json')]

        all_twts = neg_twts + pos_twts

        acc_scores, confusion_matrix = self.cross_validate(self.classifier, all_twts, 10)
        self.classifier.train(all_twts)
        print("Initialised classifier with an accuracy of {:.2f}%, +/- {:.2f}%"
              .format(mean(acc_scores) * 100, stdev(acc_scores) * 2 * 100))
        print("Confusion matrix: \n{}".format(confusion_matrix))
Example #20
 def train_using_SklearnClassifier(self, training_data, test_data):
     #   Giving bad results. Don't use.
     classifier = SklearnClassifier(BernoulliNB()).train(training_data)
     classifier2 = SklearnClassifier(SVC(),
                                     sparse=False).train(training_data)
     print(classifier)
     classifier_name = type(classifier).__name__
     training_set_accuracy = nltk.classify.accuracy(classifier,
                                                    training_data)
     training_set_accuracy2 = nltk.classify.accuracy(
         classifier2, training_data)
     test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
     test_set_accuracy2 = nltk.classify.accuracy(classifier2, test_data)
     print(">>>>>>>>")
     print(training_set_accuracy, test_set_accuracy)
     print(training_set_accuracy2, test_set_accuracy2)
     return classifier, classifier_name, test_set_accuracy, training_set_accuracy
Example #21
    def _train(self):
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
            classifier_f.close()
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            gbc_clf = GradientBoostingClassifier(n_estimators=1000)
            self._classifier = SklearnClassifier(gbc_clf,
                                                 sparse=False).train(train_set)

            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)

            save_classifier.close()
Example #22
    def _train(self):
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
            classifier_f.close()
        else:
            train_set = [(self._extract_features(cascade), cascade['label']) for cascade in self._dataset]
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('rf', SVC(kernel='linear', probability=True))])
            self._classifier = SklearnClassifier(pipeline, sparse=False).train(train_set)

            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)

            save_classifier.close()
Example #23
def main():
    """Main."""
    from sklearn.svm import SVC
    from nltk.classify import SklearnClassifier

    classifier = SklearnClassifier(SVC(kernel="rbf"), sparse=False)

    _train(classifier)
    _test(classifier)
Example #24
  def leaveKOutValidation(k=1):
    accuracy = 0.0
    print("Performing leave-"+str(k)+"-out cross-validation")
    gamesClusters = [feats[int(i*k):int((i+1)*k)] for i in range(int(len(feats)/k))]
    for games in gamesClusters:
      training = [x for x in feats if x not in games]

      pipeline = Pipeline([('tfidf', TfidfTransformer()),
        #('chi2', SelectKBest(chi2, k=250)),  
        ('nb', MultinomialNB())])

      classifier = SklearnClassifier(pipeline).train(training)

      for game in games:
        classification = classifier.classify(game[0])
        accuracy += int((game[1] > 0) == (classification > 0)) / float(len(feats))
    print("With leave-"+str(k)+"-out cross-validation, the algorithm is "+str(round(accuracy*100,4))+"% accurate")
Example #25
def LG_gender(train_set):
    print('== SkLearn MaxEnt ==')

    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    return sklearn_classifier
Example #26
def evaluate_classifier(featx, collocationFunc):
    #negFiles = movie_reviews.fileids('neg')
    #posFiles = movie_reviews.fileids('pos')
    #negWordsList=[movie_reviews.words(fileids=[f]) for f in negFiles]
    #posWordsList=[movie_reviews.words(fileids=[f]) for f in posFiles]
    #negfeats = [(featx(negWords), 'neg') for negWords in negWordsList]
    #posfeats = [(featx(posWords), 'pos') for posWords in posWordsList]

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f]),collocationFunc), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f]),collocationFunc), 'pos') for f in posids]

    #lenNegFeats=min(len(negfeats),24)
    #lenPosFeats=min(len(posfeats),24)
    lenNegFeats=len(negfeats)
    lenPosFeats=len(posfeats)
    negcutoff = int(lenNegFeats*3/4)
    poscutoff = int(lenPosFeats*3/4)
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:lenNegFeats] + posfeats[poscutoff:lenPosFeats]
 
    #classifier = MaxentClassifier.train(trainfeats)
    classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
    evaluationMetrics={}
    print(classifier)
    evaluationMetrics['accuracy']=nltk.classify.util.accuracy(classifier, testfeats)
    evaluationMetrics['posPrec']=nltk.precision(refsets['pos'], testsets['pos'])
    evaluationMetrics['posRecall']=nltk.recall(refsets['pos'], testsets['pos'])
    evaluationMetrics['posF_Score']=nltk.f_measure(refsets['pos'], testsets['pos'])
    evaluationMetrics['negPrec']=nltk.precision(refsets['neg'], testsets['neg'])
    evaluationMetrics['negRecall']=nltk.recall(refsets['neg'], testsets['neg'])
    evaluationMetrics['negF_Score']=nltk.f_measure(refsets['neg'], testsets['neg'])
    return evaluationMetrics
Example #27
def bag_of_words_model(df, column_name, target='label', k=1000):
    """
    """
    pos_array = df[(df[target] == 1)][column_name].values
    neg_array = df[(df[target] == 0)][column_name].values

    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=k)),
                         ('nb', MultinomialNB())])
    clf = SklearnClassifier(pipeline)

    pos = [FreqDist(word_list) for word_list in pos_array]
    neg = [FreqDist(word_list) for word_list in neg_array]

    add_label = lambda lst, lab: [(x, lab) for x in lst]

    trained_clf = clf.train(add_label(pos, 1) + add_label(neg, 0))

    return trained_clf
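Note: a minimal usage sketch for bag_of_words_model, assuming a DataFrame whose text column already holds token lists (the column names, toy data, and small k are illustrative):

import pandas as pd

df = pd.DataFrame({
    'words': [['good', 'fun', 'great'], ['bad', 'dull', 'poor']] * 10,
    'label': [1, 0] * 10,
})
clf = bag_of_words_model(df, 'words', target='label', k=5)
print(clf.classify(FreqDist(['good', 'great'])))  # expected: 1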
Example #30
File: test3.py Project: jshenaop/soleka
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            print(trainfeats)
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
Example #31
	def train(self, observations, k=5):
		'''
		An ensemble K-Fold classifier
		'''
		self.forest = []
		splitdata = np.array_split(observations, k)
		combos = list(reversed(list(itertools.combinations(splitdata, k-1))))
		accuracy_sum = 0
		for i in range(k):
			train = list(itertools.chain(*combos[i]))
			test = splitdata[i]
			if k==1:
				train = observations
				test = observations
			c = SklearnClassifier(RandomForestClassifier())
			#c = SklearnClassifier(cls)	
			c.train(train)
			accuracy_sum += nltk.classify.accuracy(c,test)
			self.forest.append(c)

		print('Accuracy on Train data(Using K fold)= ', accuracy_sum/k )
Example #32
File: classify.py Project: ksuhr1/CMPS143
def train_scikit_model(best_features, feature_set, split_name,
                       classifier_name):

    #train on the training data of word_features

    #find which classifier model to use
    if classifier_name == "nb":
        cls = nltk.classify.NaiveBayesClassifier.train(best_features)
    elif classifier_name == "nb_sk":
        cls = SklearnClassifier(BernoulliNB()).train(best_features)
    elif classifier_name == "dt":
        cls = nltk.classify.DecisionTreeClassifier.train(best_features)
    elif classifier_name == "dt_sk":
        cls = SklearnClassifier(
            tree.DecisionTreeClassifier()).train(best_features)
    elif classifier_name == "svm_sk" or classifier_name == "svm":
        cls = SklearnClassifier(svm.SVC()).train(best_features)
    else:
        assert False, "unknown classifier name:{}; known names: nb, dt, svm, nb_sk, dt_sk, svm_sk".format(
            classifier_name)
    return cls
Example #33
    def train(self, observations, k=5):
        '''
        An ensemble K-Fold classifier
        '''
        self.forest = []
        splitdata = np.array_split(observations, k)
        combos = list(reversed(list(itertools.combinations(splitdata, k - 1))))
        accuracy_sum = 0
        for i in range(k):
            train = list(itertools.chain(*combos[i]))
            test = splitdata[i]
            if k == 1:
                train = observations
                test = observations
            c = SklearnClassifier(RandomForestClassifier())
            #c = SklearnClassifier(cls)
            c.train(train)
            accuracy_sum += nltk.classify.accuracy(c, test)
            self.forest.append(c)

        print('Accuracy on Train data(Using K fold)= ', accuracy_sum / k)
Example #34
def searchSGDClassifier_classifier(title, train_departments):
    """

    :param title:
    :param train_departments:
    :return:
    """
    timeTraning = time.time()
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraning = time.time() - timeTraning

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
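Note: prob_classify works in this example only because SGDClassifier was built with a logistic loss; the default hinge loss exposes no predict_proba. A quick check of that constraint (sketch; scikit-learn >= 1.1 spells the logistic loss 'log_loss' rather than 'log'):

from sklearn.linear_model import SGDClassifier

# Logistic loss provides predict_proba, hinge loss does not.
print(hasattr(SGDClassifier(loss='log'), 'predict_proba'))    # True (older versions)
print(hasattr(SGDClassifier(loss='hinge'), 'predict_proba'))  # False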
Example #35
def searchLinearSVC(title, train_departments):
    """
    Linear SVC
    :param title:
    :param train_departments:
    :return:
    """
    timeTraning = time.time()
    #classifier = SklearnClassifier(LinearSVC(probability=True))
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraning = time.time() - timeTraning

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
Example #36
 def ml_sentiment(self, text):
     ''' Machine Learning for Sentiment detection.
     '''
     trainingset = []
     for tweet in self.data:
         trainingset.append(self.sentiment_featrues(tweet))
     #classifier = nltk.NaiveBayesClassifier.train(trainingset)
     #classifier = nltk.DecisionTreeClassifier.train(trainingset)
     classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
     tokenz = self.ml_tag(text, print_tags=False)
     tweet = {
         'tokens': tokenz,
         'sentiment': ''
     } 
     tokenz_features = self.sentiment_featrues(tweet)
     #print tokenz_features
     sentiment = classifier.classify(tokenz_features[0])
     #print text, sentiment
     tweet['sentiment'] = sentiment
     print('\nTweet:', text)
     self.show_tweet(tweet)
     return sentiment
Example #37
class SentimentMNB(SentimentClassifier):
    # Sub class constructor
    def __init__(self, chiK=3368):
        # Call the super class constructor which initializes the classifier
        self.chiK = chiK
        super(SentimentMNB, self).__init__()
        # End func return
        return
    # End wrapper class constructor
    
    # Function to initialize the classifier pipeline
    def initPipeline(self):
        # Pipeline of transformers with a final estimator
        # The pipeline class behaves like a compound classifier
        # pipeline(steps=[...])

        # Old MNB pipeline with TFIDF
        # self.pipeline = Pipeline([('tfidf', TfidfTransformer()),
        #              ('chi2', SelectKBest(chi2, k=1000)),
        #              ('nb', MultinomialNB())])

        self.pipeline = Pipeline([('chi2', SelectKBest(chi2, k=self.chiK)),
                                  ('nb', MultinomialNB())])
        # End func return
        return
    # End initPipeline
        
    # Overriding func to train multinomial NB classifier
    def trainClassifier(self):
        self.initPipeline()
        # Create the multinomial NB classifier
        self.classifier = SklearnClassifier(self.pipeline)
        # Train the classifier
        self.classifier.train(self.trainingSet)
        # End func return
        return
    # End trainClassifier override
# End sub class
Example #38
## Feature indices: 0-Suffix, 1-Previous number, 2-Next number, 3-Previous wordform, 4-Next wordform, 5-Postposition, 6-Present wordform, 7-POS ##

from preprocess_train import features, number
from preprocess_test import features_test, number_test
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
train_data = [[x for x in range(2)] for y in range(357)]
test_data = [[x for x in range(1)] for y in range(11)]
for i in range(0, 357):
    train_data[i][0] = {'Suffix': features[i][0], 'Previous morph': features[i][1],
                        'Next morph': features[i][2], 'Previous wordform': features[i][3],
                        'Next wordform': features[i][4], 'postposition': features[i][5],
                        'wordform': features[i][6], 'pos': features[i][7]}
    train_data[i][1] = number[i]
for i in range(0, 11):
    test_data[i] = {'Suffix': features_test[i][0], 'Previous morph': features_test[i][1],
                    'Next morph': features_test[i][2], 'Previous wordform': features_test[i][3],
                    'Next wordform': features_test[i][4], 'postposition': features_test[i][5],
                    'wordform': features_test[i][6], 'pos': features_test[i][7]}
classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
result = classif.classify_many(test_data)
classif1 = SklearnClassifier(BernoulliNB()).train(train_data)
result1 = classif1.classify_many(test_data)
print(result1)
Example #39
#Determine training, test and dev sets
size = int(round((len(rawData) * 0.15), 0))
random.shuffle(rawData)
testData = rawData[:size]
trainData = rawData[size:]
random.shuffle(trainData)

# Generate TermFrequency for each doc
trainTF = [(FreqDist(tokenize(text)), tag) for text, tag in trainData]
testTF = [(FreqDist(tokenize(text)), tag) for text, tag in testData]

# Create classifier
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)
# Train classifier
classif.train(trainTF)


# Evaluate
testTags = [tag for tf, tag in testTF]
testResults = classif.classify_many([tf for tf, tag in testTF])

right = 0
for i, tg in enumerate(testTags):
    if testResults[i] == tg:
        right += 1

print('Results: ------------------------------------')
print(testResults)
Example #40
def evaluate_bow():
    lines = codecs.open(BC3_LABELLED_FILE, "r").readlines()

    data = []
    gold = []

    for i, line in enumerate(lines):
        tokens = line.strip().split()

        if len(tokens) > 2:
            label = tokens.pop(0)
            tag = tokens.pop(0)

            if tag == "none":
                continue

            if i + 1 < len(lines) and len(lines[i + 1].strip().split()) > 2:
                lines[i + 1].strip().split().pop(0)
                next_label = lines[i + 1].strip().split().pop(0)
            else:
                next_label = "T"

            gold.append(tag)
            data.append((FreqDist(tokens), tag, next_label))

    limit = int(float(len(data)) * 0.8)

    # training set: bags-of-words and tag tuples
    train = [(bow, tag) for bow, tag, next_label in data[:limit]]
    # training the classifier
    classifier = SklearnClassifier(MultinomialNB()).train(train)

    results = {
        "segmented": [],
        "unsegmented": []
    }

    all_choices = [] # all choices made
    choices = [] # choices for the current segment
    nb = 1 # number of lines in the segment
   
    for i, (bow, tag, next_label) in enumerate(data[limit:]):
        # bow classification
        choice = classifier.classify(bow)
        choices.append(choice)
        all_choices.append(choice)

        # line by line classification for unsegmented results
        results["unsegmented"].append(choice)

        # more complex classification for segmented results
        if next_label == "T":
            most_common = Counter(choices).most_common()

            if len(most_common) > 1:
                tf = FreqDist(all_choices)
                vote = most_common[0]
                best = 1

                for candidate, occ in most_common:
                    if tf[candidate] > best:
                        vote = candidate
                        best = tf[candidate]
            else:
                vote, occ = most_common[0]

            results["segmented"] += [vote for choice in choices]
            choices = []
            nb = 1
        else:
            nb += 1 # incrementing the current number of lines in the bag
  
    for i, label in enumerate(gold[limit:]):
        bow, tag, next_label = data[i + limit]
        print("# {0}\t{1}\t{2}".format(label, results["unsegmented"][i], results["segmented"][i]))
        if next_label == "T":
            print("# ------------------")

    # segmented metrics
    sp = metrics.precision_score(gold[limit:], results["segmented"])
    sr = metrics.recall_score(gold[limit:], results["segmented"])
    sf = (2.0 * (sr * sp)) / (sr + sp)

    # unsegmented metrics
    up = metrics.precision_score(gold[limit:], results["unsegmented"])
    ur = metrics.recall_score(gold[limit:], results["unsegmented"])
    uf = (2.0 * (ur * up)) / (ur + up)

    print("#")
    print("#                Pre.:\t\tRec:\t\tF1:")
    print("# segmented:     {0}%\t\t{1}%\t\t{2}%".format(dec(sp * 100), dec(sr * 100), dec(sf * 100)))
    print("# non-segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(up * 100), dec(ur * 100), dec(uf * 100)))
Example #41
#   academic institution
#
import csv
import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = [('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=20)),
            ('nb', MultinomialNB())]
classif = SklearnClassifier(Pipeline(pipeline))

# tokenise on runs of word characters; this drops punctuation
tokenizer = RegexpTokenizer(r'[\w\d]+')

training_set = []
with open('train_jlm.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if row[0] != 'OrganisationId': # header
            words = tokenizer.tokenize(row[1])
            if row[4] == 'Academic':
                training_set.append((words, 'academic'))
            else:
                training_set.append((words, 'private'))
    """
	Linear (Bernoulli) SVC
	Implementation of Support Vector Machine classifier using libsvm: 
	the kernel can be non-linear but its SMO algorithm does not scale to
	 large number of samples as LinearSVC does.
	"""

    from nltk.classify import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC

    print " "
    print "============================="
    print "Bernoulli SVC Classifier:"
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    classifierBi.classify_many(test)

    for pdist in classifierBi.prob_classify_many(test):
        print pdist.prob("human"), pdist.prob("auto")

    for i in range(len(classifierBi.classify_many(test))):
        print classifierBi.classify_many(test)[i]

    classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
    classifierSVC.classify_many(test)

    # svc = nltk.classify.accuracy(classifierSVC, test_set)
    # print 'accuracy is %.2f' %round(svc*100,4), '%'
    def SVC():
        classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
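Note: as the docstring above says, SVC with a linear kernel and LinearSVC fit essentially the same decision function, but LinearSVC's liblinear backend scales much better with sample count. The two setups side by side (sketch):

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC, LinearSVC

libsvm_clf = SklearnClassifier(SVC(kernel='linear'))  # libsvm SMO; slows sharply on big data
liblinear_clf = SklearnClassifier(LinearSVC())        # liblinear; near-linear in sample count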
Example #43
    #region SVMClassifier
    WriteLog("\nEntering SVM", ClassificationLogFile)
    trainD = list()
    testD = list()
    gTruth = list()

    #Formatting the Data
    for dictPair in training_set:
        trainD.append(dictPair)
    for dictPair in testing_set:
        testD.append(dictPair[0])
        gTruth.append(dictPair[1])

    WriteLog("Starting SVM Training", ClassificationLogFile)
    SVMClassifier = SklearnClassifier(SVC(), sparse=False).train(trainD)
    SVMPredictions = SVMClassifier.classify_many(testD)

    WriteLog("SVM Training Set Accuracy:", ClassificationLogFile)
    WriteLog(str(accuracy_score(gTruth, SVMPredictions, normalize=True, sample_weight=None)), ClassificationLogFile)

    #SVM Classification
    WriteLog("SVM Classification", ClassificationLogFile)
    DoClassify(SVMClassifier, SVMtopicResultsTxt, topicTweetsLDATxt)

    #SVM Predictions
    WriteLog("SVM Predictions:", ClassificationLogFile)
    WriteLog(SVMPredictions, ClassificationLogFile)
    #endregion

    #region NaiveBayes
Example #44
File: classify.py Project: scuellar/COS424
import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#pipeline = Pipeline([('tfidf', TfidfTransformer()),
#                     ('chi2', SelectKBest(chi2, k=1000)),
#                     ('nb', MultinomialNB())])
#classif = SklearnClassifier(pipeline)
classif = SklearnClassifier(MultinomialNB())

add_label = lambda lst, lab: [(x, lab) for x in lst]

import justTry

all_w, per = justTry.getWords(0)

print(len(per[0]), len(per[1]), len(per[2]), len(per[3]), len(per[4]))

train1 = (9 * len(per[0])) // 10
train2 = (9 * len(per[1])) // 10
train3 = (9 * len(per[2])) // 10
train4 = (9 * len(per[3])) // 10
train5 = (9 * len(per[4])) // 10

ones = [FreqDist(x) for x in per[0]]
twos = [FreqDist(x) for x in per[1]]
threes = [FreqDist(x) for x in per[2]]
Example #45
from awesome_print import ap 
from nltk import NaiveBayesClassifier
from nltk.util import ngrams
from nltk.metrics import scores

from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)

def extract_featurelabel(student):
	features = {}
	bigrams = ngrams(student['Student Comment'],2)
	for word in student['Student Comment']:
		features['contains (%s)'%word] = word
	for bigram in bigrams:
		features['contains (%s)'%(' '.join(bigram))] = ' '.join(bigram)

	return (features,student['Physician Comment'])
	#return (features,find_student_grade(student['Name']))

def find_student_grade(name): 
	#Assumes that a dictionary with each student's name and grade has been created
	#Only look for last name because the first name was not recorded for all students
Example #46
 def NBtfidf():
     classifierTF = SklearnClassifier(pipeline).train(train_set)
     return classifierTF.classify_many(test)
Example #47
 def LinSVC():
     classifierLinSVC = SklearnClassifier(LinearSVC(), sparse=False).train(train_set)
     return classifierLinSVC.classify_many(test)
Example #48
tweets = []
stop_words = set(stopwords.words('english'))

for (words, sentiment) in train:
    words_filtered = [e.lower() for e in words.split() if e not in stop_words]
    tweets.append((words_filtered, sentiment))

# print tweets
# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)

training_set = traindict(tweets)
print(training_set)

# classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)

tweetd = 'I have cows :('
print(classifier.classify(dict(Counter(clean(tweetd.lower())))))



# tweetd = 'Obama is boring :('
# print classifier.classify(extract_features(tweetd.lower().split()))
Example #49
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    
    print('Reading Tweets\n')
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except Exception:
            continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text','') for tweet in tweets_data]
    
    tdata = tweets['text']
    negfeats = [(featx(f), 'neg') for f in word_split(tdata)]
    testfeats = negfeats

    print(np.shape(testfeats))
    #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    #print np.shape(testfeats)
    
    
    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']     
        
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None,  gaussian_prior_sigma=0, max_iter = 1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)
            
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
 
        for i, (feats, label) in enumerate(testfeats):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)

        print(testsets[observed])

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        #pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        #pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        #pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        #neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        #neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        #neg_fmeasure =  nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
        
        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
Example #50
import numpy
import scipy

from nltk.classify import maxent
nltk.classify.MaxentClassifier.ALGORITHMS
# ['GIS','IIS','CG','BFGS','Powell','LBFGSB','Nelder-Mead','MEGAM','TADM']

# MEGAM or TADM are not rec'd for text classification
mec = nltk.classify.MaxentClassifier.train(train_features, 'GIS', trace=0, max_iter=1000)

from sklearn import cross_validation
cv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None)

for traincv, evalcv in cv:
    classifier = nltk.NaiveBayesClassifier.train(train_features[traincv[0]:traincv[len(traincv) - 1]])
    print('accuracy: %.3f' % nltk.classify.util.accuracy(classifier, train_features[evalcv[0]:evalcv[len(evalcv) - 1]]))



import sklearn
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=2000)),
                     ('nb', MultinomialNB())])
pipecl = SklearnClassifier(pipeline)
pipecl.train(train_features)
Example #51
def find_feature(document):
    words = set(document)
    feature = {}
    for w in words_feature:
        feature[w] = (w in words)

    return feature


features = [(find_feature(rev), category) for (rev, category) in documents]

testing_set = features[1900:]
training_set = features[:1900]

if not os.path.isfile(naivebayes):
    classifier = nltk.NaiveBayesClassifier.train(training_set)

    save_classifier = open(naivebayes, "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
else:
    classifier_f = open(naivebayes, "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()

print("Original Naive Bayes Classifier accuracy precent:", (nltk.classify.accuracy(classifier, testing_set) * 100))

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Multinomial Naive Bayes Classifier accuracy precent:", (nltk.classify.accuracy(classifier, testing_set) * 100))
Example #52
pairs = [(classifier.classify(example), actual)
            for (example, actual) in test_set]

do_evaluation(pairs)
do_evaluation(pairs, pos_cls='neg')

#%% Other classifier : SVM ###################################################

# http://www.nltk.org/howto/classify.html
# Run example

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

t0 = time.time()
classif = SklearnClassifier(SVC(), sparse=False).train(train_set)
print(round(time.time()-t0,2))

classif.classify_many(test_set[0][0])

sizeTrain = [800]  # the first 100, the first 300 ,etc
testDoc = [800, 1000] # 800 to 999

classif.classify_many(test_set[0][0])

#%% SVM Class ################################################################

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
    
class SVM:
Example #53
#neucutoff = len(neufeats)*4/5
length = 4
cutoff0 = len(feats0) * length // 5
cutoff1 = len(feats1) * length // 5
cutoff2 = len(feats2) * length // 5
cutoff3 = len(feats3) * length // 5
cutoff4 = len(feats4) * length // 5

trainfeats = feats0[:cutoff0] + feats1[:cutoff1] + feats2[:cutoff2] + feats3[:cutoff3] + feats4[:cutoff4] 
testfeats = feats0[cutoff0:] + feats1[cutoff1:] + feats2[cutoff2:] + feats3[cutoff3:] + feats4[cutoff4:]

print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
 
#classifier = NaiveBayesClassifier.train(trainfeats)
#classifier = nltk.classify.DecisionTreeClassifier.train(trainfeats)
classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
#classifier.show_most_informative_features()

results = classifier.classify_many([fs for (fs, l) in testfeats])

count = 0
'''
with open(loc_submission, "wb") as outfile:
    outfile.write("PhraseID,Sentiment\n")
    for val in results:
      outfile.write("%s,%s\n"%(df_test['PhraseId'][count],val))
      count += 1
'''

Example #54
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    inposFeatures = []
    innegFeatures = []
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input
    # mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    """
    with open(RT_INPUT_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            inposWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            inposWords = [feature_select(inposWords), 'pos']
            inposFeatures.append(inposWords)
    """
    with open(RT_INPUT_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            innegWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            innegWords = [feature_select(innegWords), 'neg']
            innegFeatures.append(innegWords)
   
    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    # posCutoff = int(math.floor(len(posFeatures)*3/4))
    # negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures + negFeatures
    testFeatures = innegFeatures  # + inposFeatures

    # trains a Naive Bayes classifier
    classifier = SklearnClassifier(BernoulliNB()).train(trainFeatures)
    # classifier = SklearnClassifier(SVC(probability=True), sparse=False).train(trainFeatures)
    
    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    fileOutput ={'key':[],'pos':[],'neg':[]}
    # puts correctly labeled sentences in referenceSets and the predictively
    # labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        # print(features, label)
        referenceSets[label].add(i)
        # prob_classify_many expects a list of featuresets
        predicted = classifier.prob_classify_many([features])
        print("\n")
        # print(predicted)
        for item in predicted:
            fileOutput['key'].append(i)
            fileOutput['pos'].append(item.prob("pos"))
            fileOutput['neg'].append(item.prob("neg"))
        # posValues = predicted.prob("pos")
        # negValues = predicted.prob("neg")
        # testSets[predicted].add(i)
        # print(i)
        # print(testSets[predicted])
    return fileOutput
Example #55
 def train(self, features_label):
     svm = SklearnClassifier(SVC(C=1000.0, gamma=0.0001))
     self._classifier = svm.train(features_label)
     return None