    def train(self):
        # with open('pos_tweet.csv', encoding="utf8", mode='r') as pos_tweet:
        with open('datasets/pos_tweet.csv', mode='r') as pos_tweet:
            pos = csv.DictReader(pos_tweet, delimiter=',')
            for ptweet in pos:
                self.pos_tweets.append(
                    (bag_of_words(ptweet['tweet']), 'positive'))

        # with open('neg_tweet.csv', encoding="utf8", mode='r') as neg_tweet:
        with open('datasets/neg_tweet.csv', mode='r') as neg_tweet:
            neg = csv.DictReader(neg_tweet, delimiter=',')
            for ntweet in neg:
                self.neg_tweets.append(
                    (bag_of_words(ntweet['tweet']), 'negative'))

        # with open('neu_tweet.csv', encoding="utf8", mode='r') as neu_tweet:
        # with open('neu_tweet.csv', mode='r') as neu_tweet:
        #     neu = csv.DictReader(neu_tweet, delimiter=',')
        #     for neutweet in neu:
        #         self.neu_tweets.append((bag_of_words(neutweet['tweet']), 'neutral'))

        shuffle(self.pos_tweets)
        shuffle(self.neg_tweets)
        #shuffle(self.neu_tweets)
        self.all_train_set = self.pos_tweets + self.neg_tweets  # + self.neu_tweets
        trained = NaiveBayesClassifier.train(self.all_train_set)
        return trained
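`bag_of_words` is not shown in this example. NLTK's `NaiveBayesClassifier.train` expects a list of `(featureset, label)` pairs where each featureset is a dict, so a minimal stand-in could look like this (an assumed sketch, not the project's actual helper):

# Hypothetical featurizer: the {token: True} dict shape that NLTK's
# NaiveBayesClassifier consumes.
def bag_of_words(tweet):
    return {word: True for word in tweet.lower().split()}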
Example #2
def main(argv):
    f = False
    fs = ''
    ds = ''
    sample_size = 0
    try:
        opts, args = getopt.getopt(argv, "hf", ["sample=", "fs=", "ds="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt == '--ds':
            ds = arg
        elif opt == '--sample':
            sample_size = int(arg)
        elif opt == '--fs':
            fs = arg
        elif opt == '-f':
            f = True

    if ds == '1' and not f:
        dsb.generate_concat(sample_size)
    elif ds == '2' and not f:
        dsb.generate_one_question_per_line(sample_size)

    ds = '1' if ds == '' else ds
    if fs == '1':
        fe.bag_of_words(int(ds))
    elif fs == '2':
        fe.word2vec(int(ds))
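For reference, `getopt.getopt` returns the parsed `(option, value)` pairs plus any leftover positional arguments; an illustration with a made-up argv:

import getopt

# Illustrative only: how the option spec above parses a sample command line.
opts, args = getopt.getopt(['--ds', '1', '--sample', '100', '-f'],
                           "hf", ["sample=", "fs=", "ds="])
# opts == [('--ds', '1'), ('--sample', '100'), ('-f', '')], args == []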
Example #3
    def test_bag_of_words(self):
        test_corpus = [
            'the sky is blue.', 'sky is blue and sky is beautiful.',
            'the beautiful sky is so blue.', 'i love blue cheese.'
        ]

        features = [
            Counter({'blue': 1, 'sky': 1}),
            Counter({'sky': 2, 'blue': 1, 'beautiful': 1}),
            Counter({'beautiful': 1, 'blue': 1, 'sky': 1}),
            Counter({'blue': 1})
        ]
        feature_names = {'blue', 'beautiful', 'sky'}

        self.assertEqual(bag_of_words(test_corpus, num_feats=3),
                         (features, feature_names))
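The test pins down the expected contract: `bag_of_words(corpus, num_feats)` keeps the `num_feats` most frequent non-stopword tokens and returns per-document `Counter`s over them, together with the feature-name set. One sketch that satisfies this test, with an assumed tokenizer and stopword list:

import re
from collections import Counter

STOPWORDS = {'the', 'is', 'and', 'so', 'i'}  # assumed stopword list

def bag_of_words(corpus, num_feats):
    docs = [[w for w in re.findall(r'[a-z]+', text.lower()) if w not in STOPWORDS]
            for text in corpus]
    totals = Counter(w for doc in docs for w in doc)
    feature_names = {w for w, _ in totals.most_common(num_feats)}
    features = [Counter(w for w in doc if w in feature_names) for doc in docs]
    return features, feature_names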
Example #4
File: main.py  Project: pitpalac36/AI
def text_classification(mode):
    words_inputs, word_outputs = read_from_csv('reviews_mixed.csv')
    vocabulary = tokenize_sentences(words_inputs)
    bag = []
    for each in words_inputs:
        bag.append(bag_of_words(each, vocabulary))
    bag_train_input, bag_train_output, bag_test_input, bag_test_output = split_data(bag, word_outputs)

    if mode == 'kmeans':
        words_km = kmeans(k=2, similarity=jaccard_similarity, max_iterations=50)
        words_km.fit(bag_train_input)

        print(len(words_km.clusters[0]))
        print(len(words_km.clusters[1]))
        dictionary = {'first cluster': [], 'second cluster': []}
        for i in range(len(bag_test_input)):
            if words_km.centroids[0] == words_km.predictJaccard(bag_test_input[i]):
                dictionary['first cluster'].append(bag_test_output[i])
            else:
                dictionary['second cluster'].append(bag_test_output[i])
        print('FIRST CLUSTER :')
        for each in dictionary['first cluster']:
            print(each)
        print('SECOND CLUSTER :')
        for each in dictionary['second cluster']:
            print(each)
        print('Dunn index : ' + str(words_km.dunn_index()))


    if mode == 'log':
        classifier = MyLogisticRegression()
        numeric_output = [0 if bag_train_output[i] == 'negative' else 1 for i in range(len(bag_train_output))]
        classifier.fit(bag_train_input, numeric_output)
        accuracy = 0
        for i in range(len(bag_test_input)):
            if classifier.predictOneSample(bag_test_input[i]) > 0.5:
                computed = 'positive'
            else:
                computed = 'negative'
            real = bag_test_output[i]  # compare against the test labels, not the training ones
            if real == computed:
                accuracy += 1
                print("computed : " + computed + "     real : " + real)
            else:
                print("computed : " + computed + "     real : " + real + "   WRONG")
        error = 1 - (accuracy / len(bag_test_input))
        print("error : " + str(error))
Example #5

import argparse

def parse_args():
    # Minimal reconstruction of the clipped argument parser; flag names are
    # inferred from the args.* attributes used in the __main__ block below.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data")
    parser.add_argument("--feat")
    parser.add_argument("--classifier")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    print "Reading data..."
    titles, bodies, tags_sets, _ = da.read_data(args.data)
    tags = [list(t)[0] for t in tags_sets]

    X_train, X_test, y_train, y_test = evaluation.cross_validation(zip(titles, bodies), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "knn":
        classifier = KNeighborsClassifier(n_neighbors=3)
    elif args.classifier == "log-reg":
        classifier = LogisticRegression(C=1e5)
    elif args.classifier == "dec-tree":
        classifier = DecisionTreeClassifier()
    else:
        classifier = DecisionTreeClassifier()  # assumed default; the original branch was clipped here

Example #6

import argparse

def parse_args():
    # Minimal reconstruction, as in Example #5; this variant also takes a row
    # limit consumed as args.maxRows below.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data")
    parser.add_argument("--maxRows", type=int)
    parser.add_argument("--feat")
    parser.add_argument("--classifier")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    print "Reading data..."
    titles, bodies, tags_sets, _ = da.read_data(args.data, args.maxRows)
    tags = [list(t)[0] for t in tags_sets]

    X_train, X_test, y_train, y_test = evaluation.cross_validation(zip(titles, bodies), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "naive":
        classifier = MultinomialNB()

    classifier.fit(X, y_train)

    print "Test..."
    predictions = [classifier.predict(extractor.transform(t, b))[0] for t,b in X_test]
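    # Assumed follow-up (not in the original listing): score the predictions
    # against the held-out labels.
    accuracy = sum(p == y for p, y in zip(predictions, y_test)) / float(len(y_test))
    print("Accuracy: %.3f" % accuracy)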
Example #7

    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify the sentiment of a tweet
        '''
        tweet_set = bag_of_words(self.clean_tweet(tweet))
        return self.classifier.classify(tweet_set)
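`clean_tweet` is referenced but not shown; a common regex-based cleaner used with this kind of tweet classifier (a hypothetical stand-in for the class's real method):

import re

class SentimentAnalyzerSketch:  # hypothetical host class
    def clean_tweet(self, tweet):
        # Strip @mentions, URLs, and punctuation, then collapse whitespace.
        return ' '.join(re.sub(r"(@[A-Za-z0-9_]+)|(https?://\S+)|[^A-Za-z0-9\s]",
                               " ", tweet).split())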