def retrieveYoutube(category):
    multipleID = ""
    json_vec = []
    # First request: search for videos by keyword.
    get_youtube = requests.get(
        'http://127.0.0.1:5100/api/youtube-videos?category=' + category + '&count=10')
    response_json = get_youtube.json()
    for line in response_json:
        multipleID += line['videoId'] + ','
    multipleID = multipleID[:-1]
    # Second request: get statistics for the videos using the comma-separated IDs.
    get_youtube = requests.get(
        'http://127.0.0.1:5100/api/youtube-multipleID?multipleID=' + multipleID)
    response_json = get_youtube.json()
    for json_obj in response_json:
        # Third request: fetch comments for each video and classify them.
        comments_response = requests.get(
            'http://127.0.0.1:5100/api/youtube-comments?videoID=' + json_obj['videoId'] + '&count=10')
        json_data = comments_response.json()
        for c in json_data:
            comment = processVec(c['comment'].lower())
            sentiment = classifier.classify(word_feature_vec(comment))  # 'positive' or 'negative'
            c['sentiment'] = sentiment
            c['videoTitle'] = json_obj['title']
            prob_dist = getProbabilityDist(comment)
            c['probability'] = prob_dist
            json_vec.append(c)
    return json_vec
def main():
    start = time.time()
    print('--- Reading and processing training data ---')
    # movie_vec = read_movie('dataset/training_movie.tsv')
    twitter_vec1 = read_twitter1('dataset/train.csv')
    twitter_vec2 = read_twitter2('dataset/training_twitter.csv')
    total_vec = twitter_vec1 + twitter_vec2
    pos_vec = divide_by_polarity(total_vec, 'positive')
    neg_vec = divide_by_polarity(total_vec, 'negative')
    end = time.time()
    print('Preprocessing training data took: %f seconds' % (end - start))

    print('\n--- Training classifier ---')
    start = time.time()
    pos_feats = [(word_feature_vec(f), 'positive') for f in pos_vec]
    neg_feats = [(word_feature_vec(f), 'negative') for f in neg_vec]
    trainfeats = pos_feats + neg_feats
    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    end = time.time()
    print('Training classifier took: %f seconds' % (end - start))

    print('\n--- Reading test data ---')
    start = time.time()
    test_vec = read_twitter2('dataset/testing_twitter.csv')
    pos_vec = divide_by_polarity(test_vec, 'positive')
    neg_vec = divide_by_polarity(test_vec, 'negative')
    end = time.time()
    print('Reading test data took: %f seconds' % (end - start))

    print('\n--- Preprocessing test data ---')
    pos_feats = [(word_feature_vec(f), 'positive') for f in pos_vec]
    neg_feats = [(word_feature_vec(f), 'negative') for f in neg_vec]
    testfeats = pos_feats + neg_feats
    print("Classifier accuracy: ", nltk.classify.util.accuracy(classifier, testfeats))

    # Sanity-check the classifier on a few hand-written example sentences.
    text_example = ["I hate terror",
                    "Tesla is the f*****g best",
                    "Spotify is a sinking ship",
                    "Stefan Lofven has very good communication skills",
                    "F*****g hell I don't want another exam",
                    "I haven't got my results from the exam yet...."]
    for line in text_example:
        line = line.lower()
        wordvec = processVec(line)
        print(line + " : " + classifier.classify(word_feature_vec(wordvec)))
    classifier.show_most_informative_features(50)

    print('\n--- Saving model to pickle ---')
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
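
# The retrieve* functions below classify with a module-level `classifier`.
# A minimal sketch of how it could be restored from the pickle written by
# main(); this is an assumption about how the module is wired together, and
# the real project may load the model elsewhere (e.g. in the API server).
try:
    with open("naivebayes.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)
except FileNotFoundError:
    classifier = None  # run main() first to train and pickle the model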
def retrieveReddit(category):
    json_vec = []
    # Request Reddit comments for the given category.
    get_reddit = requests.get(
        'http://127.0.0.1:5100/api/reddit-comments?category=' + category + '&count=100')
    response_json = get_reddit.json()
    for json_obj in response_json:
        comment = processVec(json_obj['comment'].lower())
        sentiment = classifier.classify(word_feature_vec(comment))  # 'positive' or 'negative'
        json_obj['sentiment'] = sentiment
        prob_dist = getProbabilityDist(comment)
        json_obj['probability'] = prob_dist
        json_vec.append(json_obj)
    return json_vec
def retrieveTwitter(category):
    json_vec = []
    # Request tweets for the given category.
    get_twitter = requests.get(
        'http://127.0.0.1:5100/api/tweets?category=' + category + '&count=200')
    response_json = get_twitter.json()
    for json_obj in response_json:
        # Rename 'tweetText' to 'comment' so the output matches the other sources.
        json_obj['comment'] = json_obj.pop('tweetText')
        tweet = clean_tweet(json_obj['comment'])
        sentiment = classifier.classify(word_feature_vec(tweet))  # 'positive' or 'negative'
        json_obj['sentiment'] = sentiment
        prob_dist = getProbabilityDist(tweet)
        json_obj['probability'] = prob_dist
        json_vec.append(json_obj)
    return json_vec
def getProbabilityDist(comment):
    # Return [P(positive), P(negative)] for a preprocessed comment.
    prob_dist = classifier.prob_classify(word_feature_vec(comment))
    return [prob_dist.prob('positive'), prob_dist.prob('negative')]
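
# Illustrative entry point (a sketch, not part of the original module):
# train and pickle the classifier, reload it, and then, assuming the local
# API on port 5100 is running, classify Reddit comments for an example
# category. 'technology' is a hypothetical category name, not defined above.
if __name__ == '__main__':
    main()
    with open("naivebayes.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)
    for item in retrieveReddit('technology'):
        print(item['sentiment'], item['probability'], item['comment'][:60])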