Example #1
# assumed imports: NLTK tokenization/stemming/stopwords plus the featx
# bag_of_words helper used throughout these examples; onlyDigits() and
# high_information() are local helpers defined elsewhere in the source module
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from featx import bag_of_words


def get_feats(times_of_day, data_set):
    """Reads the golden standard CSV file and puts the content in bags of words"""
    print("\n##### Reading {} files...".format(data_set))
    feats = list()
    stemmer = SnowballStemmer("dutch")
    stop_words = set(stopwords.words('dutch'))

    for item in times_of_day:
        c = 0
        tokenizer = TweetTokenizer(preserve_case=False)
        with open(item + '/' + data_set + '.txt') as tweets_txt:
            for line in tweets_txt:
                c += 1
                tokens = tokenizer.tokenize(line)

                # drop stopwords and purely numeric tokens
                filtered_tokens = [
                    w for w in tokens
                    if w not in stop_words and not onlyDigits(w)
                ]
                #chars = [w for i in filtered_tokens for w in i]
                #bigrams = ngrams(filtered_tokens,2)
                bag = bag_of_words(filtered_tokens)
                feats.append((bag, item))
                #if c == 5000:
                #break

        print("{} {} tweets read".format(c, item))

    # return high_information(feats, times_of_day)
    return feats
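For reference, get_feats() depends on two helpers that are not shown in this snippet: bag_of_words() (imported from featx in Example #13) and onlyDigits(), a local predicate. A minimal sketch of what they are assumed to do (the bag_of_words() version follows the featx convention; onlyDigits() is a hypothetical reconstruction):

def bag_of_words(words):
    # featx-style feature dict: every token maps to True
    return dict((word, True) for word in words)


def onlyDigits(token):
    # assumed local helper: True if the token consists of digits only
    return token.isdigit()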
def read_files():
    feats = list()
    with open('OnionOrNot.csv', 'r', encoding='UTF-8') as infile:
        lines = infile.readlines()
    for line in lines:
        # each line ends with ",<label>\n"; strip that suffix to get the text
        data = line[:-3].lower()
        category = line[-2]
        if category == '0' or category == '1':
            tokens = word_tokenize(data)

            # stemming
            # stemmer = SnowballStemmer("english")

            # punctuation removal
            # tokens = [stemmer.stem(filteredtoken) for filteredtoken in tokens]
            # punct = set(string.punctuation)
            # for item in tokens:
            #     if item in punct:
            #         tokens.remove(item)
            #     else:
            #         pass

            # stopwords removal
            # tokens = bag_of_non_stopwords(tokens)

            feats.append((bag_of_words(tokens), category))
        else:
            pass
    return feats
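The slice-based parsing above assumes every line ends with ",<label>". A less fragile sketch using the standard csv module (an alternative for illustration, not the original author's approach; read_files_csv is a hypothetical name, and the file is assumed to hold a header row followed by text,label rows):

import csv
from nltk.tokenize import word_tokenize
from featx import bag_of_words


def read_files_csv():
    feats = []
    with open('OnionOrNot.csv', newline='', encoding='UTF-8') as infile:
        for row in csv.reader(infile):
            # keep only rows whose second column is a 0/1 label (skips the header)
            if len(row) == 2 and row[1] in ('0', '1'):
                text, label = row
                feats.append((bag_of_words(word_tokenize(text.lower())), label))
    return feats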
Example #3
# assumed imports (the featx helpers are the same ones imported in Example #13)
import sys
import pickle

from nltk.classify.util import accuracy
from nltk.corpus import movie_reviews
from featx import bag_of_words, label_feats_from_corpus, split_label_feats


def main():
    '''
    Main function of the boilerplate code; it is the entry point of the
    'chitragoopt' executable script (defined in setup.py).

    Use doctests, they are very helpful.

    >>> main()
    Hello
    >>> 2 + 2
    4
    '''

    lfeats = label_feats_from_corpus(movie_reviews)
    train_feats, test_feats = split_label_feats(lfeats, split=0.75)
    # nb_classifier = NaiveBayesClassifier.train(train_feats)
    print(sys.argv[1].split())
    negfeat = bag_of_words(sys.argv[1].split())

    f = open('my_classifier.pickle', 'rb')  # pickled classifiers must be opened in binary mode
    nb_classifier = pickle.load(f)
    f.close()
    print(accuracy(nb_classifier, test_feats))
    print(nb_classifier.classify(negfeat))

    for x in range(0, 50):
        print(nb_classifier.classify(negfeat))
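main() assumes a classifier has already been trained and saved to my_classifier.pickle. A minimal sketch of how that file could be produced, reusing the movie_reviews pipeline from the other examples (an assumption; the original training script is not shown):

import pickle

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from featx import label_feats_from_corpus, split_label_feats

lfeats = label_feats_from_corpus(movie_reviews)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
nb_classifier = NaiveBayesClassifier.train(train_feats)

# persist the trained classifier so main() can load it later
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(nb_classifier, f)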
Example #4
def read_files(categories, stopwords):
    feats = list()

    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder(category)
        num_tweets = 0
        for f in files:
            with open("{}/{}".format(category, f),
                      encoding='Latin-1') as csvfile:
                csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
                for tweet in csv_reader:
                    tweet_body = tweet[4]
                    clean = []
                    tokens = word_tokenize(tweet_body)

                    # lowercase every token and drop stopwords and punctuation
                    for token in tokens:
                        token = token.lower()
                        if token in stopwords or token in punct:
                            continue
                        clean.append(token)

                    bag = bag_of_words(clean)
                    feats.append((bag, category))
                    num_tweets += 1

        print("  Category {}, amount of tweets ={}".format(
            category, num_tweets))

    print("  Total, %i files read" % (len(feats)))
    return feats
Example #5
def read_files(categories, stopwords):
    train_feats = []
    test_feats = []

    for category in categories:
        files = get_filenames_in_folder('Raad/' + category)
        feats = []
        for f in files:
            with open('Raad/' + category + '/' + f, 'r',
                      errors="ignore") as infile:
                data = infile.read()
            data = data.lower()
            # replace punctuation, backslashes and newlines with spaces
            for ch in '!"(),?:.\\;\n':
                data = data.replace(ch, " ")

            # remove stopwords (match whole words only)
            for word in stopwords:
                pattern = re.compile(r'\s*\b{}\b\s*'.format(re.escape(word)))
                data = pattern.sub(' ', data)

            tokens = word_tokenize(data)
            bag = bag_of_words(tokens)
            feats.append((bag, category))

        split = 0.9
        cutoff = int(len(feats) * split)
        train, test = feats[:cutoff], feats[cutoff:]
        train_feats = train_feats + train
        test_feats = test_feats + test

        #print ("  Category %s, %i files read" % (category, num_files))

    return train_feats, test_feats
def bigram(feats):
    bigrams = []
    for feat in feats:
        bigramsandwords = bag_of_bigrams_words(list(feat[0].keys()))
        bigram = bag_of_words_not_in_set(bigramsandwords, list(feat[0].keys()))
        cleanbigram = list(' '.join((a, b)) for a, b in bigram)
        bigrams.append(bag_of_words(cleanbigram))
    return bigrams
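bag_of_bigrams_words() and bag_of_words_not_in_set() are featx helpers that are not shown here. A minimal sketch of how they are assumed to behave, following the NLTK 3 Cookbook featx module (the project's actual implementation may differ):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def bag_of_words_not_in_set(words, badwords):
    # keep only the items that are not in badwords
    return bag_of_words(set(words) - set(badwords))


def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # bag containing the original words plus the top-n scoring bigrams
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)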
def NaiveBayes(review_set, csvFilePathNltk, chIndex):
    print('[' + time.strftime("%H:%M:%S") +
          ']$ Starting NLTK Naive Bayes Classifier Sentiment Analyzer...')

    movie_reviews.categories()

    lfeats = label_feats_from_corpus(movie_reviews)
    lfeats.keys()
    train_feats, test_feats = split_label_feats(lfeats, split=0.75)
    # sklearn's cross_validation module was removed in scikit-learn 0.20;
    # KFold now lives in sklearn.model_selection and yields index arrays via .split()
    from sklearn.model_selection import KFold
    cv = KFold(n_splits=10, shuffle=True, random_state=None)
    for traincv, evalcv in cv.split(train_feats):
        nb_classifier = NaiveBayesClassifier.train(
            [train_feats[i] for i in traincv])
        save_classifier = open("Classifier-CV.pickle", "wb")
        pickle.dump(nb_classifier, save_classifier)
        save_classifier.close()

    print('[' + time.strftime("%H:%M:%S") +
          ']$ Generating NLTK sentiment analysis based on ' +
          bookList[chIndex - 1] + '\'s ' + str(len(review_set)) +
          ' review(s).')
    sentiment_set = []
    for review in review_set:
        filtered_review = review
        for word in sw:
            filtered_review = filtered_review.replace(" " + word + " ", " ")

        diff_sw = len(filtered_review) / len(review)
        feats = bag_of_words(word_tokenize(filtered_review))
        sentiment = nb_classifier.classify(feats)
        probs = nb_classifier.prob_classify(feats)
        pos_prob = round(probs.prob('pos'), 4)
        neg_prob = round(probs.prob('neg'), 4)
        neu_prob = round(pos_prob - neg_prob, 4)
        pct_red = round(100 - (100 * diff_sw), 2)

        sentiment_set.append([sentiment, pos_prob, neg_prob, review[:-1]])

    if (chIndex != 10):
        print('[' + time.strftime("%H:%M:%S") +
              ']$ Writing NLTK results file (csv) at ' + csvFilePathNltk)
        generateCSV(sentiment_set, csvFilePathNltk)
        print('[' + time.strftime("%H:%M:%S") +
              ']$ Results file (csv) successfully generated at ' +
              csvFilePathNltk)
    else:
        print(sentiment, pos_prob, neg_prob, neu_prob, filtered_review,
              pct_red)

    print('[' + time.strftime("%H:%M:%S") +
          ']$ Finished NLTK Naive Bayes Classifier Sentiment Analyzer!')
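NaiveBayes() relies on module-level names defined elsewhere: sw (a stopword list), bookList (book titles), and generateCSV(). A minimal sketch of what generateCSV() might look like, inferred from the structure of sentiment_set (a hypothetical reconstruction, not the original helper):

import csv


def generateCSV(sentiment_set, csv_file_path):
    # one row per review: sentiment label, pos/neg probabilities, review text
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['sentiment', 'pos_prob', 'neg_prob', 'review'])
        writer.writerows(sentiment_set)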
def read_files(categories):
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder('Volkskrant/' + category)
        num_files = 0
        for f in files:
            # Python 3: open with an explicit encoding rather than calling .decode() on a str
            with open('Volkskrant/' + category + '/' + f, 'r',
                      encoding='utf-8') as infile:
                data = infile.read()
            tokens = word_tokenize(data)
            bag = bag_of_words(tokens)
            feats.append((bag, category))
            #print len(tokens)
            num_files += 1
#			if num_files >= 50:  # you may want to un-comment this and the next line when testing (it loads only N documents instead of the whole collection, so it runs faster)
#				break

        print("  Category %s, %i files read" % (category, num_files))

    return feats
def read_files():
	feats = list ()
	print("\n##### Reading files...")
	song_data = pickle.load(open('final_songdata2.pickle','rb'))
	files = get_filenames_in_folder('lyric_files')
	num_files=0
	for f in files:
		with open('lyric_files/' + f, 'r') as infile:
			data = infile.read()
		song_id = int(f.split(".")[0])
		songwriter = song_data[song_id][2]

		# Remove all punctuation
		customTokenizer = RegexpTokenizer(r'\w+')
		tokens = customTokenizer.tokenize(data)

		# Lowercase everything
		tokens = [t.lower() for t in tokens]
		bag = bag_of_words(tokens)
		feats.append((bag, songwriter))
		num_files+=1

	return feats
Example #10
def read_files(categories):
	feats = list ()
	print("\n##### Reading files...")
	for category in categories:
		files = get_filenames_in_folder('Volkskrant/' + category)
		num_files=0
		for f in files:

			with open('Volkskrant/' + category + '/' + f, 'r', encoding='UTF-8') as infile:
				data = infile.read()

			tokens = word_tokenize(data)
			tokens = [token.lower() for token in tokens]
			# keep only alphabetic tokens; removing items from a list while
			# iterating over it skips elements, so filter with a comprehension
			# instead (this also drops punctuation and digits)
			tokens = [token for token in tokens if token.isalpha()]


			bag = bag_of_words(tokens)

			feats.append((bag, category))
			#print len(tokens)
			num_files+=1
			#if num_files >= 50:  # you may want to un-comment this and the next line when testing (it loads only N documents instead of the whole collection, so it runs faster)
			#	break

		
		print ("  Category %s, %i files read" % (category, num_files))

	print("  Total, %i files read" % (len(feats)))

	return feats
Example #11
# assumed imports for this snippet (the same ones spelled out in Example #13)
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from featx import bag_of_words, label_feats_from_corpus, split_label_feats

print(movie_reviews.categories())
# ['neg', 'pos']

lfeats = label_feats_from_corpus(movie_reviews)

print(lfeats.keys())
# dict_keys(['neg', 'pos'])

train_feats, test_feats = split_label_feats(lfeats, split=0.75)
print(len(train_feats))

print(len(test_feats))

nb_classifier = NaiveBayesClassifier.train(train_feats)
print(nb_classifier.labels())

negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print(nb_classifier.classify(negfeat))

posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
print(nb_classifier.classify(posfeat))

print(accuracy(nb_classifier, test_feats))

probs = nb_classifier.prob_classify(test_feats[0][0])
print(probs.samples())

print(probs.max())

print(probs.prob('pos'))

print(probs.prob('neg'))
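label_feats_from_corpus() and split_label_feats() come from featx. A minimal sketch of how they are assumed to behave, following the NLTK 3 Cookbook featx module and the bag_of_words() helper sketched after Example #1 (the project's actual code may differ):

import collections


def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    # map each corpus category to a list of bag-of-words feature dicts
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            label_feats[label].append(feature_detector(corp.words(fileids=[fileid])))
    return label_feats


def split_label_feats(lfeats, split=0.75):
    # split each label's feature list into labelled train/test tuples
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend((feat, label) for feat in feats[:cutoff])
        test_feats.extend((feat, label) for feat in feats[cutoff:])
    return train_feats, test_feats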
def read_files(pre_processing=False, ner_tags=False, add_ngrams=False, train_data=True):
	""" Reads the files for each category (asshole and not_asshole). """
	feats = list()
	print("\n##### Reading files...")
	stop_words = set(stopwords.words('dutch'))

	# load in ner tagger
	ner_list = list()
	if ner_tags:
		nlp = spacy.load("nl_core_news_sm")

	# ngrams list
	ngrams_list = list()

	# load train data
	if train_data:
		file = "Dutch_Abusive_Language_Corpus_Train.tsv"
	# load test data
	else:
		file = "Dutch_Abusive_Language_Corpus_Test.tsv"

	with open(file, "r", encoding="utf-8") as f:
		data = f.readlines()

	print("Loaded", str(len(data)), "tweets")

	for line in data:
		line_split = line.split("\t")
		tweet_text = line_split[1].strip().lower()
		tweet_label = line_split[-1].strip()
		tokens = word_tokenize(tweet_text)

		print(tweet_label, line_split[-2].strip(), tweet_text)

		if tweet_label != "NA":
			# get ner tags for text
			ner_set = set()
			if ner_tags:
				parsed_string = nlp(tweet_text)
				for token in parsed_string:
					if token.ent_type_ != "":
						ner_set.add(token.ent_type_)
				ner_list.append(ner_set)

			# Perform pre-processing to tweet
			if pre_processing:
				# lower and strip tokens
				new_tokens = list()
				for token in tokens:
					new_tokens.append(token.lower().strip())
				tokens = new_tokens

				# remove stopwords DEPRECATED
				'''
				new_tokens = list()
				for token in tokens:
					if token not in stop_words:
						new_tokens.append(token)
				tokens = new_tokens
				'''

			# get n-grams for text
			ngrams_set = set()
			if add_ngrams:
				ngrams_object = ngrams(tokens, 2)
				for grams in ngrams_object:
					ngrams_set.add(" ".join(grams))
				ngrams_list.append(ngrams_set)

			# Turn tokens into a bag of words
			bag = bag_of_words(tokens)
			feats.append((bag, tweet_label))

	print("Using", str(len(feats)), "tweets")

	return feats, ner_list, ngrams_list
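A hypothetical usage of the loader above, based only on its signature (the flag values are assumptions for illustration):

train_feats, train_ner, train_ngrams = read_files(pre_processing=True,
                                                  add_ngrams=True,
                                                  train_data=True)
test_feats, test_ner, test_ngrams = read_files(train_data=False)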
Example #13
# !pip install nltk
import nltk
#nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

from featx import bag_of_words
from featx import label_feats_from_corpus
from featx import split_label_feats

lfeats = label_feats_from_corpus(movie_reviews)

train_feats, test_feats = split_label_feats(lfeats, split=0.75)

from nltk.classify import NaiveBayesClassifier

nb_classifier = NaiveBayesClassifier.train(train_feats)
print(nb_classifier.labels())

review1 = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print(nb_classifier.classify(review1))

review2 = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
print(nb_classifier.classify(review2))

from nltk.classify.util import accuracy

print(accuracy(nb_classifier, test_feats))
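A common follow-up (not part of the original snippet) is to inspect which words the classifier weighs most heavily:

# show the ten most informative features learned from the training data
nb_classifier.show_most_informative_features(10)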