Example #1
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')
textfile = open('NYTimesArticle.txt', mode='r')
allwords = textfile.read()
print(allwords)

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(allwords.lower())
print(tokens)

tokens = [token for token in tokens if token not in stopwords.words('english')]
print(tokens)

freq_dist = nltk.FreqDist(tokens)

print(freq_dist)
print(freq_dist.most_common(25))
freq_dist.plot(25)

print(documents[:5])

all_words = []
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

print(all_words[:10])

all_words = nltk.FreqDist(all_words)

#word_features = [w[0] for w in list(all_words.most_common(5000))]
word_features = list(all_words.keys())[:5000]

#print(word_features)


def find_features(document):
    words = set(word_tokenize(document))
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features
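
A minimal sketch of how feature dicts like these are typically consumed (an assumption based on the classic NLTK sentiment-classification tutorial this snippet resembles; documents is taken to be a list of (text, label) pairs, and the split point is arbitrary):

from nltk.tokenize import word_tokenize  # also needed by find_features above

# hypothetical continuation, not part of the original example
featuresets = [(find_features(text), label) for (text, label) in documents]
train_set, test_set = featuresets[:4000], featuresets[4000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("accuracy:", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(15)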
Example #3
dd = ', '.join(str(x) for x in g)
''.join(word_list)
str1 = ''.join(str(e) for e in word_list)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)
x = model.fit(matrix)
labels = x.labels_

lowercase = [x.lower() for x in word_list]
sents = lowercase
print(sents)

#wn.wup_similarity(sents, document)
from nltk.corpus import brown
freqs = nltk.FreqDist(w.lower() for w in sents)
print(freqs)

word_counter = {}
for word in dd.split(', '):  # iterate over the joined items, not the characters of the string
    if word in word_counter:
        word_counter[word] += 1
    else:
        word_counter[word] = 1
popular_words = sorted(word_counter, key=word_counter.get, reverse=True)
top_ = popular_words[:100]

print(top_)

vectorizer = TfidfVectorizer(stop_words='english')
print(
Example #4
def get_word_features(wordlist):
	wordlist = nltk.FreqDist(wordlist)
	word_features = wordlist.keys()
	return word_features
Example #5
File: ex3.py  Project: akotek/simple_nlp
def count_tokens(tokens, n=None):
    tokens = [t.lower() for t in tokens]
    freq_counter = nltk.FreqDist(tokens)
    return freq_counter.most_common(n)
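
For reference, an illustrative call (word_tokenize additionally needs the punkt tokenizer data, e.g. nltk.download('punkt'); the sample sentence is made up):

import nltk
from nltk.tokenize import word_tokenize

sample = "the cat sat on the mat and the cat slept"
print(count_tokens(word_tokenize(sample), n=3))  # e.g. [('the', 3), ('cat', 2), ('sat', 1)]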
Example #6
# Find most common suffixes
from nltk.corpus import brown
import nltk
from pprint import pprint

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
    suffix_fdist[word[-1:]] += 1

# Top 100 most common suffixes
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
pprint(common_suffixes)


# Create feature extraction function with common suffixes
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features


# Create Decision Tree Classifier to extract pos
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
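
The snippet stops at the train/test split; a minimal sketch of the decision-tree step that the comment above announces (small slices are used here purely for illustration, since training on the full Brown news section is slow):

classifier = nltk.DecisionTreeClassifier.train(train_set[:2000])
print(nltk.classify.accuracy(classifier, test_set[:500]))
print(classifier.classify(pos_features('cats')))  # likely 'NNS', depending on the training slice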
Example #7
# Transform the authors' corpora into lists of word tokens
federalist_by_author_tokens = {}
federalist_by_author_length_distributions = {}
for author in authors:
    tokens = nltk.word_tokenize(federalist_by_author[author])

    # Filter out punctuation
    federalist_by_author_tokens[author] = ([
        token for token in tokens if any(c.isalpha() for c in token)
    ])

    # Get a distribution of token lengths
    token_lengths = [
        len(token) for token in federalist_by_author_tokens[author]
    ]
    federalist_by_author_length_distributions[author] = nltk.FreqDist(
        token_lengths)
    federalist_by_author_length_distributions[author].plot(15, title=author)

# Who are the authors we are analyzing?
authors = ("Hamilton", "Madison")

# Lowercase the tokens so that the same word, capitalized or not,
# counts as one word
for author in authors:
    federalist_by_author_tokens[author] = ([
        token.lower() for token in federalist_by_author_tokens[author]
    ])
federalist_by_author_tokens["Disputed"] = ([
    token.lower() for token in federalist_by_author_tokens["Disputed"]
])
Example #8
def generate_data():
    vocabulary_size = 250
    unknown_token = "UNKNOWN_TOKEN"
    word_dim = 3

    print("Reading CSV file...")
    with open('raw_sentences.txt', 'r') as f:
        reader = csv.reader(f, skipinitialspace=True, delimiter='\n')
        # Split full comments into sentences
        sentences = itertools.chain(
            *[nltk.sent_tokenize(x[0].lower()) for x in reader])
        sentences = [' '.join(x.split()) for x in sentences
                     ]  # e.g. ['no , he says now .', 'and what did he do ?', ...]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words (split each sentence into individual word tokens)
    tokenized_sentences = [
        nltk.word_tokenize(sent) for sent in sentences
    ]  # [['no', ',', 'he', 'says', 'now', '.'],  ['and', 'what', 'did', 'he', 'do', '?'], ...]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique words tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(
        vocabulary_size -
        1)  # vocab <--- list  [('.', 80974), ('it', 29200), (',', 24583), ...]
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

    print("Using vocabulary size %d." % vocabulary_size)
    print(
        "The least frequent word in our vocabulary is '%s' and appeared %d times."
        % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [
            w if w in word_to_index else unknown_token for w in sent
        ]

    print("\nExample sentence: '%s'" % sentences[0])
    print("\nExample sentence after Pre-processing: '%s'" %
          tokenized_sentences[0])

    indexed_sentences = [[word_to_index[w] for w in sent[:-1]]
                         for sent in tokenized_sentences]

    all_data = []
    for sen in indexed_sentences:
        if len(sen) >= word_dim + 1:
            for i in range(len(sen) - word_dim):
                all_data.append(sen[i:i + word_dim + 1])
    all_data = np.array(all_data, dtype=np.int16)
    np.random.shuffle(all_data)
    data = {
        'data': all_data,
        'word_to_index': word_to_index,
        'index_to_word': index_to_word,
        'vocab': vocab
    }
    with open('data.pickle', 'wb') as outfile:
        pickle.dump(data, outfile)
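
A quick hypothetical check of the pickled output (file and key names are taken from the snippet; the shape comment follows from how all_data is built, with word_dim + 1 columns per window):

import pickle

with open('data.pickle', 'rb') as f:
    data = pickle.load(f)
print(data['data'].shape)          # (number_of_windows, word_dim + 1)
print(data['index_to_word'][:10])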
Example #9
def student_submitanswer(request):
    logger.debug("in student_submitanswer()")
    print("in student_submitanswer()")

    alternative = False
    alternative_accepted = False
    response_data = {'state': 'failure'}
    proceed_further = False
    student, res = getSpByRequest(request, 'login')
    proceed_further = False
    if not student and res:
        return res
    try:
        qid = request.POST.get('questionid')
        question = Question.objects.get(id=qid)
        stdanswer = question.stdanswer

        # get the alternative standard answer for the answer
        if question.alt_stdanswer:
            alt_stdanswer = question.alt_stdanswer
            alternative = True
        else:
            alt_stdanswer = None
    except (Exception) as e:
        print e
        logger.error("question %s not exists" % qid)
        print("question %s does not exists" % qid)
        return HttpResponse(simplejson.dumps(response_data),
                            mimetype="application/json")

    try:
        answer_html = request.POST.get('answer_html')
        logger.info('this is the answer: %s ' % answer_html)
        try:
            answer_html = answer_html.decode("utf8").encode('ascii', 'ignore')
        except:
            try:
                answer_html = answer_html.encode('ascii', 'ignore')
            except:
                pass
        anstext = stripHTMLStrings(strip_tags(answer_html))
        try:
            anstext = anstext.decode("utf8").encode('ascii', 'ignore')
        except:
            try:
                anstext = anstext.encode('ascii', 'ignore')
            except:
                import traceback
                traceback.print_exc()
        stuanswer = StudentAnswer.objects.filter(
            question=question, student=student).latest('timestamp')
        stuanswer.html_answer = answer_html
        stuanswer.save()
        print "----------------------------------------------------------------------"
        print anstext

        # stuanswer.html_answer = answer_html
        # stuanswer.txt_answer = anstext
        # stuanswer.save()
        print "answer saveddddddddddddddddddd      done"
    except Exception as e:
        import traceback
        print 111111111111111111111111111111, traceback.format_exc()
        traceback.format_exc()
        logger.error("cant find stuanswer for question %s" % question)
        logger.error(str(traceback.format_exc()))
        return HttpResponse(simplejson.dumps(response_data),
                            mimetype="application/json")

    try:
        thumbnail_ids = [
            int(i) for i in request.POST['stuthumbnail_ids'].split(',') if i
        ]
        print 'thumbnail_ids@@@@@@@@@ = ', thumbnail_ids
    except:
        import traceback
        traceback.format_exc()
        thumbnail_ids = []
        logger.debug("no img for question %s" % question)
        pass
        #stdanswer algorithm to mark stuanswer
    if not stdanswer or not stuanswer:
        return HttpResponse(simplejson.dumps(response_data),
                            mimetype="application/json")
    else:
        textfdist = _loadlist(stdanswer.textfdist)
        slist = _loadlist(stdanswer.sentencelist)
        pointlist = _loadlist(stdanswer.pointlist)
        rulelist = _loadlist(stdanswer.rulelist)

        print textfdist
        print slist
        print pointlist
        print rulelist

        # for alternate answers
        if alternative and alt_stdanswer:
            alt_textfdist = _loadlist(alt_stdanswer.textfdist)
            alt_slist = _loadlist(alt_stdanswer.sentencelist)
            alt_pointlist = _loadlist(alt_stdanswer.pointlist)
            alt_rulelist = _loadlist(alt_stdanswer.rulelist)
        else:
            alt_textfdist = None
            alt_slist = None
            alt_pointlist = None
            alt_rulelist = None

        print alt_textfdist
        print alt_slist
        print alt_pointlist
        print alt_rulelist
    # TODO: add better handling so that progress bar doesn't get stuck when algorithm code has an exception
    try:
        ans = Answer()
        # initialize for alternative answer
        if alternative:
            alt_ans = Answer()
        else:
            alt_ans = None

        if USE_STUDENT_TEXT_DIST:
            ans_textfdist = get_text_distribution(anstext)
            if ans_textfdist:
                textfdist = ans_textfdist
                # save the same to alt_textfdist
                if alternative:
                    alt_textfdist = ans_textfdist
                else:
                    alt_textfdist = None
            if not textfdist:
                textfdist = nltk.FreqDist(['test'])
            # for alternate answer
            if not alt_textfdist:
                alt_textfdist = nltk.FreqDist(['test'])
        print ans.Analysis(anstext, textfdist, slist, pointlist, rulelist)
        mark, marklist, omitted, closeness_stats = ans.Analysis(
            anstext, textfdist, slist, pointlist, rulelist)
        if alternative:
            # calculate the same with alternate standard answer
            alt_mark, alt_marklist, alt_omitted, alt_closeness_stats = alt_ans.Analysis(
                anstext, alt_textfdist, alt_slist, alt_pointlist, alt_rulelist)
        else:
            alt_mark = alt_marklist = alt_omitted = alt_closeness_stats = None

        try:
            stucanvaslist = Canvas.objects.filter(question=question,
                                                  stuanswer=stuanswer)
            canvasmark = sum(stucanvas.mark for stucanvas in stucanvaslist)
            print 'canvasmark = ', canvasmark
        except Exception, e:
            import traceback
            traceback.format_exc()
            logger.error(e)
            canvasmark = 0
        # save mark result

        print '\n##############################################' * 2
        print 'thumbnail_ids = ', thumbnail_ids
        imgmark, stuansimages = __getimgmark(thumbnail_ids, question)
        print 'imgmark = ', imgmark
        # print 'stuansimages = ', stuansimages
        print '\n##############################################' * 2

        if not mark or not marklist:
            mark = 0
            marklist = list()
        if not omitted:
            omitted = list()

        if not alt_mark or not alt_marklist:
            alt_mark = 0
            alt_marklist = list()
        if not alt_omitted:
            alt_omitted = list()

        # Include optional listing with results from external grammar checker and optional closeness summarization
        grammar_issues = ans.critique_results[
            'report'] if ans.critique_results else ""
        closeness = ans.closeness if ans.closeness else 0.0

        # Alernative answer
        if alternative:
            alt_grammar_issues = alt_ans.critique_results[
                'report'] if alt_ans.critique_results else ""
            alt_closeness = alt_ans.closeness if alt_ans.closeness else 0.0

        # Apply min closeness band threshold for mark
        if (question.min_closeness_band > 0):
            band = int(closeness * NUM_CLOSENESS_BANDS - 0.001)
            if (band < question.min_closeness_band):
                logger.info(
                    "Zeroing mark (%s) as closeness band (%s) less then min (%s)"
                    % (mark, band, question.min_closeness_band))
                mark = 0

            if not mark and alternative:
                print "inside alternative marking analysis"
                band = int(alt_closeness * NUM_CLOSENESS_BANDS - 0.001)
                if (band < question.min_closeness_band):
                    alt_mark = 0

        if (stuanswer.mark <= mark + imgmark + canvasmark) or (stuanswer.mark
                                                               <= alt_mark):
            proceed_further = True
            stuanswer.html_answer = answer_html
            stuanswer.txt_answer = anstext
            stuanswer.save()

        print 'mark = ', mark, '\n'
        print 'marklist = ', marklist, '\n'
        print 'omitted = ', omitted, '\n'
        print 'closeness_stats = ', closeness_stats, '\n'

        print 'alt_mark = ', alt_mark, '\n'
        print 'alt_marklist = ', alt_marklist, '\n'
        print 'alt_omitted = ', alt_omitted, '\n'
        print 'alt_closeness_stats = ', alt_closeness_stats, '\n'
Example #10
    text = re.sub(cleanit, '', raw)
    return text


f = open('wiki_00', "r", encoding="utf8")
raw = f.read()
raw = cleanhtmlfun(raw)

from nltk.tokenize import TreebankWordTokenizer
tbw = TreebankWordTokenizer()
tokens = tbw.tokenize(raw)
tokens = [''.join(c for c in s if c not in string.punctuation) for s in tokens]
tokens = [s for s in tokens if s]

trigrams = nltk.ngrams(tokens, 3)
fdist_trigrams = nltk.FreqDist(trigrams)
unique_trigrams = fdist_trigrams.B()  # B() gives the total number of unique trigrams

import matplotlib.pyplot as plt

Y = fdist_trigrams.values()
Y = sorted(Y, reverse=True)
X = range(len(Y))
plt.figure()
plt.loglog(X, Y)
plt.xlabel('Trigram')
plt.ylabel('Frequency')
plt.title('Trigram Frequencies')
plt.grid()
plt.show()
    reader.next()
    # Split full comments into sentences
    sentences = itertools.chain(
        *[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = [
        "%s %s %s" % (sentence_start_token, x, sentence_end_token)
        for x in sentences
    ]
print("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print("Using vocabulary size %d." % vocabulary_size)
print(
    "The least frequent word in our vocabulary is '%s' and appeared %d times."
    % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
Example #12
import nltk
# nltk.download('brown')

from nltk.corpus import brown

print(brown.categories())

genres = ['fiction', 'humor', 'romance']
whwords = ['what', 'which', 'how', 'why', 'when', 'where', 'who']

for i in range(0, len(genres)):
    genre = genres[i]
    print()
    print("Analysing '" + genre + "' wh words")
    genre_text = brown.words(categories=genre)
    fdist = nltk.FreqDist(genre_text)
    for wh in whwords:
        print(wh + ':', fdist[wh], end=' ')
    encoding='utf-8')
#movies.info()
sample = movies.loc[:, [
    'Title', 'Movie_ID', 'Synopsis', 'Genre1', 'Genre2', 'Genre3'
]]
train = sample
cols = ['Genre1', 'Genre2', 'Genre3']
train['Genre'] = list(train[cols].apply(
    lambda x: ','.join(x.dropna()).split(','), axis=1))
train.drop(['Genre1', 'Genre2', 'Genre3'], axis=1, inplace=True)
#len(train)
#train
all_genres = sum(train.Genre, [])
#len(set(all_genres))

all_genres = nltk.FreqDist(all_genres)  # 5-Genres
all_genres_df = pd.DataFrame({
    'Genre': list(all_genres.keys()),
    'Count': list(all_genres.values())
})

all_genres_df.groupby(by='Genre').sum().sort_values('Count', ascending=False)

g = all_genres_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Genre")
ax.set(title='Summary of Genre Distribution', ylabel='Genres')
plt.show()

filters = [
    gsp.strip_tags, gsp.strip_punctuation, gsp.strip_multiple_whitespaces,
Example #14
# make the query results look like {"h": ["happy", "had"], "b": ["ball", "bat"]}
query_result = get_query_result(query_terms)
query_result_articleonly = []
for a_page in query_result['query']['pages']:
    query_result_articleonly.append(
        query_result['query']['pages'][a_page]['extract'])
results = filter_tags("".join(query_result_articleonly))

tagged_sent = pos_tag(results.split())
#print(tagged_sent)
adjs = [
    word for word, pos in tagged_sent
    if pos == 'JJ' or pos == 'JJR' or pos == "JJS"
]
#print(adjs)
freq_adjs = nltk.FreqDist(adjs)
freq_pair_list = freq_adjs.most_common(6)
freq_adj_list = []
for a_freq_adj in freq_pair_list:
    freq_adj_list.append(a_freq_adj[0])
#print(freq_adj_list)
freq_adj_firs = []
for a_fir in freq_adj_list:
    freq_adj_firs.append(a_fir[0])
freq_adj_firs = set(freq_adj_firs)
word_dict = {}
for a_fir in freq_adj_firs:
    alist = []
    for a_word in freq_adj_list:
        if a_fir == a_word[0]:
            alist.append(a_word)
    # (assumed final step; the original snippet is cut off here)
    word_dict[a_fir] = alist
Example #15
def frequence_specific_word(text):
    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    return all_words[text]
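
An illustrative call (assumes the movie_reviews corpus has been fetched with nltk.download('movie_reviews'); note the function rebuilds the FreqDist over the whole corpus on every call):

print(frequence_specific_word('excellent'))  # occurrences of 'excellent' in movie_reviews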
len(all_text)

# clean the text
my_clean_text = clean_text(all_text)
len(my_clean_text)

# get the tokens of the cleaned text
tokens = word_tokenize(my_clean_text, 'portuguese')
len(tokens)

# remove stopwords from the tokens
tokens_without_sw = remove_stopwords(tokens)
len(tokens_without_sw)

# calculate the frequency of the tokens
freq = nltk.FreqDist(tokens_without_sw)

# plot word x count
fig = plt.figure(figsize=[10, 5])
freq.plot(20, cumulative=False)  # plot the frequency
plt.title('Palavras mais frequentes', fontsize=14)
plt.xlabel('Palavra', fontsize=14)
plt.xticks(rotation=90)
plt.ylabel('Contagem', fontsize=14)
plt.tight_layout()

# save fig
path_save_fig = folder_save_fig + 'paravras_mais_frequentes_google.png'
plt.savefig(path_save_fig)

# list with the 100 more common words
Example #17
def select_best_keywords(metadata_table):
    table_to_process = metadata_table[["pr_title", "pr_abstract"]]
    table_to_process["pr_title"] = table_to_process["pr_title"].apply(
        lambda x: remove_stop_words(x))
    table_to_process["pr_abstract"] = table_to_process["pr_abstract"].apply(
        lambda x: remove_stop_words(x))

    print("Text Data after removing of stop-words")
    display(table_to_process)

    words_corpus = get_words_corpus(table_to_process)
    print(len(words_corpus))

    dist = nltk.FreqDist(
        words_corpus)  # Creating a distribution of words' frequencies
    grams = dist.most_common(1000)  # Obtaining the most frequent words
    bigrams = nltk.collocations.BigramAssocMeasures()
    trigrams = nltk.collocations.TrigramAssocMeasures()

    bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(
        words_corpus)
    trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(
        words_corpus)

    print("Showing first", 2000, "top-freqent words in the corpus")
    grams = pd.DataFrame(grams)
    grams.index = range(1, len(grams) + 1)
    grams.columns = ["Word", "Frequency"]
    display(grams)

    bi_filter = 7
    print(
        "Showing bigrams in the corpus found by Pointwise Mutual Information method"
    )
    print("Applying frequency filter: a bigramm occurs more than", bi_filter,
          "times")
    bigramFinder.apply_freq_filter(bi_filter)
    bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)),
                                  columns=['bigram',
                                           'PMI']).sort_values(by='PMI',
                                                               ascending=False)
    bigramPMITable["bigram"] = bigramPMITable["bigram"].apply(
        lambda x: ' '.join(x))
    display(bigramPMITable)

    tri_filter = 5
    print(
        "Showing trigrams in the corpus found by Pointwise Mutual Information method"
    )
    print("Applying frequency filter: a trigramm occurs more than", tri_filter,
          "times")
    trigramFinder.apply_freq_filter(tri_filter)
    trigramPMITable = pd.DataFrame(
        list(trigramFinder.score_ngrams(trigrams.pmi)),
        columns=['trigram', 'PMI']).sort_values(by='PMI', ascending=False)
    trigramPMITable["trigram"] = trigramPMITable["trigram"].apply(
        lambda x: ' '.join(x))
    display(trigramPMITable)

    gram_dict = grams.set_index('Word').T.to_dict('list')
    bigramPMIDict = bigramPMITable.set_index('bigram').T.to_dict('list')
    trigramPMIDict = trigramPMITable.set_index('trigram').T.to_dict('list')

    keyword_processor = KeywordProcessor()
    textrank_keyword_processor = KeywordProcessor()

    gram_dict.update(bigramPMIDict)
    bigramPMIDict.update(trigramPMIDict)

    #     print(gram_dict)
    print(
        "Extracting keywords from texts using Pointwise Mutual Information method and TextRank"
    )
    text_rank_key_words = dict()
    for i in range(0, len(table_to_process)):
        sentences = table_to_process.loc[i, "pr_abstract"]
        if sentences != None:
            keywords = get_keywords_by_textrank(sentences)
            if keywords != None:
                text_rank_key_words.update(keywords)
                print("Text", i, "- Done")
    for i in range(0, len(table_to_process)):
        sentences = table_to_process.loc[i, "pr_title"]
        if sentences != None:
            keywords = get_keywords_by_textrank(sentences)
            if keywords != None:
                text_rank_key_words.update(keywords)
                print("Text", i, "- Done")

    for keyword in gram_dict.keys():
        parts = keyword.split()
        parts = "_".join(parts)
        keyword_processor.add_keyword(keyword, parts)

    for keyword in text_rank_key_words.keys():
        parts = keyword.split()
        parts = "_".join(parts)
        textrank_keyword_processor.add_keyword(keyword, parts)

    print(len(keyword_processor.get_all_keywords()))
    print(len(textrank_keyword_processor.get_all_keywords()))
    print(len(text_rank_key_words))

    table_to_process["pr_abstract"] = table_to_process["pr_abstract"].apply(
        lambda x: merge_two_keywords_methods(x, textrank_keyword_processor,
                                             keyword_processor))
    table_to_process["pr_title"] = table_to_process["pr_title"].apply(
        lambda x: merge_two_keywords_methods(x, textrank_keyword_processor,
                                             keyword_processor))

    for i in range(0, len(table_to_process)):
        metadata_table.loc[i, "pr_title"] = table_to_process.loc[i, "pr_title"]
        metadata_table.loc[i,
                           "pr_abstract"] = table_to_process.loc[i,
                                                                 "pr_abstract"]

    print(
        "Comparison of Text Data after Keywords Extraction using Pointwise Mutual Information method and TextRank"
    )
    display(metadata_table[["title", "pr_title", "abstract", "pr_abstract"]])

    print("Extracting keywords from texts using TF/IDF")
    dataset = []
    for i in range(0, len(table_to_process["pr_abstract"])):
        sentences = table_to_process.loc[i, "pr_abstract"]
        if sentences != None:
            sentences = " ".join(sentences)
            dataset.append(sentences)

    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(dataset)

    index = 0
    for i in range(0, len(metadata_table)):
        if table_to_process.loc[i, "pr_abstract"] == None:
            continue
        metadata_table.loc[i, "pr_abstract"] = retain_best_tf_idf_keywords(
            table_to_process.loc[i, "pr_abstract"], index, tfIdf,
            tfIdfVectorizer)
        index += 1
    print("Extracting keywords from texts using TF/IDF")
    dataset = []
    for i in range(0, len(table_to_process["pr_title"])):
        sentences = table_to_process.loc[i, "pr_title"]
        if sentences != None:
            sentences = " ".join(sentences)
            dataset.append(sentences)

    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(dataset)

    index = 0
    for i in range(0, len(metadata_table)):
        if table_to_process.loc[i, "pr_title"] == None:
            continue
        metadata_table.loc[i, "pr_title"] = retain_best_tf_idf_keywords(
            table_to_process.loc[i, "pr_title"], index, tfIdf, tfIdfVectorizer)
        index += 1
    return metadata_table
Example #18
def named_entity_recog(x):
    import nltk
    return nltk.ne_chunk([x])


NER_word = filtered_data.map(named_entity_recog)
print(NER_word.collect())
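
Note that nltk.ne_chunk expects POS-tagged (word, tag) tuples rather than bare strings, so a variant that tags first is likely closer to the intent; the sketch below is not part of the original code and assumes the tagger and NE-chunker data (averaged_perceptron_tagger, maxent_ne_chunker, words) are available:

def named_entity_recog_tagged(x):
    import nltk
    # tag the single token, then run the named-entity chunker on the tagged pair
    return nltk.ne_chunk(nltk.pos_tag([x]))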

#Stemming and Lemmatization
nltk.download('wordnet')


def lemma(x):
    import nltk
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(x)


lem_words = filtered_data.map(lemma)
print(lem_words.collect())

#Text Classification .
#find the words which has the highest frequency and sort them in decreasing order of their frequency.
text_Classifi = filtered_data.flatMap(
    lambda x: nltk.FreqDist(x.split(",")).most_common()).map(
        lambda x: x).reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1],
                                                            ascending=False)
topcommon_data = text_Classifi.take(100)  #take first 100 most common words

topcommon_data
Example #19
def rmp_sentment_analysis(src):
    df = pd.read_csv(
        src,
        usecols=['professor_name', 'school_name', 'star_rating', 'comments'])
    high_professor_comment = df[(df['star_rating'] >= 4.0) & (
        df['star_rating'] <= 5.0)]['comments'].sample(10000).dropna().tolist()
    text = ' '.join(high_professor_comment)
    tokens = [t.lower() for t in re.split(r'[^\w\s]|\s', text) if t != '']
    print(tokens)

    sr = stopwords.words('english')
    add_stopword = ['him.', 'took', 'one', 'took', 'day']
    sr = sr + add_stopword
    clean_tokens = tokens[:]
    for token in tokens:
        # use the extended stop-word list (sr) built above rather than
        # re-querying stopwords.words() on every iteration
        if token in sr:
            clean_tokens.remove(token)

    freq = nltk.FreqDist(clean_tokens)
    for key, val in freq.items():
        print(str(key) + ':' + str(val))
    freq.plot(20, cumulative=False)

    # https://github.com/cjhutto/vaderSentiment#about-the-scoring
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()

    # sentiment of comments on highly rated professors
    high_professor_sentiment = []
    pos_num = 0
    neg_num = 0
    for sentence in high_professor_comment:
        vs = analyzer.polarity_scores(sentence)
        # print("{:-<65} {}".format(sentence, str(vs)))
        # print(vs['compound'])
        high_professor_sentiment.append(vs['compound'])
        if vs['compound'] >= 0.05:
            print('positive')
            pos_num += 1
        else:
            print('negative')
            neg_num += 1
    print(pos_num, neg_num)
    ratio = pos_num / (pos_num + neg_num)
    print('ratio:', ratio)
    high_mean = np.mean(high_professor_sentiment)
    high_std = np.std(high_professor_sentiment)
    print('平均数:', high_mean, '标准差', high_std)

    low_professor_comment = df[(df['star_rating'] >= 1.0) & (
        df['star_rating'] <= 2.0)]['comments'].sample(10000).dropna().tolist()
    # low-rated professors
    print('低分教授')
    low_professor_sentiment = []
    low_pos_num = 0
    low_neg_num = 0
    for sentence in low_professor_comment:
        vs = analyzer.polarity_scores(sentence)
        # print("{:-<65} {}".format(sentence, str(vs)))
        # print(vs['compound'])
        low_professor_sentiment.append(vs['compound'])
        if vs['compound'] <= -0.05:
            # print('positive')
            low_pos_num += 1
        else:
            # print('negtive')
            low_neg_num += 1
    print(low_pos_num, low_neg_num)
    low_ratio = low_pos_num / (low_pos_num + low_neg_num)
    print('ratio:', low_ratio)
    low_mean = np.mean(low_professor_sentiment)
    low_std = np.std(low_professor_sentiment)
    print('平均数:', low_mean, '标准差', low_std)
Example #20
def top10POS(tokensPOS):
    seqPOS = estraiSeqPOS(tokensPOS)
    freqDist = nltk.FreqDist(seqPOS)
    return freqDist.most_common(10)
Example #21
    len(satoshi_nakamoto_tokens) / len(sentence)))

# entity extraction
satoshi = nlp(satoshi_nakamoto_text)
entity_list = [
    'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LANGUAGE', 'LAW'
]

satoshi_list = []
for i in range(len(satoshi)):
    if satoshi[i].ent_type_ in entity_list:
        satoshi_list.append(satoshi[i].text)

print("entity extration: {}".format(satoshi_list))

# noun chunk
noun_chunk = []
for chunk in satoshi.noun_chunks:
    noun_chunk.append(chunk.text)

print("noun_chunk: {}".format(noun_chunk))

# n gram
import nltk

tokens = nltk.tokenize.word_tokenize(satoshi_nakamoto_text)
bgs = nltk.ngrams(tokens, 3)
fdist = nltk.FreqDist(bgs)
print(fdist.most_common(30))
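
(Despite the name bgs, nltk.ngrams(tokens, 3) yields trigrams; a bigram distribution would use n=2, for example:)

bigram_fdist = nltk.FreqDist(nltk.ngrams(tokens, 2))
print(bigram_fdist.most_common(30))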
Example #22
def top20Tokens(tokens1):
    esclusi = [',', '.', ':', ';']  # keep only tokens that are not punctuation
    nonPunteggiatura = [a[0] for (a, b) in tokens1 if not (a[1] in esclusi)]
    fdist = nltk.FreqDist(nonPunteggiatura)
    return fdist.most_common(20)
Example #23
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import gutenberg
macbeth = gutenberg.words("shakespeare-macbeth.txt")
stopwords = set(nltk.corpus.stopwords.words())
fd = nltk.FreqDist([
    w for w in macbeth
    if w.lower() not in stopwords and len(w) > 3 and w.isalpha()
])
d = list(fd.keys())
print(d[0:50])
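
Note that in NLTK 3, FreqDist.keys() is not ordered by frequency (FreqDist is a Counter subclass), so the print above is not a top-50 list; the frequency-ordered accessor is most_common:

print(fd.most_common(50))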
Example #24
def main():
    if not len(sys.argv[1:]):
        usage()
    # file names passed as command-line arguments
    file1 = sys.argv[1]
    file2 = sys.argv[2]
    print '-' * 80
    print " Progetto di Linguistica  Computazionale"
    print " Programma 2 "
    print '-' * 80
    tokensText1, frasiMarkov1, frasi1 = openTextFile(
        file1)  # tokens of the text, sentences for the Markov example
    tokensText2, frasiMarkov2, frasi2 = openTextFile(file2)
    tokensPOS1, namedEntityDict1 = analisiLing(
        frasi1)  # annotation process and named entities (NE)
    tokensPOS2, namedEntityDict2 = analisiLing(frasi2)
    top10POS1 = top10POS(tokensPOS1)  # 10 most frequent PoS tags
    top10POS2 = top10POS(tokensPOS2)
    print '-' * 80
    print " 10 PoS (Part-of-Speech) più frequenti"
    print '-' * 80
    print ' Testo:', file1, ' ' * 12, '| Testo:', file2
    print '-' * 80
    for i in range(min(len(top10POS1), len(top10POS2))):  # print the comparison
        pos1, freq1 = top10POS1[i]
        pos2, freq2 = top10POS2[i]
        print " {: <20}{: <12}| {: <20}{: <12}".format(pos1, freq1, pos2,
                                                       freq2)
    print '-' * 80
    print " 20 Token più  frequenti"
    print '-' * 80
    print ' Testo:', file1, ' ' * 12, '| Testo:', file2
    print '-' * 80
    topTokens1 = top20Tokens(getBigrams(tokensPOS1))  # 20 most frequent tokens
    topTokens2 = top20Tokens(getBigrams(tokensPOS2))
    for i in range(min(len(topTokens1),
                       len(topTokens2))):  # print the comparison
        tok1, freq1 = topTokens1[i]
        tok2, freq2 = topTokens2[i]
        print " {: <25}{: <7}| {: <25}{: <7}".format(tok1.encode('utf-8'),
                                                     freq1,
                                                     tok2.encode('utf-8'),
                                                     freq2)
    print '-' * 80
    print '-' * 80
    print " 20 Bigrammi più frequenti"
    print '-' * 80
    print ' Testo:', file1, ' ' * 12, '| Testo:', file2
    print '-' * 80
    bigrammi1 = getBigrams(tokensPOS1)  # bigrams of the annotated text
    bigrammi2 = getBigrams(tokensPOS2)
    topBigrams1 = top20Bigrams(bigrammi1)  # 20 most frequent bigrams
    topBigrams2 = top20Bigrams(bigrammi2)
    for i in range(min(len(topBigrams1),
                       len(topBigrams2))):  # print the comparison
        big1, freq1 = topBigrams1[i]
        big2, freq2 = topBigrams2[i]
        print " {: <25}{: <7}| {: <25}{: <7}".format(
            big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), freq1,
            big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), freq2)
    print '-' * 80
    print " 20 Trigrammi più frequenti"
    print '-' * 80
    print ' Testo:', file1, ' ' * 18, '| Testo:', file2
    print '-' * 80
    topTrigrams1 = top20Trigrams(
        getTrigrams(tokensPOS1))  # 20 most frequent trigrams
    topTrigrams2 = top20Trigrams(getTrigrams(tokensPOS2))
    for i in range(min(len(topTrigrams1),
                       len(topTrigrams2))):  # print the comparison
        tri1, freq1 = topTrigrams1[i]
        tri2, freq2 = topTrigrams2[i]
        print " {: <35}{: <3}| {: <35}{: <3}".format(
            tri1[0].encode('utf-8') + " " + tri1[1].encode('utf-8') + " " +
            tri1[2].encode('utf-8'), freq1, tri2[0].encode('utf-8') + " " +
            tri2[1].encode('utf-8') + " " + tri2[2].encode('utf-8'), freq2)
    print '-' * 80
    print '-' * 80
    print " 20 Bigrammi - Aggettivo e Sostantivo"
    print '-' * 80
    setBigramsAggSost1 = bigrammiAggSos(
        bigrammi1)  # set of bigrams: adjective, noun
    setBigramsAggSost2 = bigrammiAggSos(bigrammi2)
    vocabFreq1 = dictVocabFreq(
        set(tokensText1), tokensText1)  # dictionary with tokens and frequencies
    vocabFreq2 = dictVocabFreq(set(tokensText2), tokensText2)
    bigrams1 = getBigrams(tokensText1)  # bigrams of the unannotated text
    bigrams2 = getBigrams(tokensText2)
    dictBigrammi1 = infoBigrams(
        bigrams1, setBigramsAggSost1, vocabFreq1
    )  # dictionary with adjective-noun bigram info: (F(u,v), F(u), F(v), P(v|u), P(u,v))
    dictBigrammi2 = infoBigrams(bigrams2, setBigramsAggSost2, vocabFreq2)
    forzaAssoc1 = getLocalMutualInformation(
        dictBigrammi1, vocabFreq1)  # Local Mutual Information
    forzaAssoc2 = getLocalMutualInformation(dictBigrammi2, vocabFreq2)
    print " Con probabilità congiunta massima P(u,v):"
    print ' Testo:', file1, ' ' * 12, '|    Testo:', file2
    print '-' * 80
    probCongiunta1 = ordinaProbCongiunta(dictBigrammi1)[:20]
    probCongiunta2 = ordinaProbCongiunta(dictBigrammi2)[:20]
    for i in range(min(len(probCongiunta1),
                       len(probCongiunta2))):  # print the comparison
        big1, p1 = probCongiunta1[i]
        big2, p2 = probCongiunta2[i]
        p1 = Decimal(str(p1[4])).quantize(Decimal('.00001'),
                                          rounding=ROUND_DOWN)
        p2 = Decimal(str(p2[4])).quantize(Decimal('.00001'),
                                          rounding=ROUND_DOWN)
        print " {: <25}{: <7}|    {: <25}{: <7}".format(
            big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), p1,
            big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), p2)

    print '-' * 80
    print " Con probabilità condizionata massima P(v|u):"
    print ' Testo:', file1, ' ' * 12, '|    Testo:', file2
    print '-' * 80
    probCondizionata1 = ordinaProbCondizionata(dictBigrammi1)[:20]
    probCondizionata2 = ordinaProbCondizionata(dictBigrammi2)[:20]
    for i in range(min(len(probCondizionata1),
                       len(probCondizionata2))):  # print the comparison
        big1, p1 = probCondizionata1[i]
        big2, p2 = probCondizionata2[i]
        p1 = Decimal(str(p1[3])).quantize(Decimal('.0001'),
                                          rounding=ROUND_DOWN)
        p2 = Decimal(str(p2[3])).quantize(Decimal('.0001'),
                                          rounding=ROUND_DOWN)
        print " {: <25}{: <7}|    {: <25}{: <7}".format(
            big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), p1,
            big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), p2)

    print '-' * 80
    print " Con forza associativa massima (LMI):"
    print ' Testo:', file1, ' ' * 12, '|    Testo:', file2
    print '-' * 80
    topLMI1 = ordinaDict(forzaAssoc1)[:20]
    topLMI2 = ordinaDict(forzaAssoc2)[:20]
    for i in range(min(len(topLMI1), len(topLMI2))):  # print the comparison
        big1, lmi1 = topLMI1[i]
        big2, lmi2 = topLMI2[i]
        lmi1 = Decimal(str(lmi1)).quantize(Decimal('.001'),
                                           rounding=ROUND_DOWN)
        lmi2 = Decimal(str(lmi2)).quantize(Decimal('.001'),
                                           rounding=ROUND_DOWN)
        print " {: <25}{: <7}|    {: <25}{: <7}".format(
            big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), lmi1,
            big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), lmi2)
    print '-' * 80
    print '-' * 80
    print " Le due frasi con probabilità più alta"
    print '-' * 80
    distrFreq1 = nltk.FreqDist(tokensText1)
    topFrase1, probFraseMax1 = maxProbMarkov0(
        len(tokensText1), distrFreq1,
        frasiMarkov1)  # order-0 Markov sentence with the highest probability
    print " 1° Frase calcolata attraverso un modello di Markov di ordine 0:"
    print '-' * 80
    print ' Testo:', file1
    print ' "', " ".join(topFrase1).encode('utf-8'), '"'
    print " Probabilità:", probFraseMax1
    print
    distrFreq2 = nltk.FreqDist(tokensText2)
    topFrase2, probFraseMax2 = maxProbMarkov0(
        len(tokensText2), distrFreq2,
        frasiMarkov2)  # order-0 Markov sentence with the highest probability
    print ' Testo:', file2
    print ' "', " ".join(topFrase2).encode('utf-8'), '"'
    print " Probabilità:", probFraseMax2
    print '-' * 80
    infoBigrammi1 = infoBigrams(
        bigrams1, set(bigrams1), vocabFreq1
    )  # dictionary with bigram info: (F(u,v), F(u), F(v), P(v|u), P(u,v))
    infoBigrammi2 = infoBigrams(bigrams2, set(bigrams2), vocabFreq2)
    topFrase1, probFraseMax1 = maxProbMarkov1(
        len(tokensText2), frasiMarkov1,
        infoBigrammi1)  # order-1 Markov sentence with the highest probability
    print " 2° Frase calcolata attraverso un modello di Markov di ordine 1:"
    print '-' * 80
    print ' Testo:', file1
    print ' "', " ".join(topFrase1).encode('utf-8'), '"'
    print " Probabilità:", probFraseMax1
    print
    distrFreq2 = nltk.FreqDist(tokensText2)
    topFrase2, probFraseMax2 = maxProbMarkov1(
        len(tokensText2), frasiMarkov2,
        infoBigrammi2)  # order-1 Markov sentence with the highest probability
    print ' Testo:', file2
    print ' "', " ".join(topFrase2).encode('utf-8'), '"'
    print " Probabilità:", probFraseMax2
    print '-' * 80
    print '-' * 80
    print " 20 nomi propri di persona più frequenti"
    print ' Testo:', file1, ' ' * 18, '|    Testo:', file2
    print '-' * 80
    topPerson1 = nltk.FreqDist(namedEntityDict1["PERSON"]).most_common(
        20)  # the 20 most frequent person proper names
    topPerson2 = nltk.FreqDist(namedEntityDict2["PERSON"]).most_common(20)
    for i in range(min(len(topPerson1),
                       len(topPerson2))):  # print the comparison
        tok1, freq1 = topPerson1[i]
        tok2, freq2 = topPerson2[i]
        print " {: <35}{: <3}| {: <35}{: <3}".format(tok1.encode('utf-8'),
                                                     freq1,
                                                     tok2.encode('utf-8'),
                                                     freq2)
    print '-' * 80
    print " 20 nomi propri di luogo più frequenti"
    print ' Testo:', file1, ' ' * 18, '|    Testo:', file2
    print '-' * 80
    topGpe1 = nltk.FreqDist(namedEntityDict1["GPE"]).most_common(
        20)  # the 20 most frequent place proper names
    topGpe2 = nltk.FreqDist(namedEntityDict2["GPE"]).most_common(20)
    for i in range(min(len(topGpe1), len(topGpe2))):  # print the comparison
        tok1, freq1 = topGpe1[i]
        tok2, freq2 = topGpe2[i]
        print " {: <35}{: <3}| {: <35}{: <3}".format(tok1.encode('utf-8'),
                                                     freq1,
                                                     tok2.encode('utf-8'),
                                                     freq2)
    print '-' * 80
    sys.exit(2)
Example #25
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    #tokens=[word for word in tokens if not word in stopwords.words()]
    words = [word for word in tokens if word.isalpha()]
    # remove stop words from tokens
    stopped_tokens = [
        i for i in words if (not i in en_stop and len(str(i)) > 2)
    ]
    texts.append(stopped_tokens)
    for word in stopped_tokens:
        cnt[word] += 1
    #Create your bigrams
    bgs = nltk.bigrams(stopped_tokens)
    #compute frequency distribution for all the bigrams in the text
    fdist = nltk.FreqDist(bgs)
    for k, v in fdist.items():
        cnt2[k] += 1

    fdist = nltk.FreqDist(nltk.trigrams(stopped_tokens))
    for k, v in fdist.items():
        cnt3[k] += 1

freq = cnt + cnt2 + cnt3

complete_data = []
for value, count in freq.most_common():
    complete_data.append([value, count])

writer = pd.DataFrame(complete_data, columns=['Keywords', 'Frequency'])
writer.to_csv("FrequencyUniBiGram_Care_Products.csv", index=None, header=True)
Example #26
def executar(experimento, nome_Base, acento):
    nomeBase = nome_Base
    path = experimento + nomeBase
    # print('executando:\n'+path)
    # print('Sem acento:\n'+('Sim' if(acento) else 'Não'))
    # nomeBase = 'sce/balanced/colecao_dourada_3_class_balanced.csv'
    # path = "experimento2/"+nomeBase
    base = readBase(nomeBase)

    tamBase = len(base)
    i = 0
    documents = []
    #print base[0][0].split()
    tknzr = nltk.tokenize.TweetTokenizer()

    while (i < tamBase):
        if (acento):
            w = remocaoacento(tknzr.tokenize(base[i][0]))
        else:
            w = tknzr.tokenize(base[i][0])
        w = remocaopontos(w)
        conteudoLista = (w, base[i][1])
        documents.append(conteudoLista)
        i += 1

    stemmer = nltk.stem.RSLPStemmer()

    # h=0
    # j=len(documents)
    # while (h<j):
    #    g=len(documents[h][0])
    #    f=0
    #    while(f<g):
    #        stemmer.stem(documents[h][0][f])
    #        f+=1
    #    h += 1

    random.shuffle(documents)

    all_words = []

    k = 0
    l = len(documents)
    while (k < l):
        m = len(documents[k][0])
        n = 0
        while (n < m):
            all_words.append(documents[k][0][n])
            n += 1
        k += 1

    # print(str(all_words))

    #all_words = nltk.FreqDist(all_words)  # computes word frequencies; set the word limit
    #all_words = nltk.LaplaceProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words))
    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)

    #nltk.WittenBellProbDist()  # look into how to change the ngram order
    #all_words = nltk.MLEProbDist(nltk.FreqDist(all_words))

    def wordbigram(word_feature):
        bigram = []
        i = 0
        l = len(word_feature) - 1
        while (i < l):
            s = tuple([
                stemmer.stem(word_feature[i]),
                stemmer.stem(word_feature[i + 1])
            ])
            bigram.append(s)
            i += 1
        return bigram

    def removerpalavras(todas_palavras, document):
        # remove the words that are not in todas_palavras
        linha = []
        for w in document:
            if (w in todas_palavras):
                linha.append(w)
        return linha

    def wordFeature(documents):
        # build a data dictionary (vocabulary)
        dicionario = []
        for w in documents:
            for q in w[0]:
                if (not q in dicionario):
                    dicionario.append(q)
        return dicionario

    documents = [[removerpalavras(all_words.samples(), w[0]), w[1]]
                 for w in documents]
    documents = [[wordbigram(w[0]), w[1]] for w in documents]
    word_features = wordFeature(
        documents
    )  # if using FreqDist: list of the words that appear more than 3000 times

    # print(str(len(word_features)))
    # exit()
    # word_features = list(all_words.samples())  # if using FreqDist: list of the words that appear more than 3000 times

    def find_features(document):
        # words = set(document)
        features = {}
        i = 0
        l = len(word_features)
        while (i < l):
            features[str(i)] = (word_features[i] in document)
            i += 1
        return features

    featuresets = [(find_features(rev), category)
                   for (rev, category) in documents]
    # print(str(featuresets))
    # for (w,category) in featuresets:
    #     print(str(len(w))+","+category+"\n")

    kfold = 4

    # baseInteira = featuresets

    tamT = len(featuresets)
    divisao = tamT // kfold

    ###### adjust the split
    baseDividida1 = featuresets[0:divisao]
    baseDividida2 = featuresets[divisao:(divisao * 2)]
    baseDividida3 = featuresets[(divisao * 2):(divisao * 3)]
    baseDividida4 = featuresets[(divisao * 3):tamT]

    #tamT = len(featuresets)
    #umQuarto = tamBase/4

    #training_set = featuresets[umQuarto:]
    #testing_set = featuresets[:umQuarto]

    #training_set = featuresets[100:]
    #testing_set = featuresets[0:100]

    ########################## round 1
    #print "## RODADA 1 ##"

    # print("treino")
    training_set = baseDividida2 + baseDividida3 + baseDividida4
    testing_set = baseDividida1

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB1 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB1 = 0
    while (g < len(precisaoMNB1)):
        somaPMNB1 = somaPMNB1 + precisaoMNB1[g]
        g = g + 1
    MNBpt1 = (somaPMNB1 / len(precisaoMNB1)) * 100
    MNBrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB1 = 0
    while (g < len(recallMNB1)):
        somaRMNB1 = somaRMNB1 + recallMNB1[g]
        g = g + 1
    MNBrt1 = (somaRMNB1 / len(recallMNB1)) * 100
    MNBfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB1 = 0
    while (g < len(f1MNB1)):
        somaFMNB1 = somaFMNB1 + f1MNB1[g]
        g = g + 1
    MNBft1 = (somaFMNB1 / len(f1MNB1)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Rpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoR1 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPR1 = 0
    while (g < len(precisaoR1)):
        somaPR1 = somaPR1 + precisaoR1[g]
        g = g + 1
    Rpt1 = (somaPR1 / len(precisaoR1)) * 100
    Rrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallR1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRR1 = 0
    while (g < len(recallR1)):
        somaRR1 = somaRR1 + recallR1[g]
        g = g + 1
    Rrt1 = (somaRR1 / len(recallR1)) * 100
    Rfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFR1 = 0
    while (g < len(f1R1)):
        somaFR1 = somaFR1 + f1R1[g]
        g = g + 1
    Rft1 = (somaFR1 / len(f1R1)) * 100
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Lpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoL1 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPL1 = 0
    while (g < len(precisaoL1)):
        somaPL1 = somaPL1 + precisaoL1[g]
        g = g + 1
    Lpt1 = (somaPL1 / len(precisaoL1)) * 100
    Lrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallL1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRL1 = 0
    while (g < len(recallL1)):
        somaRL1 = somaRL1 + recallL1[g]
        g = g + 1
    Lrt1 = (somaRL1 / len(recallL1)) * 100
    Lfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFL1 = 0
    while (g < len(f1L1)):
        somaFL1 = somaFL1 + f1L1[g]
        g = g + 1
    Lft1 = (somaFL1 / len(f1L1)) * 100

    ######################## round 2
    #print "## RODADA 2 ##"

    training_set = baseDividida1 + baseDividida3 + baseDividida4
    testing_set = baseDividida2

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp2 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB2 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB2 = 0
    while (g < len(precisaoMNB2)):
        somaPMNB2 = somaPMNB2 + precisaoMNB2[g]
        g = g + 1
    MNBpt2 = (somaPMNB2 / len(precisaoMNB2)) * 100
    MNBrp2 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB2 = 0
    while (g < len(recallMNB2)):
        somaRMNB2 = somaRMNB2 + recallMNB2[g]
        g = g + 1
    MNBrt2 = (somaRMNB2 / len(recallMNB2)) * 100
    MNBfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB2 = 0
    while (g < len(f1MNB2)):
        somaFMNB2 = somaFMNB2 + f1MNB2[g]
        g = g + 1
    MNBft2 = (somaFMNB2 / len(f1MNB2)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Rpp2 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoR2 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPR2 = 0
    while (g < len(precisaoR2)):
        somaPR2 = somaPR2 + precisaoR2[g]
        g = g + 1
    Rpt2 = (somaPR2 / len(precisaoR2)) * 100
    Rrp2 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallR2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRR2 = 0
    while (g < len(recallR2)):
        somaRR2 = somaRR2 + recallR2[g]
        g = g + 1
    Rrt2 = (somaRR2 / len(recallR2)) * 100
    Rfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFR2 = 0
    while (g < len(f1R2)):
        somaFR2 = somaFR2 + f1R2[g]
        g = g + 1
    Rft2 = (somaFR2 / len(f1R2)) * 100
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Lpp2 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoL2 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPL2 = 0
    while (g < len(precisaoL2)):
        somaPL2 = somaPL2 + precisaoL2[g]
        g = g + 1
    Lpt2 = (somaPL2 / len(precisaoL2)) * 100
    Lrp2 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallL2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRL2 = 0
    while (g < len(recallL2)):
        somaRL2 = somaRL2 + recallL2[g]
        g = g + 1
    Lrt2 = (somaRL2 / len(recallL2)) * 100
    Lfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFL2 = 0
    while (g < len(f1L2)):
        somaFL2 = somaFL2 + f1L2[g]
        g = g + 1
    Lft2 = (somaFL2 / len(f1L2)) * 100

    ##################### Round 3
    #print("## ROUND 3 ##")

    training_set = baseDividida1 + baseDividida2 + baseDividida4
    testing_set = baseDividida3

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp3 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB3 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB3 = 0
    while (g < len(precisaoMNB3)):
        somaPMNB3 = somaPMNB3 + precisaoMNB3[g]
        g = g + 1
    MNBpt3 = (somaPMNB3 / len(precisaoMNB3)) * 100
    MNBrp3 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB3 = 0
    while (g < len(recallMNB3)):
        somaRMNB3 = somaRMNB3 + recallMNB3[g]
        g = g + 1
    MNBrt3 = (somaRMNB3 / len(recallMNB3)) * 100
    MNBfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB3 = 0
    while (g < len(f1MNB3)):
        somaFMNB3 = somaFMNB3 + f1MNB3[g]
        g = g + 1
    MNBft3 = (somaFMNB3 / len(f1MNB3)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada3 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada3*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Rpp3 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoR3 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPR3 = 0
    while (g < len(precisaoR3)):
        somaPR3 = somaPR3 + precisaoR3[g]
        g = g + 1
    Rpt3 = (somaPR3 / len(precisaoR3)) * 100
    Rrp3 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallR3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRR3 = 0
    while (g < len(recallR3)):
        somaRR3 = somaRR3 + recallR3[g]
        g = g + 1
    Rrt3 = (somaRR3 / len(recallR3)) * 100
    Rfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFR3 = 0
    while (g < len(f1R3)):
        somaFR3 = somaFR3 + f1R3[g]
        g = g + 1
    Rft3 = (somaFR3 / len(f1R3)) * 100
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada3 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada3*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada3 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada3*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Lpp3 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoL3 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPL3 = 0
    while (g < len(precisaoL3)):
        somaPL3 = somaPL3 + precisaoL3[g]
        g = g + 1
    Lpt3 = (somaPL3 / len(precisaoL3)) * 100
    Lrp3 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallL3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRL3 = 0
    while (g < len(recallL3)):
        somaRL3 = somaRL3 + recallL3[g]
        g = g + 1
    Lrt3 = (somaRL3 / len(recallL3)) * 100
    Lfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFL3 = 0
    while (g < len(f1L3)):
        somaFL3 = somaFL3 + f1L3[g]
        g = g + 1
    Lft3 = (somaFL3 / len(f1L3)) * 100

    ############################ Round 4
    #print("## ROUND 4 ##")

    training_set = baseDividida1 + baseDividida2 + baseDividida3
    testing_set = baseDividida4

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp4 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB4 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB4 = 0
    while (g < len(precisaoMNB4)):
        somaPMNB4 = somaPMNB4 + precisaoMNB4[g]
        g = g + 1
    MNBpt4 = (somaPMNB4 / len(precisaoMNB4)) * 100
    MNBrp4 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB4 = 0
    while (g < len(recallMNB4)):
        somaRMNB4 = somaRMNB4 + recallMNB4[g]
        g = g + 1
    MNBrt4 = (somaRMNB4 / len(recallMNB4)) * 100
    MNBfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB4 = 0
    while (g < len(f1MNB4)):
        somaFMNB4 = somaFMNB4 + f1MNB4[g]
        g = g + 1
    MNBft4 = (somaFMNB4 / len(f1MNB4)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada4 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada4*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Rpp4 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoR4 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPR4 = 0
    while (g < len(precisaoR4)):
        somaPR4 = somaPR4 + precisaoR4[g]
        g = g + 1
    Rpt4 = (somaPR4 / len(precisaoR4)) * 100
    Rrp4 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallR4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRR4 = 0
    while (g < len(recallR4)):
        somaRR4 = somaRR4 + recallR4[g]
        g = g + 1
    Rrt4 = (somaRR4 / len(recallR4)) * 100
    Rfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFR4 = 0
    while (g < len(f1R4)):
        somaFR4 = somaFR4 + f1R4[g]
        g = g + 1
    Rft4 = (somaFR4 / len(f1R4)) * 100
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada4 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada4*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada4 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada4*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Lpp4 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoL4 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPL4 = 0
    while (g < len(precisaoL4)):
        somaPL4 = somaPL4 + precisaoL4[g]
        g = g + 1
    Lpt4 = (somaPL4 / len(precisaoL4)) * 100
    Lrp4 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallL4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRL4 = 0
    while (g < len(recallL4)):
        somaRL4 = somaRL4 + recallL4[g]
        g = g + 1
    Lrt4 = (somaRL4 / len(recallL4)) * 100
    Lfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFL4 = 0
    while (g < len(f1L4)):
        somaFL4 = somaFL4 + f1L4[g]
        g = g + 1
    Lft4 = (somaFL4 / len(f1L4)) * 100
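    # The four hand-coded rounds above amount to 4-fold cross-validation. A
    # compact sketch of the same idea (assuming the partitions baseDividida1..4
    # built earlier in this function) would loop over the folds instead:
    #     folds = [baseDividida1, baseDividida2, baseDividida3, baseDividida4]
    #     for i, testing_set in enumerate(folds):
    #         training_set = [ex for j, fold in enumerate(folds) if j != i for ex in fold]
    #         clf = SklearnClassifier(MultinomialNB())
    #         clf.train(training_set)
    #         predicted = clf.classify_many([fs for (fs, l) in testing_set])
    #         gold = [l for (fs, l) in testing_set]
    #         # ...compute and store accuracy/precision/recall/F1 per fold as above...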

    ################# Averages
    #print("## AVERAGES ##")

    # MULTINOMIAL NAIVE BAYES
    MNBmc = (MNBmc1 + MNBmc2 + MNBmc3 + MNBmc4) / 4
    MNBa = (MNBa1 + MNBa2 + MNBa3 + MNBa4) / 4
    MNBamax = max([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBamin = min([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBpp = (MNBpp1 + MNBpp2 + MNBpp3 + MNBpp4) / 4
    MNBpt = (MNBpt1 + MNBpt2 + MNBpt3 + MNBpt4) / 4
    MNBpmax = max([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBpmin = min([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBrp = (MNBrp1 + MNBrp2 + MNBrp3 + MNBrp4) / 4
    MNBrt = (MNBrt1 + MNBrt2 + MNBrt3 + MNBrt4) / 4
    MNBrmax = max([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBrmin = min([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBfp = (MNBfp1 + MNBfp2 + MNBfp3 + MNBfp4) / 4
    MNBft = (MNBft1 + MNBft2 + MNBft3 + MNBft4) / 4
    MNBfmax = max([MNBft1, MNBft2, MNBft3, MNBft4])
    MNBfmin = min([MNBft1, MNBft2, MNBft3, MNBft4])
    '''
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.set_aspect('equal')
    plt.imshow(MNBmc, interpolation='nearest', cmap=plt.cm.ocean)
    plt.colorbar()
    plt.show()
    '''

    # LOGISTIC REGRESSION
    Rmc = (Rmc1 + Rmc2 + Rmc3 + Rmc4) / 4
    Ra = (Ra1 + Ra2 + Ra3 + Ra4) / 4
    Ramax = max([Ra1, Ra2, Ra3, Ra4])
    Ramin = min([Ra1, Ra2, Ra3, Ra4])
    Rpp = (Rpp1 + Rpp2 + Rpp3 + Rpp4) / 4
    Rpt = (Rpt1 + Rpt2 + Rpt3 + Rpt4) / 4
    Rpmax = max([Rpt1, Rpt2, Rpt3, Rpt4])
    Rpmin = min([Rpt1, Rpt2, Rpt3, Rpt4])
    Rrp = (Rrp1 + Rrp2 + Rrp3 + Rrp4) / 4
    Rrt = (Rrt1 + Rrt2 + Rrt3 + Rrt4) / 4
    Rrmax = max([Rrt1, Rrt2, Rrt3, Rrt4])
    Rrmin = min([Rrt1, Rrt2, Rrt3, Rrt4])
    Rfp = (Rfp1 + Rfp2 + Rfp3 + Rfp4) / 4
    Rft = (Rft1 + Rft2 + Rft3 + Rft4) / 4
    Rfmax = max([Rft1, Rft2, Rft3, Rft4])
    Rfmin = min([Rft1, Rft2, Rft3, Rft4])

    # LINEAR SVC
    Lmc = (Lmc1 + Lmc2 + Lmc3 + Lmc4) / 4
    La = (La1 + La2 + La3 + La4) / 4
    Lamax = max([La1, La2, La3, La4])
    Lamin = min([La1, La2, La3, La4])
    Lpp = (Lpp1 + Lpp2 + Lpp3 + Lpp4) / 4
    Lpt = (Lpt1 + Lpt2 + Lpt3 + Lpt4) / 4
    Lpmax = max([Lpt1, Lpt2, Lpt3, Lpt4])
    Lpmin = min([Lpt1, Lpt2, Lpt3, Lpt4])
    Lrp = (Lrp1 + Lrp2 + Lrp3 + Lrp4) / 4
    Lrt = (Lrt1 + Lrt2 + Lrt3 + Lrt4) / 4
    Lrmax = max([Lrt1, Lrt2, Lrt3, Lrt4])
    Lrmin = min([Lrt1, Lrt2, Lrt3, Lrt4])
    Lfp = (Lfp1 + Lfp2 + Lfp3 + Lfp4) / 4
    Lft = (Lft1 + Lft2 + Lft3 + Lft4) / 4
    Lfmax = max([Lft1, Lft2, Lft3, Lft4])
    Lfmin = min([Lft1, Lft2, Lft3, Lft4])
    '''
    print("Linear SVC")
    print("Confusion matrix: ", Lmc)
    print("Accuracy: ", La)
    print("Partial precision: ", Lpp)
    print("Total precision: ", Lpt)
    print("Partial recall: ", Lrp)
    print("Total recall: ", Lrt)
    print("Partial F-measure: ", Lfp)
    print("Total F-measure: ", Lft)
    '''

    print(experimento + ':' + str(MNBa) + '\t' + str(Ra) + '\t' + str(La))
    with open(path, mode='w') as csv_file:
        #writer = csv.writer(csv_file)
        csv_file.writelines('Algoritmo' + ';' + 'Multinomial Naïve-Bayes' +
                            '\n')
        csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' +
                            'Precisão parcial' + ';' + 'Precisão total' + ';' +
                            'revocação parcial' + ';' + 'revocação total' +
                            ';' + 'f-medida parcial' + ';' + 'f-medida total' +
                            '\n')
        csv_file.writelines('1;' + str(MNBa1) + ';' + str(MNBpp1) + ';' +
                            str(MNBpt1) + ';' + str(MNBrp1) + ';' +
                            str(MNBrt1) + ';' + str(MNBfp1) + ';' +
                            str(MNBft1) + '\n')
        csv_file.writelines('2;' + str(MNBa2) + ';' + str(MNBpp2) + ';' +
                            str(MNBpt2) + ';' + str(MNBrp2) + ';' +
                            str(MNBrt2) + ';' + str(MNBfp2) + ';' +
                            str(MNBft2) + '\n')
        csv_file.writelines('3;' + str(MNBa3) + ';' + str(MNBpp3) + ';' +
                            str(MNBpt3) + ';' + str(MNBrp3) + ';' +
                            str(MNBrt3) + ';' + str(MNBfp3) + ';' +
                            str(MNBft3) + '\n')
        csv_file.writelines('4;' + str(MNBa4) + ';' + str(MNBpp4) + ';' +
                            str(MNBpt4) + ';' + str(MNBrp4) + ';' +
                            str(MNBrt4) + ';' + str(MNBfp4) + ';' +
                            str(MNBft4) + '\n')
        csv_file.writelines('==================' + '\n')
        csv_file.writelines('Total' + '\n')
        csv_file.writelines('Média;' + str(MNBa) + ';' + str(MNBpp) + ';' +
                            str(MNBpt) + ';' + str(MNBrp) + ';' + str(MNBrt) +
                            ';' + str(MNBfp) + ';' + str(MNBft) + '\n')
        csv_file.writelines('Máximo;' + str(MNBamax) + ';' + str(MNBpmax) +
                            ';' + str(MNBrmax) + ';' + str(MNBfmax) + '\n')
        csv_file.writelines('Mínimo;' + str(MNBamin) + ';' + str(MNBpmin) +
                            ';' + str(MNBrmin) + ';' + str(MNBfmin) + '\n')
        csv_file.writelines('==================' + '\n')
        csv_file.writelines('Algoritmo' + ';' + 'Regressão Linear' + '\n')
        csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' +
                            'Precisão parcial' + ';' + 'Precisão total' + ';' +
                            'revocação parcial' + ';' + 'revocação total' +
                            ';' + 'f-medida parcial' + ';' + 'f-medida total' +
                            '\n')
        csv_file.writelines('1;' + str(Ra1) + ';' + str(Rpp1) + ';' +
                            str(Rpt1) + ';' + str(Rrp1) + ';' + str(Rrt1) +
                            ';' + str(Rfp1) + ';' + str(Rft1) + '\n')
        csv_file.writelines('2;' + str(Ra2) + ';' + str(Rpp2) + ';' +
                            str(Rpt2) + ';' + str(Rrp2) + ';' + str(Rrt2) +
                            ';' + str(Rfp2) + ';' + str(Rft2) + '\n')
        csv_file.writelines('3;' + str(Ra3) + ';' + str(Rpp3) + ';' +
                            str(Rpt3) + ';' + str(Rrp3) + ';' + str(Rrt3) +
                            ';' + str(Rfp3) + ';' + str(Rft3) + '\n')
        csv_file.writelines('4;' + str(Ra4) + ';' + str(Rpp4) + ';' +
                            str(Rpt4) + ';' + str(Rrp4) + ';' + str(Rrt4) +
                            ';' + str(Rfp4) + ';' + str(Rft4) + '\n')
        csv_file.writelines('==================' + '\n')
        csv_file.writelines('Total' + '\n')
        csv_file.writelines('Média;' + str(Ra) + ';' + str(Rpp) + ';' +
                            str(Rpt) + ';' + str(Rrp) + ';' + str(Rrt) + ';' +
                            str(Rfp) + ';' + str(Rft) + '\n')
        csv_file.writelines('Máximo;' + str(Ramax) + ';' + str(Rpmax) + ';' +
                            str(Rrmax) + ';' + str(Rfmax) + '\n')
        csv_file.writelines('Mínimo;' + str(Ramin) + ';' + str(Rpmin) + ';' +
                            str(Rrmin) + ';' + str(Rfmin) + '\n')
        csv_file.writelines('==================' + '\n')
        csv_file.writelines('Algoritmo' + ';' + 'SVC Linear' + '\n')
        csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' +
                            'Precisão parcial' + ';' + 'Precisão total' + ';' +
                            'revocação parcial' + ';' + 'revocação total' +
                            ';' + 'f-medida parcial' + ';' + 'f-medida total' +
                            '\n')
        csv_file.writelines('1;' + str(La1) + ';' + str(Lpp1) + ';' +
                            str(Lpt1) + ';' + str(Lrp1) + ';' + str(Lrt1) +
                            ';' + str(Lfp1) + ';' + str(Lft1) + '\n')
        csv_file.writelines('2;' + str(La2) + ';' + str(Lpp2) + ';' +
                            str(Lpt2) + ';' + str(Lrp2) + ';' + str(Lrt2) +
                            ';' + str(Lfp2) + ';' + str(Lft2) + '\n')
        csv_file.writelines('3;' + str(La3) + ';' + str(Lpp3) + ';' +
                            str(Lpt3) + ';' + str(Lrp3) + ';' + str(Lrt3) +
                            ';' + str(Lfp3) + ';' + str(Lft3) + '\n')
        csv_file.writelines('4;' + str(La4) + ';' + str(Lpp4) + ';' +
                            str(Lpt4) + ';' + str(Lrp4) + ';' + str(Lrt4) +
                            ';' + str(Lfp4) + ';' + str(Lft4) + '\n')
        csv_file.writelines('==================' + '\n')
        csv_file.writelines('Total' + '\n')
        csv_file.writelines('Média;' + str(La) + ';' + str(Lpp) + ';' +
                            str(Lpt) + ';' + str(Lrp) + ';' + str(Lrt) + ';' +
                            str(Lfp) + ';' + str(Lft) + '\n')
        csv_file.writelines('Máximo;' + str(Lamax) + ';' + str(Lpmax) + ';' +
                            str(Lrmax) + ';' + str(Lfmax) + '\n')
        csv_file.writelines('Mínimo;' + str(Lamin) + ';' + str(Lpmin) + ';' +
                            str(Lrmin) + ';' + str(Lfmin) + '\n')
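        # The commented-out csv.writer above could produce the same ';'-separated
        # rows with far less string concatenation; a minimal sketch using the
        # same csv module (illustrative):
        #     writer = csv.writer(csv_file, delimiter=';')
        #     writer.writerow(['Iteração', 'Acurácia', 'Precisão parcial',
        #                      'Precisão total', 'revocação parcial',
        #                      'revocação total', 'f-medida parcial',
        #                      'f-medida total'])
        #     writer.writerow([1, MNBa1, MNBpp1, MNBpt1, MNBrp1, MNBrt1, MNBfp1, MNBft1])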
示例#27
0
    train="train.txt",
    validation="valid.txt",
    test="valid.txt",
    text_field=TEXT)

print('len(train)', len(train))

TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

if False:
    TEXT.build_vocab(train, max_size=1000)
    len(TEXT.vocab)

nltk.download('brown')
brown_1gram = nltk.FreqDist(brown.words())
brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
brown_trigrams = nltk.trigrams(brown.words())
condition_pairs = (((w0, w1), w2) for w0, w1, w2 in brown_trigrams)
brown_3gram = nltk.ConditionalFreqDist(condition_pairs)
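# Illustrative queries (assuming the Brown download above succeeded): each
# ConditionalFreqDist maps a context to a FreqDist of possible continuations.
print(brown_1gram.most_common(5))
print(brown_2gram['the'].most_common(5))
print(brown_3gram[('in', 'the')].most_common(5))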


class NamedBpttIterator(BPTTIterator):
    def __iter__(self):
        text = self.dataset[0].text
        TEXT = self.dataset.fields['text']
        TEXT.eos_token = None
        text = text + ([TEXT.pad_token] * int(
            math.ceil(len(text) / self.batch_size) * self.batch_size -
            len(text)))
        data = TEXT.numericalize([text], device=self.device)
示例#28
0
def frequence_words():
    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    return all_words.most_common(15)
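# A hedged usage sketch (assumes the imports this snippet relies on elsewhere):
#     import nltk
#     from nltk.corpus import movie_reviews
#     nltk.download('movie_reviews')
#     print(frequence_words())
# Since nothing is filtered before counting, the 15 most common tokens are
# dominated by punctuation and stopwords.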
示例#29
0
tf_df = pd.DataFrame(columns=word_list, index = range(len(training_data)))

avg_words = 0
for i in range(len(training_data)):
    if i % 1000 == 0:
        print("iteration", i)
    tokenized_conv = tokenizer.tokenize(training_data.loc[i, 'conversation'])
    tokenized_conv = [word for word in tokenized_conv if word not in cachedStopWords]
    tagText = nltk.pos_tag(tokenized_conv, tagset='universal')
    words = []
    n_words = 0
    for word, pos in tagText:
        if pos == 'NOUN' or pos == 'VERB':
            words.append(word_lemma.lemmatize(word))

    freq = nltk.FreqDist(words)
    for word in word_list:
        if word in freq:
            tf_df.loc[i, word] = freq[word]
            n_words += 1
        else:
            tf_df.loc[i, word] = 0

    avg_words += n_words

avg_words = avg_words/len(training_data)
print(avg_words)
#after reading all conversations
tf_df.to_csv('feature_set_1.csv')
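# A follow-up sketch (assuming tf_df holds the raw counts built above): the raw
# counts can be turned into relative term frequencies by normalising each row;
# the output file name is only illustrative.
#     tf_counts = tf_df.astype(float)
#     tf_normalised = tf_counts.div(tf_counts.sum(axis=1).replace(0, 1), axis=0)
#     tf_normalised.to_csv('feature_set_1_tf.csv')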
示例#30
0
def main(argv=None):
    analyzer = SiteAnalyzer()
    # if argv is None:
    #     argv = sys.argv
    # urls = []
    
    argv = [
        # 'deep',
        '-o',
        'sheet.xlsx',
        'https://www.apptunix.com/',
        'https://trio.dev/',
        'https://youteam.io/',
        'https://www.daxx.com/',
        # 'https://distantjob.com/',
        # 'https://relevant.software/',
        # 'https://remotemore.com/',
        # 'https://www.peerbits.com/',
        # 'https://codersera.com/',
        # 'https://www.makeitinua.com',
        # 'https://soshace.com/',
        # 'https://intersog.com/',
        # 'https://x-team.com/',
        # 'https://www.quickmonday.com/',
        # 'https://hackernoon.com/',
        # 'https://www.classicinformatics.com',
        # 'https://www.remoteco.com/',
        # 'https://remoteplatz.com/',
        # 'https://stormotion.io/',
        # 'https://www.trustshoring.com/',
        'https://geektastic.com/'
    ]
    try:   
        wb = load_workbook(filename = str(argv[argv.index('-o')+1]))

        if '-o' in argv and wb:
            temp_list = []

            for link in argv:
                if analyzer.url_validator(link):
                    temp_list.append(link)

            search_list = []

            if 'deep' in argv:
                for link in temp_list:
                    try:
                        analyzer.parse_page_for_subpages(link, search_list)
                    except HTTPError:
                        pass
            else: 
                search_list = temp_list

            results = {}

            for link in tqdm(search_list):
                if analyzer.url_validator(link) and ".pdf" not in link:
                    print(link)
                    try:
                        _ = analyzer.get_soup(link)
                    except InvalidURL:
                        continue
                    if _ is None:
                        continue
                    try:
                        _ = analyzer.tokenize_soup(_)
                        _ = analyzer.textify_data(_)
                        fd = nltk.FreqDist(_)
                    except TypeError:
                        continue

                    for key, value in dict(fd.most_common(10)).items():
                        if key not in results:
                            results.update({key:value})
                        else:
                            results[key] += value
                        
                    analyzer.add_to_datasheet(wb, input_array)    
                
    except ValueError:
        print("To select output file, please, pass -o a.xlsx file as an argument")
        exit()
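# A minimal sketch of the usual entry-point guard, assuming this snippet is run
# as a standalone script:
#     if __name__ == '__main__':
#         main()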