import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

# Read the article text (use a context manager so the file is closed)
with open('NYTimesArticle.txt', mode='r') as textfile:
    allwords = textfile.read()
print(allwords)

# Tokenize on word characters only (drops punctuation)
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(allwords.lower())
print(tokens)

# Remove English stopwords (precompute the set instead of rebuilding it per token)
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
print(tokens)

# Frequency distribution of the remaining tokens
freq_dist = nltk.FreqDist(tokens)
print(freq_dist)
print(freq_dist.most_common(25))
freq_dist.plot(25)
import nltk
from nltk.tokenize import word_tokenize

print(documents[:5])

# Build a single list of lowercased words from the positive and negative corpora
all_words = []
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)
for w in short_pos_words:
    all_words.append(w.lower())
for w in short_neg_words:
    all_words.append(w.lower())
print(all_words[:10])

all_words = nltk.FreqDist(all_words)
# FreqDist.keys() is not ordered by frequency, so use most_common() to take
# the 5000 most frequent words as features.
word_features = [w[0] for w in all_words.most_common(5000)]
#print(word_features)


def find_features(document):
    words = set(word_tokenize(document))
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
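# A possible continuation (not part of the original snippet): build labelled
# feature sets and train NLTK's Naive Bayes classifier on them. The 'documents'
# list of (review_text, category) pairs is assumed from earlier code, and the
# 9000/remainder train/test split is an arbitrary choice for illustration.
import random

featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
train_set, test_set = featuresets[:9000], featuresets[9000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("accuracy:", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(15)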
# Join token lists into strings
dd = ', '.join(str(x) for x in g)
''.join(word_list)
str1 = ''.join(str(e) for e in word_list)

# Cluster the TF-IDF matrix with k-means
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)
x = model.fit(matrix)
labels = x.labels_

lowercase = [x.lower() for x in word_list]
sents = lowercase
print(sents)
#wn.wup_similarity(sents, document)

from nltk.corpus import brown
freqs = nltk.FreqDist(w.lower() for w in sents)
print(freqs)

# Count word occurrences. Iterate over the token list, not the comma-joined
# string 'dd', which would count individual characters instead of words.
word_counter = {}
for word in lowercase:
    if word in word_counter:
        word_counter[word] += 1
    else:
        word_counter[word] = 1
popular_words = sorted(word_counter, key=word_counter.get, reverse=True)
top_ = popular_words[:100]
print(top_)

vectorizer = TfidfVectorizer(stop_words='english')
print(
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
def count_tokens(tokens, n=None):
    tokens = [t.lower() for t in tokens]
    freq_counter = nltk.FreqDist(tokens)
    return freq_counter.most_common(n)
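# A minimal usage sketch for count_tokens; the sample sentence is made up.
from nltk.tokenize import word_tokenize

sample = "The cat sat on the mat because the mat was warm."
print(count_tokens(word_tokenize(sample), n=3))
# e.g. [('the', 3), ('mat', 2), ('cat', 1)]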
# Find most common suffixes
from nltk.corpus import brown
import nltk
from pprint import pprint

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
    suffix_fdist[word[-1:]] += 1

# Top 100 most common suffixes
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
pprint(common_suffixes)


# Create feature extraction function with common suffixes
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features


# Create Decision Tree Classifier to extract pos
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
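# A possible continuation (not shown in the snippet above): train the decision
# tree POS tagger on the suffix features and evaluate it on the held-out split.
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
print(classifier.classify(pos_features('cats')))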
# Transform the authors' corpora into lists of word tokens
federalist_by_author_tokens = {}
federalist_by_author_length_distributions = {}
for author in authors:
    tokens = nltk.word_tokenize(federalist_by_author[author])

    # Filter out punctuation
    federalist_by_author_tokens[author] = ([
        token for token in tokens if any(c.isalpha() for c in token)
    ])

    # Get a distribution of token lengths
    token_lengths = [
        len(token) for token in federalist_by_author_tokens[author]
    ]
    federalist_by_author_length_distributions[author] = nltk.FreqDist(
        token_lengths)
    federalist_by_author_length_distributions[author].plot(15, title=author)

# Who are the authors we are analyzing?
authors = ("Hamilton", "Madison")

# Lowercase the tokens so that the same word, capitalized or not,
# counts as one word
for author in authors:
    federalist_by_author_tokens[author] = ([
        token.lower() for token in federalist_by_author_tokens[author]
    ])
federalist_by_author_tokens["Disputed"] = ([
    token.lower() for token in federalist_by_author_tokens["Disputed"]
])
import csv
import itertools
import pickle

import nltk
import numpy as np


def generate_data():
    vocabulary_size = 250
    unknown_token = "UNKNOWN_TOKEN"
    word_dim = 3

    print("Reading CSV file...")
    with open('raw_sentences.txt', 'r') as f:
        reader = csv.reader(f, skipinitialspace=True, delimiter='\n')
        # Split full comments into sentences
        sentences = itertools.chain(
            *[nltk.sent_tokenize(x[0].lower()) for x in reader])
        sentences = [' '.join(x.split()) for x in sentences]
        # ['no , he says now .', 'and what did he do ?', ...]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # [['no', ',', 'he', 'says', 'now', '.'], ['and', 'what', 'did', 'he', 'do', '?'], ...]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size - 1)
    # vocab <--- list [('.', 80974), ('it', 29200), (',', 24583), ...]
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

    print("Using vocabulary size %d." % vocabulary_size)
    print(
        "The least frequent word in our vocabulary is '%s' and appeared %d times."
        % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [
            w if w in word_to_index else unknown_token for w in sent
        ]

    print("\nExample sentence: '%s'" % sentences[0])
    print("\nExample sentence after Pre-processing: '%s'" %
          tokenized_sentences[0])

    # Build fixed-length windows of word indices (word_dim + 1 per window)
    indexed_sentences = [[word_to_index[w] for w in sent[:-1]]
                         for sent in tokenized_sentences]
    all_data = []
    for sen in indexed_sentences:
        if len(sen) >= word_dim + 1:
            for i in range(len(sen) - word_dim):
                all_data.append(sen[i:i + word_dim + 1])
    all_data = np.array(all_data, dtype=np.int16)
    np.random.shuffle(all_data)

    data = {
        'data': all_data,
        'word_to_index': word_to_index,
        'index_to_word': index_to_word,
        'vocab': vocab
    }
    with open('data.pickle', 'wb') as outfile:
        pickle.dump(data, outfile)
def student_submitanswer(request): logger.debug("in student_submitanswer()") print("in student_submitanswer()") alternative = False alternative_accepted = False response_data = {'state': 'failure'} proceed_further = False student, res = getSpByRequest(request, 'login') proceed_further = False if not student and res: return res try: qid = request.POST.get('questionid') question = Question.objects.get(id=qid) stdanswer = question.stdanswer # get the alternative standard answer for the answer if question.alt_stdanswer: alt_stdanswer = question.alt_stdanswer alternative = True else: alt_stdanswer = None except (Exception) as e: print e logger.error("question %s not exists" % qid) print("question %s does not exists" % qid) return HttpResponse(simplejson.dumps(response_data), mimetype="application/json") try: answer_html = request.POST.get('answer_html') logger.info('this is the answer: %s ' % answer_html) try: answer_html = answer_html.decode("utf8").encode('ascii', 'ignore') except: try: answer_html = answer_html.encode('ascii', 'ignore') except: pass anstext = stripHTMLStrings(strip_tags(answer_html)) try: anstext = anstext.decode("utf8").encode('ascii', 'ignore') except: try: anstext = anstext.encode('ascii', 'ignore') except: import traceback traceback.print_exc() stuanswer = StudentAnswer.objects.filter( question=question, student=student).latest('timestamp') stuanswer.html_answer = answer_html stuanswer.save() print "----------------------------------------------------------------------" print anstext # stuanswer.html_answer = answer_html # stuanswer.txt_answer = anstext # stuanswer.save() print "answer saveddddddddddddddddddd done" except Exception as e: import traceback print 111111111111111111111111111111, traceback.format_exc() traceback.format_exc() logger.error("cant find stuanswer for question %s" % question) logger.error(str(traceback.format_exc())) return HttpResponse(simplejson.dumps(response_data), mimetype="application/json") try: thumbnail_ids = [ int(i) for i in request.POST['stuthumbnail_ids'].split(',') if i ] print 'thumbnail_ids@@@@@@@@@ = ', thumbnail_ids except: import traceback traceback.format_exc() thumbnail_ids = [] logger.debug("no img for question %s" % question) pass #stdanswer algorithm to mark stuanswer if not stdanswer or not stuanswer: return HttpResponse(simplejson.dumps(response_data), mimetype="application/json") else: textfdist = _loadlist(stdanswer.textfdist) slist = _loadlist(stdanswer.sentencelist) pointlist = _loadlist(stdanswer.pointlist) rulelist = _loadlist(stdanswer.rulelist) print textfdist print slist print pointlist print rulelist # for alternate answers if alternative and alt_stdanswer: alt_textfdist = _loadlist(alt_stdanswer.textfdist) alt_slist = _loadlist(alt_stdanswer.sentencelist) alt_pointlist = _loadlist(alt_stdanswer.pointlist) alt_rulelist = _loadlist(alt_stdanswer.rulelist) else: alt_textfdist = None alt_slist = None alt_pointlist = None alt_rulelist = None print alt_textfdist print alt_slist print alt_pointlist print alt_rulelist # TODO: add better handling so that progress bar doesn't get stuck when algorithm code has an exception try: ans = Answer() # initialize for alternative answer if alternative: alt_ans = Answer() else: alt_ans = None if USE_STUDENT_TEXT_DIST: ans_textfdist = get_text_distribution(anstext) if ans_textfdist: textfdist = ans_textfdist # save the same to alt_textfdist if alternative: alt_textfdist = ans_textfdist else: alt_textfdist = None if not textfdist: textfdist = nltk.FreqDist(['test']) # for alternate 
answer if not alt_textfdist: alt_textfdist = nltk.FreqDist(['test']) print ans.Analysis(anstext, textfdist, slist, pointlist, rulelist) mark, marklist, omitted, closeness_stats = ans.Analysis( anstext, textfdist, slist, pointlist, rulelist) if alternative: # calculate the same with alternate standard answer alt_mark, alt_marklist, alt_omitted, alt_closeness_stats = alt_ans.Analysis( anstext, alt_textfdist, alt_slist, alt_pointlist, alt_rulelist) else: alt_mark = alt_marklist = alt_omitted = alt_closeness_stats = None try: stucanvaslist = Canvas.objects.filter(question=question, stuanswer=stuanswer) canvasmark = sum(stucanvas.mark for stucanvas in stucanvaslist) print 'canvasmark = ', canvasmark except Exception, e: import traceback traceback.format_exc() logger.error(e) canvasmark = 0 # save mark result print '\n##############################################' * 2 print 'thumbnail_ids = ', thumbnail_ids imgmark, stuansimages = __getimgmark(thumbnail_ids, question) print 'imgmark = ', imgmark # print 'stuansimages = ', stuansimages print '\n##############################################' * 2 if not mark or not marklist: mark = 0 marklist = list() if not omitted: omitted = list() if not alt_mark or not alt_marklist: alt_mark = 0 alt_marklist = list() if not alt_omitted: alt_omitted = list() # Include optional listing with results from external grammar checker and optional closeness summarization grammar_issues = ans.critique_results[ 'report'] if ans.critique_results else "" closeness = ans.closeness if ans.closeness else 0.0 # Alernative answer if alternative: alt_grammar_issues = alt_ans.critique_results[ 'report'] if alt_ans.critique_results else "" alt_closeness = alt_ans.closeness if alt_ans.closeness else 0.0 # Apply min closeness band threshold for mark if (question.min_closeness_band > 0): band = int(closeness * NUM_CLOSENESS_BANDS - 0.001) if (band < question.min_closeness_band): logger.info( "Zeroing mark (%s) as closeness band (%s) less then min (%s)" % (mark, band, question.min_closeness_band)) mark = 0 if not mark and alternative: print "inside alternative marking analysis" band = int(alt_closeness * NUM_CLOSENESS_BANDS - 0.001) if (band < question.min_closeness_band): alt_mark = 0 if (stuanswer.mark <= mark + imgmark + canvasmark) or (stuanswer.mark <= alt_mark): proceed_further = True stuanswer.html_answer = answer_html stuanswer.txt_answer = anstext stuanswer.save() print 'mark = ', mark, '\n' print 'marklist = ', marklist, '\n' print 'omitted = ', omitted, '\n' print 'closeness_stats = ', closeness_stats, '\n' print 'alt_mark = ', alt_mark, '\n' print 'alt_marklist = ', alt_marklist, '\n' print 'alt_omitted = ', alt_omitted, '\n' print 'alt_closeness_stats = ', alt_closeness_stats, '\n'
    text = re.sub(cleanit, '', raw)
    return text


f = open('wiki_00', "r", encoding="utf8")
raw = f.read()
raw = cleanhtmlfun(raw)

from nltk.tokenize import TreebankWordTokenizer
tbw = TreebankWordTokenizer()
tokens = tbw.tokenize(raw)

# Strip punctuation characters from each token and drop empty strings
tokens = [''.join(c for c in s if c not in string.punctuation) for s in tokens]
tokens = [s for s in tokens if s]

# Trigram frequency distribution
trigrams = nltk.ngrams(tokens, 3)
fdist_trigrams = nltk.FreqDist(trigrams)
unique_trigrams = fdist_trigrams.B()  # total number of unique trigrams

# Plot the trigram frequencies on a log-log scale
import matplotlib.pyplot as plt
Y = fdist_trigrams.values()
Y = sorted(Y, reverse=True)
X = range(len(Y))
plt.figure()
plt.loglog(X, Y)
plt.xlabel('Trigram')
plt.ylabel('Frequency')
plt.title('Trigram Frequencies')
plt.grid()
plt.show()
# Skip the CSV header row (Python 3: next(reader), not reader.next())
next(reader)

# Split full comments into sentences
sentences = itertools.chain(
    *[nltk.sent_tokenize(x[0].lower()) for x in reader])

# Append SENTENCE_START and SENTENCE_END
sentences = [
    "%s %s %s" % (sentence_start_token, x, sentence_end_token)
    for x in sentences
]
print("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print("Using vocabulary size %d." % vocabulary_size)
print(
    "The least frequent word in our vocabulary is '%s' and appeared %d times."
    % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
import nltk
# nltk.download('brown')
from nltk.corpus import brown

print(brown.categories())

genres = ['fiction', 'humor', 'romance']
whwords = ['what', 'which', 'how', 'why', 'when', 'where', 'who']

for i in range(0, len(genres)):
    genre = genres[i]
    print()
    print("Analysing '" + genre + "' wh words")
    genre_text = brown.words(categories=genre)
    fdist = nltk.FreqDist(genre_text)
    for wh in whwords:
        print(wh + ':', fdist[wh], end=' ')
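# An alternative sketch for the same comparison using a ConditionalFreqDist,
# which tabulates the wh-word counts for all three genres in a single table.
cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in genres
    for word in brown.words(categories=genre))
cfd.tabulate(conditions=genres, samples=whwords)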
    encoding='utf-8')
#movies.info()

sample = movies.loc[:, [
    'Title', 'Movie_ID', 'Synopsis', 'Genre1', 'Genre2', 'Genre3'
]]
train = sample

# Merge the three genre columns into a single list-valued 'Genre' column
cols = ['Genre1', 'Genre2', 'Genre3']
train['Genre'] = list(train[cols].apply(
    lambda x: ','.join(x.dropna()).split(','), axis=1))
train.drop(['Genre1', 'Genre2', 'Genre3'], axis=1, inplace=True)
#len(train)
#train

# Frequency of every genre across the corpus
all_genres = sum(train.Genre, [])
#len(set(all_genres))
all_genres = nltk.FreqDist(all_genres)

# 5-Genres
all_genres_df = pd.DataFrame({
    'Genre': list(all_genres.keys()),
    'Count': list(all_genres.values())
})
all_genres_df.groupby(by='Genre').sum().sort_values('Count', ascending=False)

g = all_genres_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Genre")
ax.set(title='Summary of Genre Distribution', ylabel='Genres')
plt.show()

filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
# make the query results look like {"h": ["happy", "had"], "b": ["ball", "bat"]}
query_result = get_query_result(query_terms)
query_result_articleonly = []
for a_page in query_result['query']['pages']:
    query_result_articleonly.append(
        query_result['query']['pages'][a_page]['extract'])
results = filter_tags("".join(query_result_articleonly))

# POS-tag the text and keep only adjectives (JJ, JJR, JJS)
tagged_sent = pos_tag(results.split())
#print(tagged_sent)
adjs = [
    word for word, pos in tagged_sent
    if pos == 'JJ' or pos == 'JJR' or pos == "JJS"
]
#print(adjs)

# Six most frequent adjectives
freq_adjs = nltk.FreqDist(adjs)
freq_pair_list = freq_adjs.most_common(6)
freq_adj_list = []
for a_freq_adj in freq_pair_list:
    freq_adj_list.append(a_freq_adj[0])
#print(freq_adj_list)

# Group the frequent adjectives by their first letter
freq_adj_firs = []
for a_fir in freq_adj_list:
    freq_adj_firs.append(a_fir[0])
freq_adj_firs = set(freq_adj_firs)

word_dict = {}
for a_fir in freq_adj_firs:
    alist = []
    for a_word in freq_adj_list:
        if a_fir == a_word[0]:
            alist.append(a_word)
def frequence_specific_word(text):
    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    return all_words[text]
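# A small usage sketch, assuming NLTK's movie_reviews corpus has been downloaded:
print(frequence_specific_word('film'))      # count of the (lowercased) word 'film'
print(frequence_specific_word('terrible'))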
len(all_text)

# clean the text
my_clean_text = clean_text(all_text)
len(my_clean_text)

# get the tokens of the cleaned text
tokens = word_tokenize(my_clean_text, 'portuguese')
len(tokens)

# remove stopwords from the tokens
tokens_without_sw = remove_stopwords(tokens)
len(tokens_without_sw)

# calculate the frequency of the tokens
freq = nltk.FreqDist(tokens_without_sw)

# plot word x count
fig = plt.figure(figsize=[10, 5])
freq.plot(20, cumulative=False)  # plot the frequency
plt.title('Palavras mais frequentes', fontsize=14)
plt.xlabel('Palavra', fontsize=14)
plt.xticks(rotation=90)
plt.ylabel('Contagem', fontsize=14)
plt.tight_layout()

# save fig
path_save_fig = folder_save_fig + 'paravras_mais_frequentes_google.png'
plt.savefig(path_save_fig)

# list with the 100 more common words
def select_best_keywords(metadata_table): table_to_process = metadata_table[["pr_title", "pr_abstract"]] table_to_process["pr_title"] = table_to_process["pr_title"].apply( lambda x: remove_stop_words(x)) table_to_process["pr_abstract"] = table_to_process["pr_abstract"].apply( lambda x: remove_stop_words(x)) print("Text Data after removing of stop-words") display(table_to_process) words_corpus = get_words_corpus(table_to_process) print(len(words_corpus)) dist = nltk.FreqDist( words_corpus) # Creating a distribution of words' frequencies grams = dist.most_common(1000) # Obtaining the most frequent words bigrams = nltk.collocations.BigramAssocMeasures() trigrams = nltk.collocations.TrigramAssocMeasures() bigramFinder = nltk.collocations.BigramCollocationFinder.from_words( words_corpus) trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words( words_corpus) print("Showing first", 2000, "top-freqent words in the corpus") grams = pd.DataFrame(grams) grams.index = range(1, len(grams) + 1) grams.columns = ["Word", "Frequency"] display(grams) bi_filter = 7 print( "Showing bigrams in the corpus found by Pointwise Mutual Information method" ) print("Applying frequency filter: a bigramm occurs more than", bi_filter, "times") bigramFinder.apply_freq_filter(bi_filter) bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram', 'PMI']).sort_values(by='PMI', ascending=False) bigramPMITable["bigram"] = bigramPMITable["bigram"].apply( lambda x: ' '.join(x)) display(bigramPMITable) tri_filter = 5 print( "Showing trigrams in the corpus found by Pointwise Mutual Information method" ) print("Applying frequency filter: a trigramm occurs more than", tri_filter, "times") trigramFinder.apply_freq_filter(tri_filter) trigramPMITable = pd.DataFrame( list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram', 'PMI']).sort_values(by='PMI', ascending=False) trigramPMITable["trigram"] = trigramPMITable["trigram"].apply( lambda x: ' '.join(x)) display(trigramPMITable) gram_dict = grams.set_index('Word').T.to_dict('list') bigramPMIDict = bigramPMITable.set_index('bigram').T.to_dict('list') trigramPMIDict = trigramPMITable.set_index('trigram').T.to_dict('list') keyword_processor = KeywordProcessor() textrank_keyword_processor = KeywordProcessor() gram_dict.update(bigramPMIDict) bigramPMIDict.update(trigramPMIDict) # print(gram_dict) print( "Extracting keywords from texts using Pointwise Mutual Information method and TextRank" ) text_rank_key_words = dict() for i in range(0, len(table_to_process)): sentences = table_to_process.loc[i, "pr_abstract"] if sentences != None: keywords = get_keywords_by_textrank(sentences) if keywords != None: text_rank_key_words.update(keywords) print("Text", i, "- Done") for i in range(0, len(table_to_process)): sentences = table_to_process.loc[i, "pr_title"] if sentences != None: keywords = get_keywords_by_textrank(sentences) if keywords != None: text_rank_key_words.update(keywords) print("Text", i, "- Done") for keyword in gram_dict.keys(): parts = keyword.split() parts = "_".join(parts) keyword_processor.add_keyword(keyword, parts) for keyword in text_rank_key_words.keys(): parts = keyword.split() parts = "_".join(parts) textrank_keyword_processor.add_keyword(keyword, parts) print(len(keyword_processor.get_all_keywords())) print(len(textrank_keyword_processor.get_all_keywords())) print(len(text_rank_key_words)) table_to_process["pr_abstract"] = table_to_process["pr_abstract"].apply( lambda x: merge_two_keywords_methods(x, 
textrank_keyword_processor, keyword_processor)) table_to_process["pr_title"] = table_to_process["pr_title"].apply( lambda x: merge_two_keywords_methods(x, textrank_keyword_processor, keyword_processor)) for i in range(0, len(table_to_process)): metadata_table.loc[i, "pr_title"] = table_to_process.loc[i, "pr_title"] metadata_table.loc[i, "pr_abstract"] = table_to_process.loc[i, "pr_abstract"] print( "Comparison of Text Data after Keywords Extraction using Pointwise Mutual Information method and TextRank" ) display(metadata_table[["title", "pr_title", "abstract", "pr_abstract"]]) print("Extracting keywords from texts using TF/IDF") dataset = [] for i in range(0, len(table_to_process["pr_abstract"])): sentences = table_to_process.loc[i, "pr_abstract"] if sentences != None: sentences = " ".join(sentences) dataset.append(sentences) tfIdfVectorizer = TfidfVectorizer(use_idf=True) tfIdf = tfIdfVectorizer.fit_transform(dataset) index = 0 for i in range(0, len(metadata_table)): if table_to_process.loc[i, "pr_abstract"] == None: continue metadata_table.loc[i, "pr_abstract"] = retain_best_tf_idf_keywords( table_to_process.loc[i, "pr_abstract"], index, tfIdf, tfIdfVectorizer) index += 1 print("Extracting keywords from texts using TF/IDF") dataset = [] for i in range(0, len(table_to_process["pr_title"])): sentences = table_to_process.loc[i, "pr_title"] if sentences != None: sentences = " ".join(sentences) dataset.append(sentences) tfIdfVectorizer = TfidfVectorizer(use_idf=True) tfIdf = tfIdfVectorizer.fit_transform(dataset) index = 0 for i in range(0, len(metadata_table)): if table_to_process.loc[i, "pr_title"] == None: continue metadata_table.loc[i, "pr_title"] = retain_best_tf_idf_keywords( table_to_process.loc[i, "pr_title"], index, tfIdf, tfIdfVectorizer) index += 1 return metadata_table
def named_entity_recog(x):
    import nltk
    # ne_chunk expects POS-tagged tokens, so tag the word before chunking
    return nltk.ne_chunk(nltk.pos_tag([x]))


NER_word = filtered_data.map(named_entity_recog)
print(NER_word.collect())

# Stemming and Lemmatization
nltk.download('wordnet')


def lemma(x):
    import nltk
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(x)


lem_words = filtered_data.map(lemma)
print(lem_words.collect())

# Text Classification
# find the words which have the highest frequency and sort them in decreasing
# order of their frequency.
text_Classifi = filtered_data.flatMap(
    lambda x: nltk.FreqDist(x.split(",")).most_common()).map(
        lambda x: x).reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1],
                                                            ascending=False)
topcommon_data = text_Classifi.take(100)  # take the first 100 most common words
topcommon_data
def rmp_sentment_analysis(src):
    df = pd.read_csv(
        src,
        usecols=['professor_name', 'school_name', 'star_rating', 'comments'])

    # Comments for highly rated professors (4.0 - 5.0 stars)
    high_professor_comment = df[(df['star_rating'] >= 4.0) & (
        df['star_rating'] <= 5.0)]['comments'].sample(10000).dropna().tolist()
    text = ' '.join(high_professor_comment)
    tokens = [t.lower() for t in re.split(r'[^\w\s]|\s', text) if t != '']
    print(tokens)

    # Remove standard English stopwords plus a few custom ones
    sr = stopwords.words('english')
    add_stopword = ['him.', 'took', 'one', 'took', 'day']
    sr = sr + add_stopword
    clean_tokens = tokens[:]
    for token in tokens:
        if token in sr:
            clean_tokens.remove(token)

    freq = nltk.FreqDist(clean_tokens)
    for key, val in freq.items():
        print(str(key) + ':' + str(val))
    freq.plot(20, cumulative=False)

    # https://github.com/cjhutto/vaderSentiment#about-the-scoring
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()

    # Sentiment of comments on highly rated professors
    high_professor_sentiment = []
    pos_num = 0
    neg_num = 0
    for sentence in high_professor_comment:
        vs = analyzer.polarity_scores(sentence)
        # print("{:-<65} {}".format(sentence, str(vs)))
        # print(vs['compound'])
        high_professor_sentiment.append(vs['compound'])
        if vs['compound'] >= 0.05:
            print('positive')
            pos_num += 1
        else:
            print('negative')
            neg_num += 1
    print(pos_num, neg_num)
    ratio = pos_num / (pos_num + neg_num)
    print('ratio:', ratio)
    high_mean = np.mean(high_professor_sentiment)
    high_std = np.std(high_professor_sentiment)
    print('mean:', high_mean, 'std:', high_std)

    # Comments for low-rated professors (1.0 - 2.0 stars)
    low_professor_comment = df[(df['star_rating'] >= 1.0) & (
        df['star_rating'] <= 2.0)]['comments'].sample(10000).dropna().tolist()
    print('low-rated professors')
    low_professor_sentiment = []
    low_pos_num = 0
    low_neg_num = 0
    for sentence in low_professor_comment:
        vs = analyzer.polarity_scores(sentence)
        # print("{:-<65} {}".format(sentence, str(vs)))
        # print(vs['compound'])
        low_professor_sentiment.append(vs['compound'])
        # note: low_pos_num here counts clearly negative comments (compound <= -0.05)
        if vs['compound'] <= -0.05:
            low_pos_num += 1
        else:
            low_neg_num += 1
    print(low_pos_num, low_neg_num)
    low_ratio = low_pos_num / (low_pos_num + low_neg_num)
    print('ratio:', low_ratio)
    low_mean = np.mean(low_professor_sentiment)
    low_std = np.std(low_professor_sentiment)
    print('mean:', low_mean, 'std:', low_std)
def top10POS(tokensPOS):
    seqPOS = estraiSeqPOS(tokensPOS)
    freqDist = nltk.FreqDist(seqPOS)
    return freqDist.most_common(10)
len(satoshi_nakamoto_tokens) / len(sentence)))

# entity extraction
satoshi = nlp(satoshi_nakamoto_text)
entity_list = [
    'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LANGUAGE', 'LAW'
]
satoshi_list = []
for i in range(len(satoshi)):
    if satoshi[i].ent_type_ in entity_list:
        satoshi_list.append(satoshi[i].text)
print("entity extraction: {}".format(satoshi_list))

# noun chunks
noun_chunk = []
for chunk in satoshi.noun_chunks:
    noun_chunk.append(chunk.text)
print("noun_chunk: {}".format(noun_chunk))

# n-grams (trigrams)
import nltk
tokens = nltk.tokenize.word_tokenize(satoshi_nakamoto_text)
bgs = nltk.ngrams(tokens, 3)
fdist = nltk.FreqDist(bgs)
print(fdist.most_common(30))
def top20Tokens(tokens1):
    esclusi = [',', '.', ':', ';']
    # keep only tokens that are not punctuation
    nonPunteggiatura = [a[0] for (a, b) in tokens1 if not (a[1] in esclusi)]
    fdist = nltk.FreqDist(nonPunteggiatura)
    return fdist.most_common(20)
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import gutenberg

macbeth = gutenberg.words("shakespeare-macbeth.txt")
stopwords = set(nltk.corpus.stopwords.words())

fd = nltk.FreqDist([
    w for w in macbeth
    if w.lower() not in stopwords and len(w) > 3 and w.isalpha()
])

# FreqDist.keys() is in insertion order in NLTK 3, not frequency order,
# so use most_common() to get the 50 most frequent words.
d = [w for (w, count) in fd.most_common(50)]
print(d)
def main(): if not len(sys.argv[1:]): usage() # nomi dei file passati come argomenti file1 = sys.argv[1] file2 = sys.argv[2] print '-' * 80 print " Progetto di Linguistica Computazionale" print " Programma 2 " print '-' * 80 tokensText1, frasiMarkov1, frasi1 = openTextFile( file1) # tokens del testo, frasi per es. Markov tokensText2, frasiMarkov2, frasi2 = openTextFile(file2) tokensPOS1, namedEntityDict1 = analisiLing( frasi1) # processo di annotazione ed NE tokensPOS2, namedEntityDict2 = analisiLing(frasi2) top10POS1 = top10POS(tokensPOS1) #10 PoS più frequenti top10POS2 = top10POS(tokensPOS2) print '-' * 80 print " 10 PoS (Part-of-Speech) più frequenti" print '-' * 80 print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 for i in range(min(len(top10POS1), len(top10POS2))): # stampa il confronto pos1, freq1 = top10POS1[i] pos2, freq2 = top10POS2[i] print " {: <20}{: <12}| {: <20}{: <12}".format(pos1, freq1, pos2, freq2) print '-' * 80 print " 20 Token più frequenti" print '-' * 80 print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 topTokens1 = top20Tokens(getBigrams(tokensPOS1)) #20 Token più frequenti topTokens2 = top20Tokens(getBigrams(tokensPOS2)) for i in range(min(len(topTokens1), len(topTokens2))): # stampa il confronto tok1, freq1 = topTokens1[i] tok2, freq2 = topTokens2[i] print " {: <25}{: <7}| {: <25}{: <7}".format(tok1.encode('utf-8'), freq1, tok2.encode('utf-8'), freq2) print '-' * 80 print '-' * 80 print " 20 Bigrammi più frequenti" print '-' * 80 print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 bigrammi1 = getBigrams(tokensPOS1) # Bigrammi del testo annotati bigrammi2 = getBigrams(tokensPOS2) topBigrams1 = top20Bigrams(bigrammi1) #20 Bigrammi più frequenti topBigrams2 = top20Bigrams(bigrammi2) for i in range(min(len(topBigrams1), len(topBigrams2))): # stampa il confronto big1, freq1 = topBigrams1[i] big2, freq2 = topBigrams2[i] print " {: <25}{: <7}| {: <25}{: <7}".format( big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), freq1, big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), freq2) print '-' * 80 print " 20 Trigrammi più frequenti" print '-' * 80 print ' Testo:', file1, ' ' * 18, '| Testo:', file2 print '-' * 80 topTrigrams1 = top20Trigrams( getTrigrams(tokensPOS1)) #20 Trigrammi più frequenti topTrigrams2 = top20Trigrams(getTrigrams(tokensPOS2)) for i in range(min(len(topTrigrams1), len(topTrigrams2))): # stampa il confronto tri1, freq1 = topTrigrams1[i] tri2, freq2 = topTrigrams2[i] print " {: <35}{: <3}| {: <35}{: <3}".format( tri1[0].encode('utf-8') + " " + tri1[1].encode('utf-8') + " " + tri1[2].encode('utf-8'), freq1, tri2[0].encode('utf-8') + " " + tri2[1].encode('utf-8') + " " + tri2[2].encode('utf-8'), freq2) print '-' * 80 print '-' * 80 print " 20 Bigrammi - Aggettivo e Sostantivo" print '-' * 80 setBigramsAggSost1 = bigrammiAggSos( bigrammi1) # Set di Bigrammi: aggettivo , sostantivo setBigramsAggSost2 = bigrammiAggSos(bigrammi2) vocabFreq1 = dictVocabFreq( set(tokensText1), tokensText1) # dizionario con tokens e frequenze vocabFreq2 = dictVocabFreq(set(tokensText2), tokensText2) bigrams1 = getBigrams(tokensText1) # Bigrammi del testo non annotati bigrams2 = getBigrams(tokensText2) dictBigrammi1 = infoBigrams( bigrams1, setBigramsAggSost1, vocabFreq1 ) # dizionario con info bigrammi_AggSost : (F(u,v), F(u), F(v), P(v|u), P(u,v)) dictBigrammi2 = infoBigrams(bigrams2, setBigramsAggSost2, vocabFreq2) forzaAssoc1 = getLocalMutualInformation( dictBigrammi1, vocabFreq1) # Local Mutual Information 
forzaAssoc2 = getLocalMutualInformation(dictBigrammi2, vocabFreq2) print " Con probabilità congiunta massima P(u,v):" print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 probCongiunta1 = ordinaProbCongiunta(dictBigrammi1)[:20] probCongiunta2 = ordinaProbCongiunta(dictBigrammi2)[:20] for i in range(min(len(probCongiunta1), len(probCongiunta2))): # stampa il confronto big1, p1 = probCongiunta1[i] big2, p2 = probCongiunta2[i] p1 = Decimal(str(p1[4])).quantize(Decimal('.00001'), rounding=ROUND_DOWN) p2 = Decimal(str(p2[4])).quantize(Decimal('.00001'), rounding=ROUND_DOWN) print " {: <25}{: <7}| {: <25}{: <7}".format( big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), p1, big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), p2) print '-' * 80 print " Con probabilità condizionata massima P(v|u):" print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 probCondizionata1 = ordinaProbCondizionata(dictBigrammi1)[:20] probCondizionata2 = ordinaProbCondizionata(dictBigrammi2)[:20] for i in range(min(len(probCondizionata1), len(probCondizionata2))): # stampa il confronto big1, p1 = probCondizionata1[i] big2, p2 = probCondizionata2[i] p1 = Decimal(str(p1[3])).quantize(Decimal('.0001'), rounding=ROUND_DOWN) p2 = Decimal(str(p2[3])).quantize(Decimal('.0001'), rounding=ROUND_DOWN) print " {: <25}{: <7}| {: <25}{: <7}".format( big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), p1, big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), p2) print '-' * 80 print " Con forza associativa massima (LMI):" print ' Testo:', file1, ' ' * 12, '| Testo:', file2 print '-' * 80 topLMI1 = ordinaDict(forzaAssoc1)[:20] topLMI2 = ordinaDict(forzaAssoc2)[:20] for i in range(min(len(topLMI1), len(topLMI2))): # stampa il confronto big1, lmi1 = topLMI1[i] big2, lmi2 = topLMI2[i] lmi1 = Decimal(str(lmi1)).quantize(Decimal('.001'), rounding=ROUND_DOWN) lmi2 = Decimal(str(lmi2)).quantize(Decimal('.001'), rounding=ROUND_DOWN) print " {: <25}{: <7}| {: <25}{: <7}".format( big1[0].encode('utf-8') + " " + big1[1].encode('utf-8'), lmi1, big2[0].encode('utf-8') + " " + big2[1].encode('utf-8'), lmi2) print '-' * 80 print '-' * 80 print " Le due frasi con probabilità più alta" print '-' * 80 distrFreq1 = nltk.FreqDist(tokensText1) topFrase1, probFraseMax1 = maxProbMarkov0( len(tokensText1), distrFreq1, frasiMarkov1) # frase Markov 0 con probabilità più alta print " 1° Frase calcolata attraverso un modello di Markov di ordine 0:" print '-' * 80 print ' Testo:', file1 print ' "', " ".join(topFrase1).encode('utf-8'), '"' print " Probabilità:", probFraseMax1 print distrFreq2 = nltk.FreqDist(tokensText2) topFrase2, probFraseMax2 = maxProbMarkov0( len(tokensText2), distrFreq2, frasiMarkov2) # frase Markov 0 con probabilità più alta print ' Testo:', file2 print ' "', " ".join(topFrase2).encode('utf-8'), '"' print " Probabilità:", probFraseMax2 print '-' * 80 infoBigrammi1 = infoBigrams( bigrams1, set(bigrams1), vocabFreq1 ) # dizionario con info bigrammi : (F(u,v), F(u), F(v), P(v|u), P(u,v)) infoBigrammi2 = infoBigrams(bigrams2, set(bigrams2), vocabFreq2) topFrase1, probFraseMax1 = maxProbMarkov1( len(tokensText2), frasiMarkov1, infoBigrammi1) # frase Markov 1 con probabilità più alta print " 2° Frase calcolata attraverso un modello di Markov di ordine 1:" print '-' * 80 print ' Testo:', file1 print ' "', " ".join(topFrase1).encode('utf-8'), '"' print " Probabilità:", probFraseMax1 print distrFreq2 = nltk.FreqDist(tokensText2) topFrase2, probFraseMax2 = maxProbMarkov1( len(tokensText2), frasiMarkov2, 
infoBigrammi2) # frase Markov 1 con probabilità più alta print ' Testo:', file2 print ' "', " ".join(topFrase2).encode('utf-8'), '"' print " Probabilità:", probFraseMax2 print '-' * 80 print '-' * 80 print " 20 nomi propri di persona più frequenti" print ' Testo:', file1, ' ' * 18, '| Testo:', file2 print '-' * 80 topPerson1 = nltk.FreqDist(namedEntityDict1["PERSON"]).most_common( 20) #i 20 nomi propri di persona più frequenti topPerson2 = nltk.FreqDist(namedEntityDict2["PERSON"]).most_common(20) for i in range(min(len(topPerson1), len(topPerson2))): # stampa il confronto tok1, freq1 = topPerson1[i] tok2, freq2 = topPerson2[i] print " {: <35}{: <3}| {: <35}{: <3}".format(tok1.encode('utf-8'), freq1, tok2.encode('utf-8'), freq2) print '-' * 80 print " 20 nomi propri di luogo più frequenti" print ' Testo:', file1, ' ' * 18, '| Testo:', file2 print '-' * 80 topGpe1 = nltk.FreqDist(namedEntityDict1["GPE"]).most_common( 20) #i 20 nomi propri di luogo più frequenti topGpe2 = nltk.FreqDist(namedEntityDict2["GPE"]).most_common(20) for i in range(min(len(topGpe1), len(topGpe2))): # stampa il confronto tok1, freq1 = topGpe1[i] tok2, freq2 = topGpe2[i] print " {: <35}{: <3}| {: <35}{: <3}".format(tok1.encode('utf-8'), freq1, tok2.encode('utf-8'), freq2) print '-' * 80 sys.exit(2)
# clean and tokenize document string
raw = i.lower()
tokens = tokenizer.tokenize(raw)
#tokens = [word for word in tokens if not word in stopwords.words()]
words = [word for word in tokens if word.isalpha()]

# remove stop words from tokens
stopped_tokens = [
    i for i in words if (not i in en_stop and len(str(i)) > 2)
]
texts.append(stopped_tokens)
for word in stopped_tokens:
    cnt[word] += 1

# Create your bigrams
bgs = nltk.bigrams(stopped_tokens)

# compute frequency distribution for all the bigrams in the text
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    cnt2[k] += 1
fdist = nltk.FreqDist(nltk.trigrams(stopped_tokens))
for k, v in fdist.items():
    cnt3[k] += 1

# Combine the unigram, bigram and trigram counters and write them to CSV
freq = cnt + cnt2 + cnt3
complete_data = []
for value, count in freq.most_common():
    complete_data.append([value, count])
writer = pd.DataFrame(complete_data, columns=['Keywords', 'Frequency'])
writer.to_csv("FrequencyUniBiGram_Care_Products.csv", index=None, header=True)
def executar(experimento, nome_Base, acento): nomeBase = nome_Base path = experimento + nomeBase # print('executando:\n'+path) # print('Sem acento:\n'+('Sim' if(acento) else 'Não')) # nomeBase = 'sce/balanced/colecao_dourada_3_class_balanced.csv' # path = "experimento2/"+nomeBase base = readBase(nomeBase) tamBase = len(base) i = 0 documents = [] #print base[0][0].split() tknzr = nltk.tokenize.TweetTokenizer() while (i < tamBase): if (acento): w = remocaoacento(tknzr.tokenize(base[i][0])) else: w = tknzr.tokenize(base[i][0]) w = remocaopontos(w) conteudoLista = (w, base[i][1]) documents.append(conteudoLista) i += 1 stemmer = nltk.stem.RSLPStemmer() # h=0 # j=len(documents) # while (h<j): # g=len(documents[h][0]) # f=0 # while(f<g): # stemmer.stem(documents[h][0][f]) # f+=1 # h += 1 random.shuffle(documents) all_words = [] k = 0 l = len(documents) while (k < l): m = len(documents[k][0]) n = 0 while (n < m): all_words.append(documents[k][0][n]) n += 1 k += 1 # print(str(all_words)) #all_words = nltk.FreqDist(all_words) #calcula frequencia de palavras, definir o limite de palavras #all_words = nltk.LaplaceProbDist(nltk.FreqDist(all_words)) #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words)) all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1) #nltk.WittenBellProbDist() procurar como mudar o ngram #all_words = nltk.MLEProbDist(nltk.FreqDist(all_words)) def wordbigram(word_feature): bigram = [] i = 0 l = len(word_feature) - 1 while (i < l): s = tuple([ stemmer.stem(word_feature[i]), stemmer.stem(word_feature[i + 1]) ]) bigram.append(s) i += 1 return bigram def removerpalavras(todas_palavras, document): #remover as palavras que não estãoem todas as palavras linha = [] for w in document: if (w in todas_palavras): linha.append(w) return linha def wordFeature(documents): #cria um dicionario de dados dicionario = [] for w in documents: for q in w[0]: if (not q in dicionario): dicionario.append(q) return dicionario documents = [[removerpalavras(all_words.samples(), w[0]), w[1]] for w in documents] documents = [[wordbigram(w[0]), w[1]] for w in documents] word_features = wordFeature( documents ) #se 0usando FreqDistlista com palavras que aparecem mais de 3000 # print(str(len(word_features))) # exit() # word_features = list(all_words.samples())#se 0usando FreqDistlista com palavras que aparecem mais de 3000 def find_features(document): # words = set(document) features = {} i = 0 l = len(word_features) while (i < l): features[str(i)] = (word_features[i] in document) i += 1 return features featuresets = [(find_features(rev), category) for (rev, category) in documents] # print(str(featuresets)) # for (w,category) in featuresets: # print(str(len(w))+","+category+"\n") kfold = 4 # baseInteira = featuresets tamT = len(featuresets) divisao = tamT // kfold ###### ajustar divisao baseDividida1 = featuresets[0:divisao] baseDividida2 = featuresets[divisao:(divisao * 2)] baseDividida3 = featuresets[(divisao * 2):(divisao * 3)] baseDividida4 = featuresets[(divisao * 3):tamT] #tamT = len(featuresets) #umQuarto = tamBase/4 #training_set = featuresets[umQuarto:] #testing_set = featuresets[:umQuarto] #training_set = featuresets[100:] #testing_set = featuresets[0:100] ########################## 1 rodada #print "## RODADA 1 ##" # print("treino") training_set = baseDividida2 + baseDividida3 + baseDividida4 testing_set = baseDividida1 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) 
testgold = [l for (fs, l) in testing_set] MNBmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB1 = 0 while (g < len(precisaoMNB1)): somaPMNB1 = somaPMNB1 + precisaoMNB1[g] g = g + 1 MNBpt1 = (somaPMNB1 / len(precisaoMNB1)) * 100 MNBrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB1 = 0 while (g < len(recallMNB1)): somaRMNB1 = somaRMNB1 + recallMNB1[g] g = g + 1 MNBrt1 = (somaRMNB1 / len(recallMNB1)) * 100 MNBfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB1 = 0 while (g < len(f1MNB1)): somaFMNB1 = somaFMNB1 + f1MNB1[g] g = g + 1 MNBft1 = (somaFMNB1 / len(f1MNB1)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR1 = 0 while (g < len(precisaoR1)): somaPR1 = somaPR1 + precisaoR1[g] g = g + 1 Rpt1 = (somaPR1 / len(precisaoR1)) * 100 Rrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR1 = 0 while (g < len(recallR1)): somaRR1 = somaRR1 + recallR1[g] g = g + 1 Rrt1 = (somaRR1 / len(recallR1)) * 100 Rfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR1 = 0 while (g < len(f1R1)): somaFR1 = somaFR1 + f1R1[g] g = g + 1 Rft1 = (somaFR1 / len(f1R1)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) La1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 
somaPL1 = 0 while (g < len(precisaoL1)): somaPL1 = somaPL1 + precisaoL1[g] g = g + 1 Lpt1 = (somaPL1 / len(precisaoL1)) * 100 Lrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL1 = 0 while (g < len(recallL1)): somaRL1 = somaRL1 + recallL1[g] g = g + 1 Lrt1 = (somaRL1 / len(recallL1)) * 100 Lfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL1 = 0 while (g < len(f1L1)): somaFL1 = somaFL1 + f1L1[g] g = g + 1 Lft1 = (somaFL1 / len(f1L1)) * 100 ######################## Rodada 2 #print "## RODADA 2 ##" training_set = baseDividida1 + baseDividida3 + baseDividida4 testing_set = baseDividida2 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB2 = 0 while (g < len(precisaoMNB2)): somaPMNB2 = somaPMNB2 + precisaoMNB2[g] g = g + 1 MNBpt2 = (somaPMNB2 / len(precisaoMNB2)) * 100 MNBrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB2 = 0 while (g < len(recallMNB2)): somaRMNB2 = somaRMNB2 + recallMNB2[g] g = g + 1 MNBrt2 = (somaRMNB2 / len(recallMNB2)) * 100 MNBfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB2 = 0 while (g < len(f1MNB2)): somaFMNB2 = somaFMNB2 + f1MNB2[g] g = g + 1 MNBft2 = (somaFMNB2 / len(f1MNB2)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR2 = 0 while (g < len(precisaoR2)): somaPR2 = somaPR2 + precisaoR2[g] g = g + 1 Rpt2 = (somaPR2 / len(precisaoR2)) * 100 Rrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR2 = 0 while (g < len(recallR2)): somaRR2 = somaRR2 + recallR2[g] g = g + 1 Rrt2 = (somaRR2 / len(recallR2)) * 100 Rfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR2 = 0 while (g < len(f1R2)): somaFR2 = somaFR2 + f1R2[g] g = g + 1 Rft2 = (somaFR2 / len(f1R2)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 
SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) La2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL2 = 0 while (g < len(precisaoL2)): somaPL2 = somaPL2 + precisaoL2[g] g = g + 1 Lpt2 = (somaPL2 / len(precisaoL2)) * 100 Lrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL2 = 0 while (g < len(recallL2)): somaRL2 = somaRL2 + recallL2[g] g = g + 1 Lrt2 = (somaRL2 / len(recallL2)) * 100 Lfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL2 = 0 while (g < len(f1L2)): somaFL2 = somaFL2 + f1L2[g] g = g + 1 Lft2 = (somaFL2 / len(f1L2)) * 100 ##################### rodada 3 #print "## RODADA 3 ##" training_set = baseDividida1 + baseDividida2 + baseDividida4 testing_set = baseDividida3 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc3 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB3 = 0 while (g < len(precisaoMNB3)): somaPMNB3 = somaPMNB3 + precisaoMNB3[g] g = g + 1 MNBpt3 = (somaPMNB3 / len(precisaoMNB3)) * 100 MNBrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB3 = 0 while (g < len(recallMNB3)): somaRMNB3 = somaRMNB3 + recallMNB3[g] g = g + 1 MNBrt3 = (somaRMNB3 / len(recallMNB3)) * 100 MNBfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB3 = 0 while (g < len(f1MNB3)): somaFMNB3 = somaFMNB3 + f1MNB3[g] g = g + 1 MNBft3 = (somaFMNB3 / len(f1MNB3)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc3 = sklearn.metrics.confusion_matrix(testgold, 
testclas) Ra3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR3 = 0 while (g < len(precisaoR3)): somaPR3 = somaPR3 + precisaoR3[g] g = g + 1 Rpt3 = (somaPR3 / len(precisaoR3)) * 100 Rrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR3 = 0 while (g < len(recallR3)): somaRR3 = somaRR3 + recallR3[g] g = g + 1 Rrt3 = (somaRR3 / len(recallR3)) * 100 Rfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR3 = 0 while (g < len(f1R3)): somaFR3 = somaFR3 + f1R3[g] g = g + 1 Rft3 = (somaFR3 / len(f1R3)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc3 = sklearn.metrics.confusion_matrix(testgold, testclas) La3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL3 = 0 while (g < len(precisaoL3)): somaPL3 = somaPL3 + precisaoL3[g] g = g + 1 Lpt3 = (somaPL3 / len(precisaoL3)) * 100 Lrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL3 = 0 while (g < len(recallL3)): somaRL3 = somaRL3 + recallL3[g] g = g + 1 Lrt3 = (somaRL2 / len(recallL2)) * 100 Lfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL3 = 0 while (g < len(f1L3)): somaFL3 = somaFL3 + f1L3[g] g = g + 1 Lft3 = (somaFL3 / len(f1L3)) * 100 ############################ rodada 4 #print "## RODADA 4 ##" training_set = baseDividida1 + baseDividida2 + baseDividida3 testing_set = baseDividida4 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB4 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB4 = 0 while (g < len(precisaoMNB4)): somaPMNB4 = somaPMNB4 + precisaoMNB4[g] g = g + 1 MNBpt4 = (somaPMNB4 / len(precisaoMNB4)) * 100 MNBrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB4 = 0 while (g < 
len(recallMNB4)): somaRMNB4 = somaRMNB4 + recallMNB4[g] g = g + 1 MNBrt4 = (somaRMNB4 / len(recallMNB4)) * 100 MNBfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB4 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB4 = 0 while (g < len(f1MNB4)): somaFMNB4 = somaFMNB4 + f1MNB4[g] g = g + 1 MNBft4 = (somaFMNB4 / len(f1MNB4)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR4 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR4 = 0 while (g < len(precisaoR4)): somaPR4 = somaPR4 + precisaoR4[g] g = g + 1 Rpt4 = (somaPR4 / len(precisaoR4)) * 100 Rrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR4 = 0 while (g < len(recallR4)): somaRR4 = somaRR4 + recallR4[g] g = g + 1 Rrt4 = (somaRR4 / len(recallR4)) * 100 Rfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R4 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR4 = 0 while (g < len(f1R4)): somaFR4 = somaFR4 + f1R4[g] g = g + 1 Rft4 = (somaFR4 / len(f1R4)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) La4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL4 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL4 = 0 while (g < len(precisaoL4)): somaPL4 = somaPL4 + precisaoL4[g] g = g + 1 Lpt4 = (somaPL4 / len(precisaoL4)) * 100 Lrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL4 = 0 while (g < len(recallL4)): somaRL4 = somaRL4 + recallL4[g] g = g + 1 Lrt4 = (somaRL4 / len(recallL4)) * 100 Lfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L4 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL4 = 0 while (g < len(f1L4)): somaFL4 = somaFL4 + f1L4[g] g = g + 1 Lft4 = (somaFL4 / len(f1L4)) * 100 
################# Averages
# print("## AVERAGE ##")

# Multinomial Naive Bayes
MNBmc = (MNBmc1 + MNBmc2 + MNBmc3 + MNBmc4) / 4
MNBa = (MNBa1 + MNBa2 + MNBa3 + MNBa4) / 4
MNBamax = max([MNBa1, MNBa2, MNBa3, MNBa4])
MNBamin = min([MNBa1, MNBa2, MNBa3, MNBa4])
MNBpp = (MNBpp1 + MNBpp2 + MNBpp3 + MNBpp4) / 4
MNBpt = (MNBpt1 + MNBpt2 + MNBpt3 + MNBpt4) / 4
MNBpmax = max([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
MNBpmin = min([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
MNBrp = (MNBrp1 + MNBrp2 + MNBrp3 + MNBrp4) / 4
MNBrt = (MNBrt1 + MNBrt2 + MNBrt3 + MNBrt4) / 4
MNBrmax = max([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
MNBrmin = min([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
MNBfp = (MNBfp1 + MNBfp2 + MNBfp3 + MNBfp4) / 4
MNBft = (MNBft1 + MNBft2 + MNBft3 + MNBft4) / 4
MNBfmax = max([MNBft1, MNBft2, MNBft3, MNBft4])
MNBfmin = min([MNBft1, MNBft2, MNBft3, MNBft4])

'''
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_aspect('equal')
plt.imshow(MNBmc, interpolation='nearest', cmap=plt.cm.ocean)
plt.colorbar()
plt.show()
'''

# Logistic Regression
Rmc = (Rmc1 + Rmc2 + Rmc3 + Rmc4) / 4
Ra = (Ra1 + Ra2 + Ra3 + Ra4) / 4
Ramax = max([Ra1, Ra2, Ra3, Ra4])
Ramin = min([Ra1, Ra2, Ra3, Ra4])
Rpp = (Rpp1 + Rpp2 + Rpp3 + Rpp4) / 4
Rpt = (Rpt1 + Rpt2 + Rpt3 + Rpt4) / 4
Rpmax = max([Rpt1, Rpt2, Rpt3, Rpt4])
Rpmin = min([Rpt1, Rpt2, Rpt3, Rpt4])
Rrp = (Rrp1 + Rrp2 + Rrp3 + Rrp4) / 4
Rrt = (Rrt1 + Rrt2 + Rrt3 + Rrt4) / 4
Rrmax = max([Rrt1, Rrt2, Rrt3, Rrt4])
Rrmin = min([Rrt1, Rrt2, Rrt3, Rrt4])
Rfp = (Rfp1 + Rfp2 + Rfp3 + Rfp4) / 4
Rft = (Rft1 + Rft2 + Rft3 + Rft4) / 4
Rfmax = max([Rft1, Rft2, Rft3, Rft4])
Rfmin = min([Rft1, Rft2, Rft3, Rft4])

# Linear SVC
Lmc = (Lmc1 + Lmc2 + Lmc3 + Lmc4) / 4
La = (La1 + La2 + La3 + La4) / 4
Lamax = max([La1, La2, La3, La4])
Lamin = min([La1, La2, La3, La4])
Lpp = (Lpp1 + Lpp2 + Lpp3 + Lpp4) / 4
Lpt = (Lpt1 + Lpt2 + Lpt3 + Lpt4) / 4
Lpmax = max([Lpt1, Lpt2, Lpt3, Lpt4])
Lpmin = min([Lpt1, Lpt2, Lpt3, Lpt4])
Lrp = (Lrp1 + Lrp2 + Lrp3 + Lrp4) / 4
Lrt = (Lrt1 + Lrt2 + Lrt3 + Lrt4) / 4
Lrmax = max([Lrt1, Lrt2, Lrt3, Lrt4])
Lrmin = min([Lrt1, Lrt2, Lrt3, Lrt4])
Lfp = (Lfp1 + Lfp2 + Lfp3 + Lfp4) / 4
Lft = (Lft1 + Lft2 + Lft3 + Lft4) / 4
Lfmax = max([Lft1, Lft2, Lft3, Lft4])
Lfmin = min([Lft1, Lft2, Lft3, Lft4])

'''
print("Linear SVC")
print("Confusion matrix: ", Lmc)
print("Accuracy: ", La)
print("Per-class precision: ", Lpp)
print("Overall precision: ", Lpt)
print("Per-class recall: ", Lrp)
print("Overall recall: ", Lrt)
print("Per-class F-measure: ", Lfp)
print("Overall F-measure: ", Lft)
'''

print(experimento + ':' + str(MNBa) + '\t' + str(Ra) + '\t' + str(La))

with open(path, mode='w') as csv_file:
    # writer = csv.writer(csv_file)
    csv_file.writelines('Algoritmo' + ';' + 'Multinominal Naïve-Bayes' + '\n')
    csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' + 'Precisão parcial' + ';' + 'Precisão total' + ';' + 'revocação parcial' + ';' + 'revocação total' + ';' + 'f-medida parcial' + ';' + 'f-medida total' + '\n')
    csv_file.writelines('1;' + str(MNBa1) + ';' + str(MNBpp1) + ';' + str(MNBpt1) + ';' + str(MNBrp1) + ';' + str(MNBrt1) + ';' + str(MNBfp1) + ';' + str(MNBft1) + '\n')
    csv_file.writelines('2;' + str(MNBa2) + ';' + str(MNBpp2) + ';' + str(MNBpt2) + ';' + str(MNBrp2) + ';' + str(MNBrt2) + ';' + str(MNBfp2) + ';' + str(MNBft2) + '\n')
    csv_file.writelines('3;' + str(MNBa3) + ';' + str(MNBpp3) + ';' + str(MNBpt3) + ';' + str(MNBrp3) + ';' + str(MNBrt3) + ';' + str(MNBfp3) + ';' + str(MNBft3) + '\n')
    csv_file.writelines('4;' + str(MNBa4) + ';' + str(MNBpp4) + ';' + str(MNBpt4) + ';' + str(MNBrp4) + ';' + str(MNBrt4) + ';' + str(MNBfp4) + ';' + str(MNBft4) + '\n')
    csv_file.writelines('==================' + '\n')
    csv_file.writelines('Total' + '\n')
    csv_file.writelines('Média;' + str(MNBa) + ';' + str(MNBpp) + ';' + str(MNBpt) + ';' + str(MNBrp) + ';' + str(MNBrt) + ';' + str(MNBfp) + ';' + str(MNBft) + '\n')
    csv_file.writelines('Máximo;' + str(MNBamax) + ';' + str(MNBpmax) + ';' + str(MNBrmax) + ';' + str(MNBfmax) + '\n')
    csv_file.writelines('Mínimo;' + str(MNBamin) + ';' + str(MNBpmin) + ';' + str(MNBrmin) + ';' + str(MNBfmin) + '\n')
    csv_file.writelines('==================' + '\n')

    csv_file.writelines('Algoritmo' + ';' + 'Regressão Linear' + '\n')
    csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' + 'Precisão parcial' + ';' + 'Precisão total' + ';' + 'revocação parcial' + ';' + 'revocação total' + ';' + 'f-medida parcial' + ';' + 'f-medida total' + '\n')
    csv_file.writelines('1;' + str(Ra1) + ';' + str(Rpp1) + ';' + str(Rpt1) + ';' + str(Rrp1) + ';' + str(Rrt1) + ';' + str(Rfp1) + ';' + str(Rft1) + '\n')
    csv_file.writelines('2;' + str(Ra2) + ';' + str(Rpp2) + ';' + str(Rpt2) + ';' + str(Rrp2) + ';' + str(Rrt2) + ';' + str(Rfp2) + ';' + str(Rft2) + '\n')
    csv_file.writelines('3;' + str(Ra3) + ';' + str(Rpp3) + ';' + str(Rpt3) + ';' + str(Rrp3) + ';' + str(Rrt3) + ';' + str(Rfp3) + ';' + str(Rft3) + '\n')
    csv_file.writelines('4;' + str(Ra4) + ';' + str(Rpp4) + ';' + str(Rpt4) + ';' + str(Rrp4) + ';' + str(Rrt4) + ';' + str(Rfp4) + ';' + str(Rft4) + '\n')
    csv_file.writelines('==================' + '\n')
    csv_file.writelines('Total' + '\n')
    csv_file.writelines('Média;' + str(Ra) + ';' + str(Rpp) + ';' + str(Rpt) + ';' + str(Rrp) + ';' + str(Rrt) + ';' + str(Rfp) + ';' + str(Rft) + '\n')
    csv_file.writelines('Máximo;' + str(Ramax) + ';' + str(Rpmax) + ';' + str(Rrmax) + ';' + str(Rfmax) + '\n')
    csv_file.writelines('Mínimo;' + str(Ramin) + ';' + str(Rpmin) + ';' + str(Rrmin) + ';' + str(Rfmin) + '\n')
    csv_file.writelines('==================' + '\n')

    csv_file.writelines('Algoritmo' + ';' + 'SVC Linear' + '\n')
    csv_file.writelines('Iteração' + ';' + 'Acurácia' + ';' + 'Precisão parcial' + ';' + 'Precisão total' + ';' + 'revocação parcial' + ';' + 'revocação total' + ';' + 'f-medida parcial' + ';' + 'f-medida total' + '\n')
    csv_file.writelines('1;' + str(La1) + ';' + str(Lpp1) + ';' + str(Lpt1) + ';' + str(Lrp1) + ';' + str(Lrt1) + ';' + str(Lfp1) + ';' + str(Lft1) + '\n')
    csv_file.writelines('2;' + str(La2) + ';' + str(Lpp2) + ';' + str(Lpt2) + ';' + str(Lrp2) + ';' + str(Lrt2) + ';' + str(Lfp2) + ';' + str(Lft2) + '\n')
    csv_file.writelines('3;' + str(La3) + ';' + str(Lpp3) + ';' + str(Lpt3) + ';' + str(Lrp3) + ';' + str(Lrt3) + ';' + str(Lfp3) + ';' + str(Lft3) + '\n')
    csv_file.writelines('4;' + str(La4) + ';' + str(Lpp4) + ';' + str(Lpt4) + ';' + str(Lrp4) + ';' + str(Lrt4) + ';' + str(Lfp4) + ';' + str(Lft4) + '\n')
    csv_file.writelines('==================' + '\n')
    csv_file.writelines('Total' + '\n')
    csv_file.writelines('Média;' + str(La) + ';' + str(Lpp) + ';' + str(Lpt) + ';' + str(Lrp) + ';' + str(Lrt) + ';' + str(Lfp) + ';' + str(Lft) + '\n')
    csv_file.writelines('Máximo;' + str(Lamax) + ';' + str(Lpmax) + ';' + str(Lrmax) + ';' + str(Lfmax) + '\n')
    csv_file.writelines('Mínimo;' + str(Lamin) + ';' + str(Lpmin) + ';' + str(Lrmin) + ';' + str(Lfmin) + '\n')
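# A minimal sketch (with made-up labels) of how the per-round "total" figures
# above relate to scikit-learn's built-in macro averaging: averaging the
# per-class scores returned by average=None gives the same number as passing
# average='macro' directly. gold_demo and pred_demo are hypothetical examples,
# not part of the experiment above.
import sklearn.metrics

gold_demo = ['pos', 'neg', 'neg', 'pos', 'neu']   # hypothetical gold labels
pred_demo = ['pos', 'neg', 'pos', 'pos', 'neu']   # hypothetical predictions

per_class = sklearn.metrics.precision_score(gold_demo, pred_demo, average=None)
manual_macro = (sum(per_class) / len(per_class)) * 100
builtin_macro = sklearn.metrics.precision_score(gold_demo, pred_demo, average='macro') * 100
print(manual_macro, builtin_macro)  # the two values agree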
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)
print('len(train)', len(train))

TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

if False:
    TEXT.build_vocab(train, max_size=1000)
    len(TEXT.vocab)

# Unigram, bigram and trigram frequency distributions over the Brown corpus
nltk.download('brown')
brown_1gram = nltk.FreqDist(brown.words())
brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
brown_trigrams = nltk.trigrams(brown.words())
condition_pairs = (((w0, w1), w2) for w0, w1, w2 in brown_trigrams)
brown_3gram = nltk.ConditionalFreqDist(condition_pairs)


class NamedBpttIterator(BPTTIterator):
    def __iter__(self):
        text = self.dataset[0].text
        TEXT = self.dataset.fields['text']
        TEXT.eos_token = None
        text = text + ([TEXT.pad_token] * int(
            math.ceil(len(text) / self.batch_size) * self.batch_size - len(text)))
        data = TEXT.numericalize([text], device=self.device)
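# A small sketch of how the trigram ConditionalFreqDist built above can be
# queried: indexing with a two-word context returns a FreqDist that ranks the
# words observed to follow that context in the Brown corpus. The context pair
# ('in', 'the') is only an illustrative example.
context = ('in', 'the')
print(brown_3gram[context].most_common(5))  # five most frequent continuations
print(brown_3gram[context].max())           # single most likely next word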
def frequence_words():
    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    return all_words.most_common(15)
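# A minimal usage sketch for frequence_words(), assuming the movie_reviews
# corpus is available; note that the top 15 lower-cased tokens are dominated
# by punctuation and stopwords unless they are filtered out first.
import nltk
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')
for word, count in frequence_words():
    print(word, count)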
# Term-frequency features: counts of lemmatized nouns and verbs per conversation
tf_df = pd.DataFrame(columns=word_list, index=range(len(training_data)))
avg_words = 0
for i in range(len(training_data)):
    if i % 1000 == 0:
        print("iteration", i)
    tokenized_conv = tokenizer.tokenize(training_data.loc[i, 'conversation'])
    tokenized_conv = [word for word in tokenized_conv if word not in cachedStopWords]
    tagText = nltk.pos_tag(tokenized_conv, tagset='universal')
    words = []
    n_words = 0
    for word, pos in tagText:
        if pos == 'NOUN' or pos == 'VERB':
            words.append(word_lemma.lemmatize(word))
    freq = nltk.FreqDist(words)
    tf_entry = pd.DataFrame(columns=word_list)
    for word in word_list:
        if word in freq:
            tf_df.loc[i, word] = freq[word]
            n_words += 1
        else:
            tf_df.loc[i, word] = 0
    avg_words += n_words
avg_words = avg_words / len(training_data)
print(avg_words)

# after reading all conversations
tf_df.to_csv('feature_set_1.csv')
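# A rough alternative sketch for the same counting step, assuming word_list,
# tokenizer, cachedStopWords and word_lemma from above are available:
# scikit-learn's CountVectorizer with a fixed vocabulary and a custom analyzer
# produces an equivalent document-term count matrix without per-cell
# DataFrame writes. The noun_verb_lemmas helper is introduced here only for
# illustration.
from sklearn.feature_extraction.text import CountVectorizer

def noun_verb_lemmas(text):
    # Keep only lemmatized nouns and verbs, mirroring the loop above
    tokens = [w for w in tokenizer.tokenize(text) if w not in cachedStopWords]
    return [word_lemma.lemmatize(w)
            for w, pos in nltk.pos_tag(tokens, tagset='universal')
            if pos in ('NOUN', 'VERB')]

vectorizer = CountVectorizer(analyzer=noun_verb_lemmas, vocabulary=word_list)
tf_matrix = vectorizer.fit_transform(training_data['conversation'])  # sparse counts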
def main(argv=None):
    analyzer = SiteAnalyzer()
    # if argv is None:
    #     argv = sys.argv
    # urls = []
    # Hard-coded argument list: '-o <file>' selects the output workbook,
    # 'deep' also crawls sub-pages of every URL
    argv = [
        # 'deep',
        '-o', 'sheet.xlsx',
        'https://www.apptunix.com/',
        'https://trio.dev/',
        'https://youteam.io/',
        'https://www.daxx.com/',
        # 'https://distantjob.com/',
        # 'https://relevant.software/',
        # 'https://remotemore.com/',
        # 'https://www.peerbits.com/',
        # 'https://codersera.com/',
        # 'https://www.makeitinua.com',
        # 'https://soshace.com/',
        # 'https://intersog.com/',
        # 'https://x-team.com/',
        # 'https://www.quickmonday.com/',
        # 'https://hackernoon.com/',
        # 'https://www.classicinformatics.com',
        # 'https://www.remoteco.com/',
        # 'https://remoteplatz.com/',
        # 'https://stormotion.io/',
        # 'https://www.trustshoring.com/',
        'https://geektastic.com/'
    ]
    try:
        wb = load_workbook(filename=str(argv[argv.index('-o') + 1]))
        if '-o' in argv and wb:
            temp_list = []
            for link in argv:
                if analyzer.url_validator(link):
                    temp_list.append(link)
            search_list = []
            if 'deep' in argv:
                for link in temp_list:
                    try:
                        analyzer.parse_page_for_subpages(link, search_list)
                    except HTTPError:
                        pass
            else:
                search_list = temp_list
            results = {}
            for link in tqdm(search_list):
                if analyzer.url_validator(link) and ".pdf" not in link:
                    print(link)
                    try:
                        _ = analyzer.get_soup(link)
                    except InvalidURL:
                        continue
                    if _ is None:
                        continue
                    try:
                        _ = analyzer.tokenize_soup(_)
                        _ = analyzer.textify_data(_)
                        fd = nltk.FreqDist(_)
                    except TypeError:
                        continue
                    # Merge each page's ten most frequent tokens into the running totals
                    for key, value in dict(fd.most_common(10)).items():
                        if key not in results:
                            results.update({key: value})
                        else:
                            results[key] += value
            analyzer.add_to_datasheet(wb, results)
    except ValueError:
        print("To select output file, please, pass -o a.xlsx file as an argument")
        exit()
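# A minimal sketch of the aggregation step above using collections.Counter:
# Counter.update adds counts instead of replacing them, which is exactly what
# the manual if/else merge on the results dict does. page_top10 is a
# hypothetical stand-in for dict(fd.most_common(10)).
from collections import Counter

merged_counts = Counter()
for page_top10 in [{'python': 4, 'nltk': 2}, {'python': 1, 'corpus': 3}]:
    merged_counts.update(page_top10)  # adds counts for repeated keys
print(merged_counts.most_common())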