def bigrams(text, exclude=pronouns, freq=3, limit=10):
    """Return the top `limit` bigrams by PMI, skipping excluded words
    and bigrams that occur fewer than `freq` times."""
    bigram_measures = collocations.BigramAssocMeasures()
    finder = collocations.BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_word_filter(lambda w: w in exclude)
    finder.apply_freq_filter(freq)
    return finder.nbest(bigram_measures.pmi, limit)
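# A minimal usage sketch for the helper above (illustrative, not from the
# original source): it assumes NLTK's punkt tokenizer data is installed and
# that `pronouns`, the default exclude set, was defined before the function.
from nltk import collocations
from nltk.tokenize import word_tokenize

sample = "New York is huge. New York is loud. New York never sleeps. " * 3
top_pairs = bigrams(sample, exclude={"is", "the"}, freq=3, limit=5)
print(top_pairs)  # e.g. [('New', 'York'), ...]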
def bigrams(tweets_words, stop_words):
    """ Creates bigrams out of a dataset """
    bigrams_measures = collocations.BigramAssocMeasures()
    bigram_finder = collocations.BigramCollocationFinder.from_words(tweets_words)
    bigram_freq = bigram_finder.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
    filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x, stop_words))]
    freq_bi = filtered_bi.bigram.values
    return freq_bi
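# A hedged sketch of calling the function above; rightTypes is referenced but
# not shown in this excerpt, so a permissive stand-in is defined here purely
# for illustration:
import pandas as pd
from nltk import collocations

def rightTypes(ngram, stop_words):
    # stand-in filter: keep bigrams made of alphabetic, non-stopword tokens
    return all(w.isalpha() and w.lower() not in stop_words for w in ngram)

tokens = "good morning world good morning twitter the end".split()
print(bigrams(tokens, stop_words={"the", "a", "an"}))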
def get_collocation_pairs(article, data_save_file):
    nouns = get_nouns(article)
    measures = collocations.BigramAssocMeasures()
    # POS-tag the article with Mecab; the finder then works on (word, tag) pairs
    tagged_words = Mecab().pos(article)
    if data_save_file != False:
        np.save(data_save_file, tagged_words)
    finder = collocations.BigramCollocationFinder.from_words(tagged_words)
    # score every bigram by likelihood ratio, then extract noun-noun pairs
    score_words = finder.score_ngrams(measures.likelihood_ratio)
    word_pairs = find_NNpairs(score_words)
    return word_pairs
def retrieve(self, request, *args, **kwargs):
    jpype.attachThreadToJVM()
    board = Board.objects.get(pk=kwargs['pk'])
    measures = collocations.BigramAssocMeasures()
    tagged_words = Twitter().pos(board.content)
    finder = collocations.BigramCollocationFinder.from_words(tagged_words)
    result = finder.nbest(measures.pmi, 10)  # top 10 n-grams with highest PMI
    text_result = ""
    for tuples in result:
        # each result item is a pair of (word, tag) tuples
        text_result += ",".join(tuples[0])
        text_result += "|"
        text_result += ",".join(tuples[1])
    response = BoardAnalyze.objects.create(board_id=board, result=text_result)
    serializer = self.get_serializer(response)
    return Response(serializer.data)
def retrieve(self, request, *args, **kwargs):
    jpype.attachThreadToJVM()
    board = Board.objects.get(pk=kwargs['pk'])
    measures = collocations.BigramAssocMeasures()
    tagged_words = Twitter().pos(board.content)
    words = [w for w, t in tagged_words]
    ignored_words = [u'안녕']  # "hello"
    finder = collocations.BigramCollocationFinder.from_words(words)
    finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
    finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
    result = finder.nbest(measures.pmi, 10)
    if result:
        text_result = ",".join(result[0])
    else:
        text_result = ""
    response = BoardAnalyze.objects.create(board_id=board, result=text_result)
    serializer = self.get_serializer(response)
    return Response(serializer.data)
def extract_top_bigrams_collocations(collection, num=10, frequencyThreshold=3,
                                     windows_size=5, filter_word=None):
    """
    This method extracts, for each document collection, the top N bigram
    collocations. Bigram collocations are pairs of words that commonly
    co-occur.

    With windows_size <= 2, only bigrams formed by consecutive words are
    taken into account; the result is then a list of two-word expressions
    that frequently appear in the collection. With windows_size > 2, all
    pairs of words within a window of windows_size words are considered;
    the result is then a list of two related words that frequently co-occur
    and therefore commonly have a semantic relationship between them.
    """
    from nltk import collocations
    words = tokenize_collection(collection)
    bigram_measures = collocations.BigramAssocMeasures()
    if windows_size > 2:
        finder = collocations.BigramCollocationFinder.from_words(words, windows_size)
    else:
        finder = collocations.BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(frequencyThreshold)
    if filter_word:
        finder.apply_ngram_filter(lambda *w: filter_word not in w)
    return finder.nbest(bigram_measures.chi_sq, num)
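# A quick standalone illustration of the window-size distinction described in
# the docstring above (standalone because tokenize_collection is not shown in
# this excerpt; the sample tokens are illustrative):
from nltk import collocations

toks = "strong black coffee with strong hot coffee".split()
m = collocations.BigramAssocMeasures()
adjacent = collocations.BigramCollocationFinder.from_words(toks)      # consecutive pairs only
windowed = collocations.BigramCollocationFinder.from_words(toks, 3)   # pairs within a 3-word window
print(adjacent.nbest(m.chi_sq, 3))
print(windowed.nbest(m.chi_sq, 3))  # now also scores ('strong', 'coffee')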
def main():
    total = len(sys.argv)
    if total < 2:  # sys.argv[0] is the script name, so we need one more argument
        print "Usage: python gen_graph.py <input_file>"
        exit(0)
    twts = read_json(str(sys.argv[1]))
    print "reading and cleanup done!"
    collocation = collocations.BigramCollocationFinder.from_documents(twts)
    bigram_measures = collocations.BigramAssocMeasures()
    print "Creating Bi-grams Collocation"
    c_list = []
    for each in collocation.ngram_fd.viewitems():
        if each[1] > 1:
            c_list.append(each)
    c_list.sort(key=operator.itemgetter(1), reverse=True)
    print "Generating Graph"
    g = nx.Graph()
    for each in c_list[:50000]:
        g = add_node(g, each[0][0])
        g = add_node(g, each[0][1])
        g = add_edge(g, each[0][0], each[0][1], each[1])
    print len(g)
    nx.write_graphml(g, '../data/test_graph_words_pos.graphml')
    print "Done"
def phrase_list(filename):
    article = open(filename, 'r').readlines()
    try:
        data = article[2:]
        content = data[0].decode("utf-8")[:-1]
        for paragraph in data[1:]:
            content = content + " " + paragraph.decode("utf-8")[:-1]
    except:
        return []
    txt = content
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]
    words = [w for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)
             if w not in stop_words and not re.match(prog_num, w)]
    words = [w.strip(u'\u201c\u201d\u2018') for w in words]  # strip curly quotes
    words = [w.strip(',.') for w in words]
    #words = Lemmatizer([words])
    bigram_measures = collocations.BigramAssocMeasures()
    bigram_finder = collocations.BigramCollocationFinder.from_words(words)
    #bigram_finder.apply_freq_filter(BIGRAM_FILTER)
    threshold = min(int(0.05 * len(words)), 100)
    phrase = []
    for bigram in bigram_finder.score_ngrams(bigram_measures.raw_freq)[:threshold]:
        phrase.append(bigram[0])
    return phrase
    # (fragment: the enclosing function definition is missing from this excerpt)
    while tmp[tmpIndex] != ' ':
        if tmpIndex > 0:
            tmpIndex -= 1
        else:
            break
    while tmp[index] != ' ':
        if len(tmp) - 1 != index:
            index += 1
        else:
            break
    return " " + tmp[tmpIndex:tmp.find('-')] + " " + tmp[tmp.find('-') + 1:index]

mecab = Mecab()
bigram_measures = collocations.BigramAssocMeasures()

import time
import sys
from threading import Condition

_CONDITION = Condition()

@route('/classify')
def classify():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    specialLetter = "( ) [ ] { } % # & * @ § ※ ☆ ★ ○ ● ◎ ◇ ◆ □ ■ △ ▲ ▽ ▼ → ← ↑ ↓ ↔ 〓 ◁ ◀ ▷ ▶ ♤ ♠ ♡ ♥ ♧ ♣ ⊙ ◈ ▣ ◐ ◑ ▒ ▤ ▥ ▨ ▧ ▦ ▩ ♨ ☏ ☎ ☜ ☞ ¶ † ‡ ↕ ↗ ↙ ↖ ↘ ♭ ♩ ♪ ♬ ㉿ ㈜ № ㏇ ㏂ ㏘ ℡ ? ª º ☞ ☜ ▒ "
    specialLetter += "─ │ ┌ ┐ ┘ └ ├ ┬ ┤ ┴ │ ━ ┃ ┏ ┓ ┛ ┗ ┣ ┳ ┫ ┻ ╋ ┠ ┯ ┨ ┷ ┿ ┝ ┰ ┥ ┸ ╂ ┒ ┑ ┚ ┙ ┖ ┕ ┎ ┍ ┞ ┟ ┡ ┢ ┦ ┧ ┩ ┪ ┭ ┮ ┱ ┲ ┵ ┶ ┹ ┺ ┽ ┾ ╀ ╁ ╃ ╄ ╅ ╆ ╇ ╈ ╉ ╊ "
def n_gram_creator(tokens, top_n=20, n=2, freq_filter=None, window_size=None,
                   counts=False, show_freq=True, show_pmi=False, keep=None):
    # Helper function creating [2-4]grams with a variety of options
    import nltk.collocations as colloc
    from nltk import bigrams, trigrams

    ## Check if n-gram is supported
    if n in [2, 3, 4]:
        ## Allowing for non-contiguous ngram creation
        if isinstance(window_size, int):
            window = window_size
        else:
            window = n

        ## Bigram setup
        if n == 2:
            word = 'Bi'
            if counts:
                ngrams = bigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.BigramAssocMeasures()
                ngram_finder = colloc.BigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Trigram setup
        elif n == 3:
            word = 'Tri'
            if counts:
                ngrams = trigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.TrigramAssocMeasures()
                ngram_finder = colloc.TrigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Quadgram setup
        elif n == 4:
            word = 'Quad'
            ngram_measures = colloc.QuadgramAssocMeasures()
            ngram_finder = colloc.QuadgramCollocationFinder.from_words(
                tokens, window_size=window)

        ## Applying frequency filter to results if selected for
        if isinstance(freq_filter, int):
            ngram_finder.apply_freq_filter(freq_filter)

        ## Create ngram scores
        ngram_score = ngram_finder.score_ngrams(ngram_measures.raw_freq)
        ngram_pmi_score = ngram_finder.score_ngrams(ngram_measures.pmi)

        ## Optional display
        if show_freq:
            print(f'Top {top_n} {word}-grams by frequency')
            display(ngram_score[:top_n])

        ## Optional display
        if show_pmi:
            print(f'PMI score for {top_n} {word}-grams')
            display(ngram_pmi_score[:top_n])

        ## Optional return
        if keep == 'score':
            return ngram_score
        elif keep == 'pmi':
            return ngram_pmi_score

    ## Messaging for non-supported ngrams
    else:
        return f"{n}-grams are not supported. Try 2, 3, or 4."
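# A minimal usage sketch for n_gram_creator (illustrative; note that display()
# in the helper assumes a notebook, so the show_* flags are disabled here):
from nltk.tokenize import word_tokenize

tokens = word_tokenize("the quick brown fox jumps over the lazy dog . " * 5)
pmi_scores = n_gram_creator(tokens, top_n=5, n=2, freq_filter=2,
                            show_freq=False, show_pmi=False, keep='pmi')
print(pmi_scores[:5])  # [((w1, w2), pmi), ...] sorted by decreasing PMI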
import logging

import nltk
from nltk import collocations
from typing import List, Tuple

from freqdist.models import TextFile
from kwic.utils import _clean_texts, _get_texts
from core.models import Example

logger = logging.getLogger(__name__)

MEASURES_FINDERS_DICT = {
    'bigram': [
        collocations.BigramAssocMeasures(),
        collocations.BigramCollocationFinder,
    ],
    'trigram': [
        collocations.TrigramAssocMeasures(),
        collocations.TrigramCollocationFinder,
    ],
    'quadgram': [
        collocations.QuadgramAssocMeasures(),
        collocations.QuadgramCollocationFinder,
    ],
}


def get_collocates(ngram: str, assoc_measure: str, include_examples: bool,
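# The body of get_collocates is truncated in this excerpt. A hedged sketch of
# how the lookup table above can drive it (an assumption, not the module's
# actual logic):
measures, finder_cls = MEASURES_FINDERS_DICT['trigram']
finder = finder_cls.from_words("one two three one two three one two three".split())
print(finder.nbest(measures.likelihood_ratio, 2))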
def setModel():
    measures = collocations.BigramAssocMeasures()
    twitter = Twitter()
    path = "./soma_classifier.csv"
    train_df = pd.read_csv(path)
    # train_df = pd.read_pickle("soma_goods_train.df")
    # nltk.download("stopwords")
    # nltk.download("punkt")
    # nltk.download("maxent_treebank_pos_tagger")
    # nltk.download("all")

    # Build a list of the product names used for training.
    d_list = []
    # Build a list of category labels joined with a ';' separator.
    cate_list = []
    # iterrows(): iterate over (index, row) pairs.
    for each in train_df.iterrows():
        # join(): concatenate the three category fields with ';' as separator.
        cate = ";".join([each[1]['cate1'], each[1]['cate2'], each[1]['cate3']])
        d_list.append(each[1]['name'])
        cate_list.append(cate)
    # print(type(d_list[0]))
    print(d_list)
    # print(len(d_list))

    pattern = r'''(?x) ([A-Za-z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
    pat = r'[a-zA-Z]+'

    words = []
    words_kor = []
    # print('Collocations among tagged words:')
    for d in d_list:
        tokens = word_tokenize(d)
        # tokens = d.split()
        # print(twitter.nouns(d))
        words_kor.append(' '.join(twitter.nouns(d)))
        words.append(' '.join(tokens))

    words_en = []
    for word in words:
        temp = []
        tokens_en = regexp_tokenize(word, pat)
        print(tokens_en)
        temp.append(' '.join(tokens_en))
        for temp in tokens_en:
            # tagged_word = nltk.pos_tag(temp.split())
            tagged_word = nltk.pos_tag(word_tokenize(temp))
            print(tagged_word)
            nouns = [token for token, pos in tagged_word if pos.startswith('N')]
            words_en.append(nouns)

    # Collapse duplicate categories into unique values (like a GROUP BY),
    # then assign a serial numeric id to each category name, so that
    # cate_dict[category_name] = serial_id.
    cate_dict = dict(zip(list(set(cate_list)), range(len(set(cate_list)))))
    # print(cate_dict) -> {'디지털/가전;PC부품;CPU': 0, '패션의류;아동의류;한복': 1}
    # print(cate_dict['디지털/가전;PC부품;CPU']) -> 0

    # Collect the serial id corresponding to each row's textual category.
    y_list = []
    for each in train_df.iterrows():
        cate = ";".join([each[1]['cate1'], each[1]['cate2'], each[1]['cate3']])
        y_list.append(cate_dict[cate])
    # print(y_list)
    # print(len(y_list))

    # Build a word-frequency matrix from the words in each product name:
    # vectorizer = CountVectorizer()
    # x_list = vectorizer.fit_transform(words)
    # for x in x_list:
    #     for word in x.indices:
    #         doc = vectorizer.get_feature_names()[word]
    #         print(doc)

    # vectorizer100 = CountVectorizer(max_features=100)
    # x100_list = vectorizer100.fit_transform(d_list)
    # print(len(vectorizer100.get_feature_names()))
    # for x10 in x100_list:
    #     for word in x10.indices:
    #         temp = kkma.pos(vectorizer100.get_feature_names()[word])

    # svc_param = {'C': np.logspace(-2, 0, 20)}
    # gs_svc = GridSearchCV(LinearSVC(loss='l2'), svc_param, cv=5, n_jobs=4)
    # gs_svc.fit(x_list, y_list)
    # print(gs_svc.best_params_, gs_svc.best_score_)

    # clf = LinearSVC(C=gs_svc.best_params_['C'])
    # clf.fit(x_list, y_list)

    # joblib.dump(clf, 'classify.model', compress=3)
    # joblib.dump(cate_dict, 'cate_dict.dat', compress=3)
    # joblib.dump(vectorizer, 'vectorizer.dat', compress=3)
    return
def fit_transform(self, comments, y=None):
    designed, filtered_words_lower, filtered_words, comments_prep = \
        self._preprocess(comments)
    empty_analyzer = lambda x: x
    self.unigram_vect = TfidfVectorizer(analyzer=empty_analyzer, min_df=3)
    print("vectorizing")
    unigrams = self.unigram_vect.fit_transform(filtered_words_lower)

    # pos tag vectorizer
    #self.pos_vect = TfidfVectorizer(analyzer=empty_analyzer).fit(tags)

    # fancy vectorizer
    self.you_are_vect = TfidfVectorizer(
        token_pattern="(?i)you are(?: an?)?(?: the)?(?: as)? (\w+)")
    you_are = self.you_are_vect.fit_transform(comments_prep)

    # get the google bad word list
    #with open("google_badlist.txt") as f:

    self.bigram_measures = col.BigramAssocMeasures()
    self.trigram_measures = col.TrigramAssocMeasures()

    # extract bigram collocations including "you" (and your?)
    #col.BigramCollocationFinder.from_words(
    #    [w for c in filtered_words_lower for w in c], window_size=4)
    col_you_bi = col.BigramCollocationFinder.from_documents(filtered_words_lower)
    col_you_bi.apply_freq_filter(3)
    col_you_bi._apply_filter(lambda x, y: np.all([w != "you" for w in x]))
    # < 400 of these
    self.you_bigrams = col_you_bi.nbest(self.bigram_measures.chi_sq, 1000)
    self.col_you_bi = col_you_bi

    # make tfidfvectorizer that uses these bigrams
    self.bigram_vect_you = TfidfVectorizer(
        analyzer=make_collocation_analyzer(self.you_bigrams), min_df=3)
    you_bigrams = self.bigram_vect_you.fit_transform(filtered_words_lower)

    # extract trigram collocations
    col_you_tri = col.TrigramCollocationFinder.from_documents(filtered_words_lower)
    col_you_tri.apply_freq_filter(3)
    col_you_tri._apply_filter(lambda x, y: np.all([w != "you" for w in x]))
    # < 400 of these, too
    self.you_trigrams = col_you_tri.nbest(self.trigram_measures.chi_sq, 1000)
    self.col_you_tri = col_you_tri
    self.trigram_vect_you = TfidfVectorizer(
        analyzer=make_collocation_analyzer(self.you_trigrams, 3), min_df=3)
    you_trigrams = self.trigram_vect_you.fit_transform(filtered_words_lower)

    ## some handcrafted features!
    designed.extend(self._handcrafted(filtered_words, comments,
                                      filtered_words_lower))
    designed = np.array(designed).T
    self.scaler = MinMaxScaler()
    designed = self.scaler.fit_transform(designed)

    features = []
    features.append(unigrams)
    features.append(you_bigrams)
    features.append(you_trigrams)
    features.append(you_are)
    #features.append(pos_unigrams)
    features.append(sparse.csr_matrix(designed))
    features = sparse.hstack(features).tocsr()
    return features
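# make_collocation_analyzer is not shown in this excerpt. A plausible sketch,
# under the assumption that it turns a token list into the known collocations
# it contains, so TfidfVectorizer can count them like ordinary terms:
def make_collocation_analyzer(collocs, n=2):
    colloc_set = set(collocs)
    def analyzer(tokens):
        # slide an n-token window and keep only the selected collocations
        return [ng for ng in zip(*(tokens[i:] for i in range(n)))
                if ng in colloc_set]
    return analyzer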
if __name__ == '__main__':
    papers = load_papers()
    raw_spacy_papers, spacy_papers, clean_papers = clean_text(papers)
    sent_lengths, syllables, readability, asl, asw = get_Flesch_Kincaid(raw_spacy_papers)
    words, counts = get_top_grams([spacy_papers['Aryan'].text], n=1, top=3)
    make_zipf_plot(counts, words, title='Zipf plot of Aryan\'s essay')
    for stu in spacy_papers.keys():
        words, counts = get_top_grams([spacy_papers[stu].text], n=1, top=3)
        ti = 'Zipf plot of {}\'s essay'.format(stu)
        path = '/home/nate/Dropbox/regis/RCC200/zipf/'
        make_zipf_plot(counts, words, title=ti, savepath=path, save=True)

    bigram_measures = nc.BigramAssocMeasures()
    trigram_measures = nc.TrigramAssocMeasures()
    finder = nc.BigramCollocationFinder.from_documents(
        [[word.text for word in spacy_papers['Aryan']]])
    print('top 10 2-grams by PMI')
    top_bigrams = finder.nbest(bigram_measures.pmi, 10)
    # index the FreqDist to count a specific ngram (FreqDist.N() takes no argument)
    if finder.ngram_fd[top_bigrams[0]] > 1:
        print('counts top bigram by PMI appears:')
        print(finder.ngram_fd[top_bigrams[0]])

    # analyze my essay for comparison
    with open('/home/nate/Dropbox/regis/RCC200/essays/short_assignment_1_turing/raw_text.txt',
              'rb') as f:
        text = f.read().decode('utf-8')
def get_bigram_analyzer(self, n, words):
    LOGGER.info("Building Bigram Analyzer")
    bigram_measures = collocations.BigramAssocMeasures()
    finder = collocations.BigramCollocationFinder.from_words(words)
    # above_score yields every bigram whose likelihood-ratio score is at least n
    return BigramAnalyzer(finder.above_score(bigram_measures.likelihood_ratio, n))
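# A standalone sketch of the same above_score call outside the class, to show
# what it yields (threshold and sample words are illustrative):
from nltk import collocations

words = "strong tea with strong tea and strong tea".split()
measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(words)
# above_score returns ngrams, in decreasing order, scoring at or above the threshold
print(list(finder.above_score(measures.likelihood_ratio, 1.0)))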