def TkloadingTweetsAndUserInfoData(args, resultTextbox, window):
    '''
    Load the data and perform the preprocessing.
    '''
    # Check if the pre-processed datasets (training, validation, test) exist.
    # If the pre-processed data already exists, load it rather than performing the preprocessing again.
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):
        # Check if the pre-processed df (overall Pandas dataframe) exists.
        # If it doesn't, load the original data and perform the preprocessing.
        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):
            '''
            resultTextbox.insert("end", "String_Here") -> appends a string to the result box.
            window.update_idletasks()                  -> makes the window refresh the result box.
            '''
            # Add the loading information to the result box
            resultTextbox.insert(
                "end",
                ("Loading " +
                 str(os.path.join(args.dataset, "FullTweetsDataNoOrdered.html")) +
                 ' and ' +
                 str(os.path.join(args.dataset, "FullExtraInfoDataNoOrdered.csv")) +
                 " to do the Preprocessing\n"))
            window.update_idletasks()

            # Load the original data set, which consists of an html file (the tweets' text)
            # and a csv file (the other tweet and user information).
            # Load the tweets' text
            tweets_df = pd.read_html(
                os.path.join(args.dataset, "FullTweetsDataNoOrdered.html"))
            tweets_df = pd.DataFrame(list(tweets_df[0].iloc[1:][0]))
            tweets_df.columns = ['text']
            # Load the other information
            extraInfo_df = pd.read_csv(
                os.path.join(args.dataset, "FullExtraInfoDataNoOrdered.csv"))
            # Concatenate the two loaded dataframes
            df = pd.concat([tweets_df, extraInfo_df], axis=1)
            # Delete the loaded dataframes after concatenating, since only the combined df is needed now.
            del tweets_df
            del extraInfo_df
            # Show the dataset size in the result box
            resultTextbox.insert("end", ("Dataset size: " + str(len(df)) + "\n"))
            window.update_idletasks()

            def preprocessingInputTextData(colName):
                '''
                Preprocess a text column: remove URLs and digits, tokenize,
                drop stopwords and short tokens, and stem.
                '''
                inputText = df[colName]
                ps = nltk.stem.PorterStemmer()  # Init Porter Stemmer
                tknzr = TweetTokenizer()  # Init Tweet Tokenizer
                allText = [i for i in inputText]
                ## The detailed preprocessing steps are described in the report
                preprocessedText = [[
                    ps.stem(word) for word in tknzr.tokenize(
                        re.sub(
                            r'\d+', '',
                            re.sub(r"http\S+|www.\S+", matchingURL, sentence)).lower())
                    if word not in nltk.corpus.stopwords.words('english')
                    and len(word) >= 3
                ] for sentence in allText]
                df[colName] = preprocessedText

            def fillingNullValue(colName):
                '''
                Replace the NaN values in the given column.
                '''
                if args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MOST_COMMON:
                    ## replace the NaNs with the most common value
                    df[colName] = df[colName].astype('category')
                    df[colName].fillna(
                        df[colName].astype('category').describe()['top'],
                        inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MEAN:
                    ## replace the NaNs with the mean
                    df[colName] = df[colName].astype('float')
                    df[colName].fillna(df[colName].mean(), inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.CERTAIN_VALUE:
                    ## replace the NaNs with a given value
                    df[colName] = df[colName].astype('category')
                    df[colName] = df[colName].cat.add_categories(
                        [args.preprocessingStra[colName]['fillingNullValue']])
                    df[colName].fillna(
                        args.preprocessingStra[colName]['fillingNullValue'],
                        inplace=True)

            def TweetsWithUserInfoPreprocessing():
                '''
                Run the configured preprocessing steps for every feature.
                '''
                for colName in args.preprocessingStra.keys():
                    resultTextbox.insert(
                        "end", ("Preprocessing feature: " + str(colName) + "\n"))
                    window.update_idletasks()
                    for step in args.preprocessingStra[colName]['steps']:
                        if step is not None:
                            step(colName)

            ############### Preprocessing Strategy ###############
            args.preprocessingStra = defaultdict(dict)
            args.preprocessingStra['text']['steps'] = [preprocessingInputTextData]
            args.preprocessingStra["numberOfHashtags_c"]['steps'] = [None]
            args.preprocessingStra['favorite_count']['steps'] = [None]
            args.preprocessingStra['retweet_count']['steps'] = [None]
            args.preprocessingStra['possibly_sensitive'] = {
                'fillingNullMethod': filling_method.CERTAIN_VALUE,
                'fillingNullValue': 'UNKNOWN',
                'steps': [fillingNullValue],
            }
            args.preprocessingStra['followers_count']['steps'] = [None]
            args.preprocessingStra['friends_count']['steps'] = [None]
            args.preprocessingStra['default_profile']['steps'] = [None]
            args.preprocessingStra['default_profile_image']['steps'] = [None]
            args.preprocessingStra['favourites_count']['steps'] = [None]
            args.preprocessingStra['listed_count']['steps'] = [None]
            args.preprocessingStra['statuses_count']['steps'] = [None]
            args.preprocessingStra['verified']['steps'] = [None]
            resultTextbox.insert("end", ('Preprocessing Strategy Set\n'))
            window.update_idletasks()
            #############################################################

            resultTextbox.insert("end", ('Start Preprocessing...\n'))
            window.update_idletasks()
            TweetsWithUserInfoPreprocessing()  # Apply the in-place preprocessing

            # Get dummy variables
            df = pd.get_dummies(df,
                                drop_first=True,
                                columns=[
                                    'possibly_sensitive', 'default_profile',
                                    'default_profile_image', 'verified'
                                ])

            # Save the preprocessed df
            with open(
                    os.path.join(args.dataset, args.pickle_name_beforeMapToIdx),
                    "wb") as fp:
                pickle.dump(df, fp)
            resultTextbox.insert(
                "end",
                ("The Pickle Data beforeMapToIdx Dumped to: " + str(
                    os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)) +
                 "\n"))
            window.update_idletasks()
        else:
            # If the preprocessed df exists, load it.
            print("Loading Existing BeforeMapToIdx file for Tweets and User: ",
                  os.path.join(args.dataset, args.pickle_name_beforeMapToIdx))
            resultTextbox.insert("end", (
                "Loading Existing BeforeMapToIdx file for Tweets and User: " +
                str(os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)) +
                "\n"))
            window.update_idletasks()
            with open(
                    os.path.join(args.dataset, args.pickle_name_beforeMapToIdx),
                    "rb") as fp:
                df = pickle.load(fp)

        #################### After having the pre-processed df ####################
        resultTextbox.insert("end", ('Splitting Datasets...\n'))
        window.update_idletasks()
        ## Split the df into training, validation and test sets.
        if args.runningOnSmallDataset:
            # If the user wants to test the program on the small dataset, do a fake split
            # to obtain a smaller dataset: X_temp (deleted later) gets 98% of the data,
            # so the small dataset keeps 2%.
            # Fake split
            X_temp, X_train, Y_temp, Y_train = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=0.02,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            # Get the real training and test sets.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_train,
                Y_train,
                test_size=args.validation_portion,
                stratify=Y_train,
                random_state=args.random_seed)
            # Delete X_temp and Y_temp
            del X_temp
            del Y_temp
        else:
            # If not running on the small dataset, do the normal data split.
            X_train, X_test, Y_train, Y_test = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
        X_validation, X_test, Y_validation, Y_test = train_test_split(
            X_test,
            Y_test,
            test_size=args.test_portion,
            stratify=Y_test,
            random_state=args.random_seed)

        ## Show the datasets' sizes
        resultTextbox.insert(
            "end",
            ("Dataset Size: " +
             str(len(X_train) + len(X_validation) + len(X_test)) + "\n"))
        resultTextbox.insert("end",
                             ("TrainingSet Size: " + str(len(X_train)) + "\n"))
        resultTextbox.insert(
            "end", ("ValidationSet Size: " + str(len(X_validation)) + "\n"))
        resultTextbox.insert("end", ("TestSet Size: " + str(len(X_test)) + "\n"))
        window.update_idletasks()

        resultTextbox.insert("end", ('Creating Tweets_text...\n'))
        window.update_idletasks()
        ## Create an nltk.Text, which will be used as the vocabulary (dictionary).
        tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))
        # If the hyper-parameter vocab_size is set, keep only the most frequent terms
        # (filtering out the low-tf words); otherwise keep the full token list.
        args.vocab_size = args.vocab_size or len(tweets_text.tokens)
        if args.vocab_size:
            tweets_text.tokens = specialTokenList + \
                [w for w, _ in tweets_text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            tweets_text.tokens = specialTokenList + tweets_text.tokens
            args.vocab_size = len(tweets_text.tokens)  # update the vocab_size

        ## Map the terms to indices for every dataset
        resultTextbox.insert("end", ('Mapping Word To Idx: training set\n'))
        window.update_idletasks()
        X_train['text'] = mapFromWordToIdx(X_train['text'], tweets_text)
        resultTextbox.insert("end", ('Mapping Word To Idx: validation set\n'))
        window.update_idletasks()
        X_validation['text'] = mapFromWordToIdx(X_validation['text'], tweets_text)
        resultTextbox.insert("end", ('Mapping Word To Idx: test set\n'))
        window.update_idletasks()
        X_test['text'] = mapFromWordToIdx(X_test['text'], tweets_text)

        resultTextbox.insert("end", ('Creating Torch Training Datasets...\n'))
        window.update_idletasks()
        # args.X_train = X_train
        # args.Y_train = Y_train
        # Create the training, validation and test PyTorch datasets used to feed the neural network.
        # More details are in the CreateTweetsWithUserInfoDataset function in utils.py.
        training_dataset = CreateTweetsWithUserInfoDataset(
            X_train, list(map(int, list(Y_train))))
        resultTextbox.insert("end", ('Creating Torch Validation Datasets...\n'))
        window.update_idletasks()
        validation_dataset = CreateTweetsWithUserInfoDataset(
            X_validation, list(map(int, list(Y_validation))))
        resultTextbox.insert("end", ('Creating Torch Test Datasets...\n'))
        window.update_idletasks()
        test_dataset = CreateTweetsWithUserInfoDataset(
            X_test, list(map(int, list(Y_test))))

        resultTextbox.insert("end", ('Dumping data...\n'))
        window.update_idletasks()
        # Dump the pre-processed datasets
        with open(os.path.join(args.dataset, args.pickle_name), "wb") as fp:  # Pickling
            pickle.dump([
                training_dataset, validation_dataset, test_dataset, tweets_text
            ], fp)
        print("The Pickle Data Dumped to: ",
              os.path.join(args.dataset, args.pickle_name))
        resultTextbox.insert(
            "end",
            ("The Pickle Data Dumped to: " +
             str(os.path.join(args.dataset, args.pickle_name)) + "\n"))
        window.update_idletasks()
    else:
        # If the pre-processed datasets exist, load them.
        resultTextbox.insert(
            "end",
            ("Loading Existing File: " +
             str(os.path.join(args.dataset, args.pickle_name)) + '\n'))
        window.update_idletasks()
        with open(os.path.join(args.dataset, args.pickle_name), "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, tweets_text = pickle.load(fp)

    ## Some dataset hyper-parameters that will be used later
    args.vocab_size = len(tweets_text.tokens)
    args.num_extra_info = len(training_dataset[0][1])
    args.num_features = len(training_dataset[0][1]) + 1
    # Return the loaded or generated datasets.
    return training_dataset, validation_dataset, test_dataset, tweets_text
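mapFromWordToIdx, specialTokenList and CreateTweetsWithUserInfoDataset are project helpers defined elsewhere (utils.py) and are not shown here. As a rough illustration of the word-to-index step only, here is a minimal sketch under the assumption that the vocabulary is the tweets_text token list and that an '<unk>' entry exists among the special tokens; the real helper may differ.

# Hypothetical sketch of a word-to-index mapper in the spirit of mapFromWordToIdx above.
# The '<unk>'/'<pad>' tokens are assumptions, not confirmed by the original code.
import nltk

def map_from_word_to_idx(token_lists, vocab_text, unk_token='<unk>'):
    word2idx = {w: i for i, w in enumerate(vocab_text.tokens)}
    unk_idx = word2idx.get(unk_token, 0)
    return [[word2idx.get(w, unk_idx) for w in sent] for sent in token_lists]

# Example usage with a toy vocabulary
toy_vocab = nltk.Text(['<pad>', '<unk>', 'malicious', 'link', 'click'])
print(map_from_word_to_idx([['click', 'this', 'link']], toy_vocab))  # [[4, 1, 3]]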
# Split the text into words and pass to NLTK rawtext = "".join(progtext) tokens = rawtext.split() #for token in tokens: # if token.lower() in exclusions: # tokens.pop(tokens.index(token)) #tokens = nltk.word_tokenize(rawtext) #tokens = nltk.regexp_tokenize(rawtext,pattern) newtokenlist = list() for token in tokens: newtokenlist.append(spellingFixer(token)) nltktext = nltk.Text(newtokenlist) collocations = nltktext.collocations() word_fd = FreqDist(tokens) index = 0 print "\nPopular words:" for entry in word_fd: if re.match("\W", entry) != None or entry.lower() in exclusions: index -= 1 # Ignore this one, it's just symbols else: print entry index += 1 if index == 10: break print "\nPopular topics:" bigram_fd = FreqDist(nltk.bigrams(tokens))
stopwords = nltk.corpus.stopwords.words('english') texts = [[word for word in abstract.lower().split() if word not in stopwords] for abstract in absCl] phrases = Phrases(texts) bigram = Phraser(phrases) trigram = Phrases(bigram[texts]) trigram = Phraser(trigram) texts = [trigram[bigram[text]] for text in texts] y = [] i = 0 while i < len(texts): topics = TokenSearcher(nltk.Text(texts[i])).findall( r'<.*addict.*|opioid_use|.*dependence.*|.*abuse.*|.*abuse|.*alcoholi.*|.*inject_drugs|people_inject.*|drugs_people|.*sober.*|.*misuse.*|.*detox.*|.*heroin.*|hepatitis|.*illicit.*|.*overdose.*|drug_use|drug_use.*|substance_use|treatment_facility|recovering.*> <.*>' ) if topics: y.append([texts[i], topics[:], 'Addiction/Abuse']) elif not topics: topics = TokenSearcher(nltk.Text(texts[i])).findall( r'<anesthe.*|.*anesthe.*|.*anesthe|icu|.*perioper.*|.*arthroplasti.*|.*postop.*|.*inpatient.*|.*outpatient.*|sevoflurane|midazolam|.*epidural.*|ropivacaine|.*cancer.*|.*surgic.*|.*surger.*|.*cesarean.*|.*caesarean.*|.*lymphoma.*|.*laparoscop.*|dexmedetomidin|.*sedat.*|.*operat.*|.*endoscop.*|.*radiolo.*|.*paracetamol.*> <.*>' ) if topics: y.append([texts[i], topics[:], 'Medical Procedure']) elif not topics: topics = TokenSearcher(nltk.Text(texts[i])).findall( r'<.*pain.*|acetaminophen|.*analgesic.*|.*analgesi.*|ropivacain|.*antinocicept.*|.*nocicep.*|.*inflamm.*|.*epidural.*|.*formalin.*|.*fentanyl.*|oxycodone|remifentanil|.*hyperalgesia.*|nerve_block.*|gabapentin|kappa_opioid|pallative_care|.*paracetamol.*> <.*>' ) if topics:
reload(sys) sys.setdefaultencoding('utf-8') # 찾으려고 하는 상위 n개의 단어(명사) _toFind_ = 30 # 문서 읽기 doc_ko = open('./k_tex2.txt').read() # print(doc_ko) # 명사만 추출 token_ko = tw.nouns(doc_ko) # nltk 활용을 위한 res_ko = nltk.Text(token_ko, name=u'sutuk1') print(len(res_ko.tokens)) # returns number of tokens (document length) print(len(set(res_ko.tokens))) # returns number of unique tokens on_list = res_ko.vocab().most_common(_toFind_) # on_list 는 리스트이다, most_common 이 리스트를 반환하는데 리스트는 튜플로 이루어져있다, 튜플은 첫번째인자로(0번쨰) 유니코드 스트링형을 갖고 두번째인자로(1번째) 몇번 빈출되었는지 # print(list(on_list[1])[0]) 테스트코드 to_list = list() # 리스트안에 있는 튜플을 리스트로 바꾼 후 그것중 0번째 인자인 스트링형만 리스트에 다시 담는 작업을 하는 부분이다 for a in range(0, len(on_list)): to_list.append(list(on_list[a])[0])
def datacode(self): global image_File global stop_words filename = fname[0].split('/') f_stack = len(filename) taglist = filename[-1].split('_') searchtext = taglist[1] barValue = 0 myWindow.pBar.setValue(barValue) myWindow.pBar.setMaximum(100) with open(filename[f_stack - 1], 'rt', encoding='utf-8') as rfile: with open('Processing_' + filename[f_stack - 1], 'w', newline='') as wfile: cw = csv.writer(wfile) r = csv.reader(rfile) for row in r: result_News = '' check = 0 for c in row[3]: if ord('[') == ord(c): check = 1 elif ord(']') == ord(c): check = 0 continue if check == 0: if ord('가') <= ord(c) <= ord('힣') or c.isdigit() or ord('A') <= ord(c) <= ord('z') \ or ord(c) == ord(' '): result_News += c else: result_News += ' ' cw.writerow([result_News]) barValue = 10 myWindow.pBar.setValue(barValue) image_File = 'Processing_' + filename[f_stack - 1] + '.png' with open('Processing_' + filename[f_stack - 1], 'r') as f: text = f.read() okt = Okt() barValue = 20 myWindow.pBar.setValue(barValue) nouns = okt.nouns(text) barValue = 80 myWindow.pBar.setValue(barValue) stop_words = [ searchtext, '뉴스', '금지', '제공', '무단', '전재', '배포', '기자', '구독', '뉴시스', '연합뉴스', '사진', '저작권', '라며', '디스패치', '노컷뉴스', '네이버', '생방송투데이', '매일신문' ] nouns = [ each_word for each_word in nouns if each_word not in stop_words and len(str(each_word)) > 1 ] nns = nltk.Text(nouns, name='process_') wordChart = dict(nns.vocab().most_common(30)) wordInfo = dict(nns.vocab().most_common(400)) barValue = 85 myWindow.pBar.setValue(barValue) MyWindow.showChart(wordChart, image_File) barValue = 90 myWindow.pBar.setValue(barValue) MyWindow.saveWordCloud(wordInfo, image_File) barValue = 100 myWindow.pBar.setValue(barValue) myWindow.Search.setEnabled(True) myWindow.Process.setEnabled(True)
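MyWindow.showChart and MyWindow.saveWordCloud are class methods defined elsewhere in this program. Purely as an illustration of the last step, a hedged sketch of what a saveWordCloud helper could look like with the wordcloud package; the font path, figure size, and parameters are assumptions, not the project's actual settings.

# Hypothetical sketch of a saveWordCloud helper such as the one called above;
# the actual method lives elsewhere in MyWindow. Font path is illustrative only.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def saveWordCloud(wordInfo, image_file, font_path='C:/Windows/Fonts/malgun.ttf'):
    wc = WordCloud(font_path=font_path,
                   background_color='white',
                   width=800,
                   height=600).generate_from_frequencies(wordInfo)
    plt.figure(figsize=(8, 6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(image_file)
    plt.close()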
""" import requests import bs4 import names import nltk API_KEY = "7be7045a0a544e10916fcb867df3010d" query = "Lava Jato" url = ('https://newsapi.org/v2/everything?' 'q={}&' 'apiKey={}'.format(query, API_KEY)) response = requests.get(url) txt = response.json()["articles"] text = "" for i in range(len(txt)): url_art = txt[i]["url"] html = requests.get(url_art).text soup = bs4.BeautifulSoup(html, "lxml") ps = soup.find_all("p") for p in ps: text = text + "\n" + p.get_text() nomes = names.get_human_names(text) print(nomes) texto_anal = nltk.Text(text.split())
def main(argv, matches=2): fName = 'bbc/politics/' + str(argv) f = open(fName, 'r') raw_text = f.read() # Tokenize the tokenized_words of the text tokenized_words = nltk.word_tokenize(raw_text) # Making the tokenized_words to lower case for i in range(len(tokenized_words)): tokenized_words[i] = tokenized_words[i].lower() # POS tag the words tagged_words = nltk.pos_tag(tokenized_words) # Extracting the tags of the text tags = set([tag for (word, tag) in tagged_words]) word_tag_dict = {} tag_word_dict = {} for (word, tag) in tagged_words: if word in word_tag_dict.keys(): word_tag_dict[word.lower()].append(tag) else: word_tag_dict[word.lower()] = [tag] if tag in tag_word_dict.keys(): tag_word_dict[tag].append(word) else: tag_word_dict[tag] = [word] words = nltk.Text(tokenized_words) doc = nltk.ConcordanceIndex(words) stemmer = PorterStemmer() # # Call text Rank # sorted_text_rank = textRank(tokenized_words, tag_word_dict) # set1 = set([w.lower() for (w, val) in sorted_text_rank[:15]]) # removeList = [] # for w in set1: # if stemmer.stem(w) != w and stemmer.stem(w) in set1: # removeList.append(w) # for w in removeList: # set1.remove(w) # sorted_text_rank = [(w, val) for (w, val) in sorted_text_rank[:15] if w not in removeList] # offset_dict_text_rank = {} # for words1 in set1: # offset_dict_text_rank[words1] = doc.offsets(words1) # Call tf sorted_tfValues = tf(tokenized_words, word_tag_dict) set2 = set([w.lower() for (w, val) in sorted_tfValues[:15]]) removeList = [] for w in set2: if stemmer.stem(w) != w and stemmer.stem(w) in set2: removeList.append(w) for w in removeList: set2.remove(w) sorted_tfValues = [(w, val) for (w, val) in sorted_tfValues[:15] if w not in removeList] offset_dict_tf = {} for words2 in set2: offset_dict_tf[words2] = doc.offsets(words2) # # Call tf-idf # sorted_tf_idf = tfIdf (raw_text, word_tag_dict) # set3 = set([w for (w, val) in sorted_tf_idf[:15]]) # removeList = [] # for w in set3: # if stemmer.stem(w) != w and stemmer.stem(w) in set3: # removeList.append(w) # for w in removeList: # set3.remove(w) # sorted_tf_idf = [(w, val) for (w, val) in sorted_tf_idf[:15] if w not in removeList] # offset_dict_tf_idf = {} # for words3 in set3: # offset_dict_tf_idf[words3] = doc.offsets(words3) """ Printing the resuts""" # print (raw_text) # print ("\n\nText Rank of the document:") # printResult (sorted_text_rank, word_tag_dict, offset_dict_text_rank) # printTable (sorted_text_rank, offset_dict_text_rank) # printMatrix (offset_dict_text_rank) print("\n\nTf Scores of the document:\n") printResult(sorted_tfValues, word_tag_dict, offset_dict_tf) out_list, tid_word_dict = printTable(sorted_tfValues, offset_dict_tf) words_list = printMatrix(offset_dict_tf) print_top_sentence(raw_text, sorted_tfValues, matches, out_list, tid_word_dict, words_list) print_sentences(raw_text, sorted_tfValues, tid_word_dict, words_list)
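main() relies on tf() (and the commented-out textRank/tfIdf variants) defined elsewhere in the script. A hedged sketch of a tf() scorer compatible with how it is called above, returning (word, score) pairs sorted by descending score and restricted to candidate words in word_tag_dict; this is an illustration, not the original implementation.

# Hypothetical term-frequency scorer matching the call tf(tokenized_words, word_tag_dict).
from collections import Counter

def tf(tokenized_words, word_tag_dict):
    counts = Counter(w.lower() for w in tokenized_words)
    total = sum(counts.values())
    scores = {w: counts[w] / total for w in counts if w in word_tag_dict}
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)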
if (minimum[word] > count): minimum[word] = count counts.append(len(gutenberg.words(fileid))) output.append(counts) print tabulate(output, headers="firstrow") difference = [] for word in wordList: difference.append([word, maximum[word] - minimum[word]]) print tabulate(difference) # could and will #'will': 0.004993612820810591 shakespeare-caesar.txt #'will': 0.0003591094086665071 blake-poems.txt # someone's Will is mentioned t = nltk.Text(nltk.corpus.gutenberg.words('shakespeare-caesar.txt')) t.concordance("will") t = nltk.Text(nltk.corpus.gutenberg.words('blake-poems.txt')) t.concordance("will") #'could': 0.004522720559024559 austen-persuasion.txt #'could': 0.00016326062134024106 bible-kjv.txt # bible mostly only uses for 'could not' t = nltk.Text(nltk.corpus.gutenberg.words('austen-persuasion.txt')) t.concordance("could") t = nltk.Text(nltk.corpus.gutenberg.words('bible-kjv.txt')) t.concordance("could")
# 杏 URL = 'http://dic.nicovideo.jp/a/%E5%8F%8C%E8%91%89%E6%9D%8F' # 주소 user_agent = "anzu" overloadText = "" response = rq.get(URL, headers={'User-Agent': user_agent}) soup = BeautifulSoup(response.text, 'html.parser') # 전체 HTML에서 특정 부분만 찾기 위한 html parsing anzuArticle = soup.findAll('p') # 닉네임 전부 가져옴 for index in anzuArticle: overloadText = overloadText + index.text tokens_jp = mecab_analysis( overloadText) # konlpy Package의 t를 이용하여 단어 나눔, "形容詞", "動詞","名詞", "副詞"만 필요 jp = nltk.Text(tokens_jp, name='杏') # 중복 제거를 위해 token형식으로 만듬 data = jp.vocab().most_common(500) # 정렬된 list-tuple형식으로 변경(Count해줌. 최대 500)( tmp_data = dict(data) # dict형식으로 데이터 변경 stop_words = [ 'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ', 'さん', 'して', 'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', '思う', 'それ', 'ここ', 'ちゃん', 'くん', '', 'て', 'に', 'を', 'は', 'の', 'が', 'と', 'た', 'し', 'で', 'ない', 'も', 'な', 'い', 'か', 'ので', 'よう', '[', ']', '/' ] # Python 3.0부턴 유니코드 표현을 위해 앞에 u를 붙일 필요가 없음(이미 유니코드로 표현되므로) anzu_coloring = np.array(Image.open("WordCloudData/anzu.jpg")) wc = WordCloud(font_path="/Library/Fonts/Hannari.otf", max_words=2000,
Containing the sequence of letters pt Having all lowercase letters except for an initial capital (i.e., titlecase) ''' print( sorted(w for w in set(text6) if w.endswith('ise') and 'z' in w and 'pt' in w and w.istitle())) ''' CAPITULO 2 ''' import nltk print(nltk.corpus.gutenberg.fileids()) emma = nltk.corpus.gutenberg.words('austen-emma.txt') print('Cantidad de palabras en el texto Emma', len(emma)) emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt')) print('Concordancia de surprize', emma.concordance("surprize")) print( 'average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score).' ) for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set(w.lower() for w in gutenberg.words(fileid))) print(round(num_chars / num_words), round(num_words / num_sents), round(num_words / num_vocab), fileid) from nltk.corpus import webtext for fileid in webtext.fileids():
nouns = [word for (word, pos) in nltk.pos_tag(text2) if is_noun(pos)] V = set(nouns) long_words1 = [w for w in tokens if 4 < len(w) < 10] fdist01 = nltk.FreqDist(long_words1) a1 = fdist01.most_common(40) def lexical_diversity(text): return len(set(text)) / len(text) vocab = set(text) vocab_size = len(vocab) V = set(text) long_words = [w for w in tokens if 4 < len(w) < 13] text2 = nltk.Text(word.lower() for word in long_words) fdist1 = nltk.FreqDist(long_words) a = fdist1.most_common(15) import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) import matplotlib.pyplot as plt from gensim import corpora from string import punctuation def strip_punctuation(s): return ''.join(c for c in s if c not in punctuation) documents = [
import json
import sys
import nltk
import datetime
import numpy as np
from keras import models, layers, optimizers, losses, metrics

target = "json_files/ratings_train_dset.json"

# Load the json file
with open(target) as f:
    train_target = json.load(f)

# Keep only the tokens
train_tokens = [tok for d in train_target for tok in d[0]]
train_text = nltk.Text(train_tokens, name="train_t")

# Build selected_word: the most common tokens, used to make the feature vectors
common_tests = train_text.vocab().most_common(500)
selected_word = [word[0] for word in common_tests]


def term_frequency(doc):
    return [doc.count(s_word) for s_word in selected_word]


# Vectorization
# x is the input (term-frequency vectors) of the dataset
train_x = [term_frequency(d) for d, _ in train_target]
# y is the output (0 or 1) of the dataset
train_y = [c for _, c in train_target]
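The file stops after building train_x and train_y. A minimal sketch of a classifier that could consume these 500-dimensional term-frequency vectors; the architecture and hyper-parameters below are assumptions, not the project's actual model.

# Sketch only: small binary classifier over the term-frequency vectors built above.
x_train = np.asarray(train_x).astype('float32')
y_train = np.asarray(train_y).astype('float32')

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(len(selected_word),)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])
model.fit(x_train, y_train, epochs=10, batch_size=512)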
def user_review(request, reviewee_id): if request.method not in ['GET', 'POST']: return HttpResponseNotAllowed(['GET', 'POST']) if request.user.is_authenticated: if request.method == 'GET': reviewee_user = get_object_or_404(User, pk=reviewee_id) review_list = Review.objects.filter(reviewee=reviewee_user) json_review_list = [] for review in review_list: json_review_list.append({'rating': review.rating, 'content': review.content}) random.shuffle(json_review_list) return JsonResponse( json_review_list, status=200, safe=False ) # when request.method == 'POST': reviewee_user = get_object_or_404(User, pk=reviewee_id) try: req_data = json.loads(request.body.decode()) content = req_data['content'] except (KeyError, TypeError, ValueError, JSONDecodeError): return HttpResponse(status=400) # sentiment analysis model_path = os.path.join(os.getcwd(), 'review', 'nsmc', 'sentiment_model.h5') model = tf.keras.models.load_model(model_path) docs_path = os.path.join(os.getcwd(), 'review', 'nsmc', 'train_docs.json') with open(docs_path) as file: train_docs = json.load(file) tokens = [t for d in train_docs for t in d[0]] text = nltk.Text(tokens, name='NMSC') okt = Okt() selected_words = [f[0] for f in text.vocab().most_common(10000)] rating = sent.predict_score(model, okt, selected_words, content) new_review = Review(reviewee=reviewee_user, rating=rating, content=content) new_review.save() # Apply new rating to corresponding user review_list_len = Review.objects.filter(reviewee=reviewee_user).count() reviewee_user_profile = Profile.objects.get(user=reviewee_user) prev_rating = reviewee_user_profile.rating new_rating = (prev_rating * review_list_len + rating) / (review_list_len + 1) reviewee_user_profile.rating = round(new_rating, 2) reviewee_user_profile.save() return JsonResponse({'id': new_review.id, 'rating': rating, 'content': content}, status=201) # If user is not logged in return HttpResponse(status=401)
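sent.predict_score is a helper from the project's sent module and is not shown here. A hedged sketch of what such a helper could do: tokenize the review with Okt, build the same most-common-word frequency vector the model expects, and map the positive-class probability to a rating; the real implementation may differ.

# Hypothetical sketch of a predict_score-style helper; names and scaling are assumptions.
import numpy as np

def predict_score(model, okt, selected_words, content, max_rating=5):
    tokens = ['/'.join(t) for t in okt.pos(content, norm=True, stem=True)]
    tf_vector = np.array([[tokens.count(word) for word in selected_words]],
                         dtype=np.float32)
    positive_prob = float(model.predict(tf_vector)[0][0])
    return round(positive_prob * max_rating, 2)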
print(c) num(6, 10) num(20, 4) #nltk import nltk from nltk import word_tokenize from nltk.book import* text1.count("give") my_pers=text4.count("love")/len(text4) pers=print(round(my_pers*100, 2), 'percent') #processing my own texts on nltk prose=open('F:\OneDriveGSU\OneDrive - Georgia State University\Python_practice\Julian.txt', 'r').read() #open and read the txt file my_prose=word_tokenize(prose) #tokenize the open and read file my_prose1=nltk.Text(my_prose) #convert the tokenized text into an nltk text my_prose1.concordance("I") #start processign the text dectic=['there', 'here', 'then', 'that'] #specified the things I want to find out my_prose2=prose.split()#turning the text into a list for word in my_prose2: #for every item in the list if word in dectic: #if it's in dectic myNum=my_prose2.count(word) #count their number and assign it to a variable print(myNum) #print it #from nltk: #doesn't work def word_vowels(words): words1=words.split()#turning the input into a list count=0 my_lis=['a', 'e', 'i', 'o', 'u']#specifying the list of vowels for word in words1: #for each word in the list for i in my_lis: #for each item in the vowel list
def nltk(self):
    import nltk
    return nltk.Text(self.tokens)
def kwic(self, word):
    txt = nltk.Text(self.tokens)
    return txt.concordance(word)
def get_restrictive_intensifier(): # 1. Make set of noun and adjective which has intensity concept separately from reference corpus (Gutenberg) noun_set = [] adj_set = [] intensity_keyword = ['degree', 'intensity'] # Check whether data is produced already or not. Load the data if data is produced. if os.path.isfile('noun_set.txt') and os.path.isfile('adj_set.txt'): with open('noun_set.txt', 'rb') as f: noun_set = pickle.load(f) with open('adj_set.txt', 'rb') as f: adj_set = pickle.load(f) print("noun_set and adj_set data exists. Loaded data\n") # If data is not produced yet, start the process. Make noun_set and adj_set else: total_token = [] # Get tokenized words from gutenberg corpus for each_fileid in gutenberg.fileids(): each_tokenized_text = set(gutenberg.words(each_fileid)) # eliminate duplicated words for each_token in each_tokenized_text: total_token.append(each_token.lower()) # make all characters to lower case total_token = set(total_token) print('total words number for reference is %d' % len(total_token)) # for debugging total_token_with_tags = pos_tag(total_token) # tag part-of-speech to words for each_word in total_token_with_tags: # Select nouns among words if each_word[0].isalpha() and each_word[1] == "NN": found_keyword = False for synset in wn.synsets(each_word[0]): for each_keyword in intensity_keyword: if each_keyword in synset.definition(): found_keyword = True break if found_keyword: break if found_keyword: noun_set.append(each_word[0]) # Select adjectives among words if each_word[0].isalpha() and each_word[1] == "JJ": found_keyword = False for synset in wn.synsets(each_word[0]): for each_keyword in intensity_keyword: if each_keyword in synset.definition(): found_keyword = True break if found_keyword: break if found_keyword: adj_set.append(each_word[0]) # Save the produced data into file with open('noun_set.txt', 'wb') as f: pickle.dump(noun_set, f) with open('adj_set.txt', 'wb') as f: pickle.dump(adj_set, f) print('Saved noun_set and adj_set by pickle\n') print('nouns number is %d, adjective number is %d, total nouns and adjectives number is %d\n' % (len(noun_set), len(adj_set), len(noun_set) + len(adj_set))) print('First step is finished. Nouns & adjectives were selected.\n') # for debugging # 2. Make noun_pair_set. Find intensifier from the noun intensifier_keyword = ['intensifier'] gutenberg_fdists = dict() noun_pair_set = dict() total_intensifier_set = dict() # Check whether data is produced already or not. Load the data if data is produced. if os.path.isfile('gutenberg_fdists.txt') and os.path.isfile('noun_pair_set.txt') and os.path.isfile('total_intensifier_set_nouns.txt'): with open('gutenberg_fdists.txt', 'rb') as f: gutenberg_fdists = pickle.load(f) with open('noun_pair_set.txt', 'rb') as f: noun_pair_set = pickle.load(f) with open('total_intensifier_set_nouns.txt', 'rb') as f: total_intensifier_set = pickle.load(f) print("gutenberg_fdists and noun_pair_set, total_intensifier_set data exists. Loaded data\n") # If data is not produced yet, start the process. 
Make noun_pair_set else: # Create fdist for all of gutenberg corpus for each_fileid in gutenberg.fileids(): each_tokenized_corpus_text = nltk.Text(gutenberg.words(each_fileid)) gutenberg_fdists[each_fileid] = FreqDist(each_tokenized_corpus_text) count = 0 for each_noun in noun_set: # Search in gutenberg corpus for each_fileid in gutenberg.fileids(): each_tokenized_corpus = gutenberg.words(each_fileid) for each_index, each_token in enumerate(each_tokenized_corpus): # Find the corresponding noun in corpus if each_token.lower() == each_noun: # Check previous word of given noun if each_index != 0 and each_tokenized_corpus[each_index - 1].isalpha(): intensifier = each_tokenized_corpus[each_index - 1].lower() # Find wornet's lemmas and check if there are intensifier_keyword in definition of lemma for synset in wn.synsets(intensifier): found_intensifier = False for each_keyword in intensifier_keyword: if each_keyword in synset.definition(): # Found intensifier. Keyword is included in the synset definition # If there is already body word in noun_pair_set, but not found intensifier yet, then add it to existing body word element if each_noun in noun_pair_set: if intensifier not in noun_pair_set[each_noun][1]: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, each_noun) # Add it to the noun_pair_set noun_pair_set[each_noun][1][intensifier] = [bigram_frequency, 0] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if each_noun not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][each_noun] = bigram_frequency else: total_intensifier_set[intensifier] = {each_noun: bigram_frequency} # If there is no body word in noun_pair_set, then add body word, intensifiier pair into noun_pair_set else: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, each_noun) # Calculate the frequency of body_word alone bodyword_frequency = 0 for gutenberg_each_fileid in gutenberg.fileids(): bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(each_noun) # Add it to the noun_pair_set noun_pair_set[each_noun] = [bodyword_frequency, {intensifier: [bigram_frequency, 0]}] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if each_noun not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][each_noun] = bigram_frequency else: total_intensifier_set[intensifier] = {each_noun: bigram_frequency} # look for intensifier's lemma's example for each_example in synset.examples(): tokenized_example = word_tokenize(each_example) if intensifier in tokenized_example: next_word_idx = tokenized_example.index(intensifier) + 1 if next_word_idx < len(tokenized_example): next_word = tokenized_example[next_word_idx].lower() if next_word.isalpha(): # If there is already body word in noun_pair_set, but not found intensifier yet, then add it to existing body word element if next_word in noun_pair_set: if intensifier not in noun_pair_set[next_word][1]: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, next_word) # Add it to the noun_pair_set. 
Add default score 15 since it is found for synset's example noun_pair_set[next_word][1][intensifier] = [bigram_frequency, 15] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if next_word not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][next_word] = bigram_frequency else: total_intensifier_set[intensifier] = {next_word: bigram_frequency} # If there is no body word in noun_pair_set, then add body word, intensifiier pair into noun_pair_set else: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, next_word) # Calculate the frequency of body_word alone bodyword_frequency = 0 for gutenberg_each_fileid in gutenberg.fileids(): bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(next_word) # Add it to the noun_pair_set. Add default score 15 since it is found for synset's example noun_pair_set[next_word] = [bodyword_frequency, {intensifier: [bigram_frequency, 15]}] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if next_word not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][next_word] = bigram_frequency else: total_intensifier_set[intensifier] = {next_word: bigram_frequency} found_intensifier = True break if found_intensifier: break count+=1 print(str(count) + "\n") # for debugging # Save the produced data into file with open('gutenberg_fdists.txt', 'wb') as f: pickle.dump(gutenberg_fdists, f) with open('noun_pair_set.txt', 'wb') as f: pickle.dump(noun_pair_set, f) with open('total_intensifier_set_nouns.txt', 'wb') as f: pickle.dump(total_intensifier_set, f) print('Saved gutenberg_fdists, total_intensifier_set_nouns and noun_pair_set by pickle\n') print('Second step is finished. noun_pair_set (ditionary) is created and size is %d \n' %(len(noun_pair_set))) # for debugging # 3. Make adj_pair_set. Find intensifier from the adjective adj_pair_set = dict() # Check whether data is produced already or not. Load the data if data is produced. if os.path.isfile('adj_pair_set.txt') and os.path.isfile('total_intensifier_set_nouns_adjs.txt'): with open('adj_pair_set.txt', 'rb') as f: adj_pair_set = pickle.load(f) with open('total_intensifier_set_nouns_adjs.txt', 'rb') as f: total_intensifier_set = pickle.load(f) print("adj_pair_set and total_intensifier_set_nouns_adjs data exists. Loaded data\n") # If data is not produced yet, start the process. Make adj_pair_set else: count = 0 for each_adj in adj_set: # Search in gutenberg corpus for each_fileid in gutenberg.fileids(): each_tokenized_corpus = gutenberg.words(each_fileid) for each_index,each_token in enumerate(each_tokenized_corpus): # Find the corresponding adjective in corpus if each_token.lower() == each_adj: # Check provious word of given adjective if each_index != 0 and each_tokenized_corpus[each_index-1].isalpha(): intensifier = each_tokenized_corpus[each_index - 1].lower() # Find wornet's lemmas and check if there are intensifier_keyword in definition of lemma for synset in wn.synsets(intensifier): found_intensifier = False for each_keyword in intensifier_keyword: if each_keyword in synset.definition(): # Found intensifier. 
Keyword is included in the synset definition # If there is already body word in adj_pair_set, but not found intensifier yet, then add it to existing body word element if each_adj in adj_pair_set: if intensifier not in adj_pair_set[each_adj][1]: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, each_adj) # Add it to the adj_pair_set adj_pair_set[each_adj][1][intensifier] = [bigram_frequency, 0] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if each_adj not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][each_adj] = bigram_frequency else: total_intensifier_set[intensifier] = {each_adj: bigram_frequency} # If there is no body word in adj_pair_set, then add body word, intensifiier pair into adj_pair_set else: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, each_adj) # Calculate the frequency of body_word alone bodyword_frequency = 0 for gutenberg_each_fileid in gutenberg.fileids(): bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(each_adj) # Add it to the adj_pair_set adj_pair_set[each_adj] = [bodyword_frequency, {intensifier: [bigram_frequency, 0]}] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if each_adj not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][each_adj] = bigram_frequency else: total_intensifier_set[intensifier] = {each_adj: bigram_frequency} # look for intensifier's lemma's example for each_example in synset.examples(): tokenized_example = word_tokenize(each_example) if intensifier in tokenized_example: next_word_idx = tokenized_example.index(intensifier) + 1 if next_word_idx < len(tokenized_example): next_word = tokenized_example[next_word_idx].lower() if next_word.isalpha(): # If there is already body word in adj_pair_set, but not found intensifier yet, then add it to existing body word element if next_word in adj_pair_set: if intensifier not in adj_pair_set[next_word][1]: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, next_word) # Add it to the adj_pair_set. Add default score 15 since it is found for synset's example adj_pair_set[next_word][1][intensifier] = [bigram_frequency, 15] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if next_word not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][next_word] = bigram_frequency else: total_intensifier_set[intensifier] = {next_word: bigram_frequency} # If there is no body word in adj_pair_set, then add body word, intensifiier pair into adj_pair_set else: # Calculate the frequency of "intensifier body_word" bigram_frequency = find_frequency(intensifier, next_word) # Calculate the frequency of body_word alone bodyword_frequency = 0 for gutenberg_each_fileid in gutenberg.fileids(): bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(next_word) # Add it to the adj_pair_set. 
Add default score 15 since it is found for synset's example adj_pair_set[next_word] = [bodyword_frequency, {intensifier: [bigram_frequency, 15]}] # Add it to the total_intensifier_set to calculate frequency for later process if intensifier in total_intensifier_set: if next_word not in total_intensifier_set[intensifier]: total_intensifier_set[intensifier][next_word] = bigram_frequency else: total_intensifier_set[intensifier] = {next_word: bigram_frequency} found_intensifier = True break if found_intensifier: break count+=1 print(str(count) + "\n") # for debugging # Save the produced data into file with open('adj_pair_set.txt', 'wb') as f: pickle.dump(adj_pair_set, f) with open('total_intensifier_set_nouns_adjs.txt', 'wb') as f: pickle.dump(total_intensifier_set, f) print('Saved adj_pair_set and total_intensifier_set_nouns_adjs by pickle\n') print('Third step is finished. adj_pair_set (ditionary) is created and size is %d \n' % (len(adj_pair_set))) # for debugging # 4. Calculate Score and combine nouns_pair_set, ajd_pair_set into total_pair_set total_pair_set = [] # Check whether data is produced already or not. Load the data if data is produced. if os.path.isfile('adj_scored_pair_set.txt') and os.path.isfile('noun_scored_pair_set.txt') and os.path.isfile('total_pair_set.txt'): with open('adj_scored_pair_set.txt', 'rb') as f: adj_pair_set = pickle.load(f) with open('noun_scored_pair_set.txt', 'rb') as f: noun_pair_set = pickle.load(f) with open('total_pair_set.txt', 'rb') as f: total_pair_set = pickle.load(f) print("adj_scored_pair_set, noun_socred_pair_set and total_pair_set data exists. Loaded data\n") # If data is not produced yet, start the process. Calculate the score else: # Calculate score for nouns count = 0 for each_body_word, body_word_data in noun_pair_set.items(): body_word_frequency = body_word_data[0] for each_intensifier, intensifier_data in body_word_data[1].items(): bigram_frequency = intensifier_data[0] # Calculate the average frequency of intensifiers from other body words frequency_sum = 0 freqeuncy_count = 0 for each_body_word_2, each_body_word_frequency_2 in total_intensifier_set[each_intensifier].items(): if each_body_word_2 != each_body_word: freqeuncy_count += 1 frequency_sum += each_body_word_frequency_2 frequency_avg = 0 if freqeuncy_count != 0: frequency_avg = frequency_sum/freqeuncy_count # Calculate the score and save it score = calculate_score(body_word_frequency, bigram_frequency, frequency_avg) intensifier_data[1] += score # Add data into total_pair_set total_pair_set.append((each_intensifier, each_body_word, intensifier_data[1])) count+=1 print(str(count) + "\n") # for debugging print('Noun score calcuating finish\n') # for dubugging # Calculate score for adjectives count=0 for each_body_word, body_word_data in adj_pair_set.items(): body_word_frequency = body_word_data[0] for each_intensifier, intensifier_data in body_word_data[1].items(): bigram_frequency = intensifier_data[0] # Calculate the average frequency of intensifiers from other body words frequency_sum = 0 freqeuncy_count = 0 for each_body_word_2, each_body_word_frequency_2 in total_intensifier_set[each_intensifier].items(): if each_body_word_2 != each_body_word: freqeuncy_count += 1 frequency_sum += each_body_word_frequency_2 frequency_avg = 0 if freqeuncy_count != 0: frequency_avg = frequency_sum/freqeuncy_count # Calculate the score and save it score = calculate_score(body_word_frequency, bigram_frequency, frequency_avg) intensifier_data[1] += score # Add data into total_pair_set 
total_pair_set.append((each_intensifier, each_body_word, intensifier_data[1])) count+=1 print(str(count) + "\n") # for debugging print('Adjective score calcuating finish\n') # for debugging # Sort the total_pair_set by score total_pair_set = set(total_pair_set) total_pair_set = sorted(total_pair_set, key=lambda each_pair: each_pair[2], reverse=True) print('Total pair set sorting finish\n') # for debugging # Save the produced data into file with open('adj_scored_pair_set.txt', 'wb') as f: pickle.dump(adj_pair_set, f) with open('noun_scored_pair_set.txt', 'wb') as f: pickle.dump(noun_pair_set, f) with open('total_pair_set.txt', 'wb') as f: pickle.dump(total_pair_set, f) print('Saved adj, nouns scored sets and total_pair_set by pickle\n') print('Fourth step is finished. adj_pair_set and noun_pair_set is scored, and total_pair_set is created and sorted \n') # for debugging # 5. Print out result in csv file hundread_selected_result = total_pair_set[:100] f = open('result.csv', 'w') # make "result.csv" file for each_pair in hundread_selected_result: f.write(each_pair[0] + ',' + each_pair[1] + '\n') print('Fifth step is finished. Selected 100 most unique intensifier-body word sets and wrote it on csv file \n') # for debugging
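get_restrictive_intensifier relies on find_frequency and calculate_score, which are not part of this excerpt. Hedged sketches of what they could look like, given how they are called: find_frequency is assumed to count how often the "intensifier body_word" bigram occurs across the Gutenberg corpus, and calculate_score to reward pairs whose bigram frequency is high relative to the body word's own frequency and the intensifier's average frequency with other body words. The constants and formula are placeholders, not the original logic.

# Hypothetical helpers matching the call sites above (unoptimized, for illustration only).
import nltk
from nltk.corpus import gutenberg

def find_frequency(intensifier, body_word):
    count = 0
    for fileid in gutenberg.fileids():
        words = [w.lower() for w in gutenberg.words(fileid)]
        count += sum(1 for first, second in nltk.bigrams(words)
                     if first == intensifier and second == body_word)
    return count

def calculate_score(body_word_frequency, bigram_frequency, frequency_avg):
    # Higher when the bigram is common but the body word and the intensifier's
    # other pairings are rare; the smoothing constant is a placeholder.
    return bigram_frequency / (body_word_frequency + frequency_avg + 1e-6)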
def tokenize(self):
    tokens = nltk.word_tokenize(self._text)
    self._nltk_text = nltk.Text(tokens)
test_words = word_extractor.extract() test_score = { word: score.cohesion_forward for word, score in test_words.items() } tokenizer = LTokenizer(scores=test_score) test_list = [] cnt = 0 for sent in x_test: test_list.append([tokenizer.tokenize(sent)]) cnt += 1 train_tokens = [token for data in train_list for token in data[0]] test_tokens = [token for data in test_list for token in data[0]] train_text = nltk.Text(train_tokens) test_text = nltk.Text(test_tokens) print('=====================selecting token======================') #시간 개오래걸림; selected_tokens_1 = [t[0] for t in train_text.vocab().most_common(500) ] #출현 빈도가 높은 상위 10000개의 토큰 선택 selected_tokens_2 = [t[0] for t in test_text.vocab().most_common(500)] #벡터화 -> BOW(Bag of Words) def term_frequency1(data): return [data.count(word) for word in selected_tokens_1] def term_frequency2(data): return [data.count(word) for word in selected_tokens_2]
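A minimal sketch (not from the original script) of how the two term-frequency functions above could turn the tokenized reviews into dense feature matrices; the corresponding labels are assumed to live alongside x_train / x_test and are not built here.

# Sketch only: bag-of-words matrices from the tokenized train/test reviews.
import numpy as np

train_x = np.array([term_frequency1(data[0]) for data in train_list],
                   dtype=np.float32)
test_x = np.array([term_frequency2(data[0]) for data in test_list],
                  dtype=np.float32)
print(train_x.shape, test_x.shape)  # (n_train, 500) and (n_test, 500)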
#Tarot NLP Basic stats #req python3 import nltk data_dir = "data" rw_cards = [ "fool", "magician", "high priestess", "empress", "emperor", "hierophant","lovers", "chariot", "strength","hermit", "wheel of fortune", "justice", "hanged man", "death", "temperance", "devil", "tower", "star", "moon", "sun", "judgement", "world"] all_files = ["".join(open("{}/{}_clean.txt".format(data_dir, name.replace(" ","_")),"r").readlines()) for name in rw_cards] all_files_concat = "".join(all_files) master_vocab = sorted( set( [word.lower() for word in nltk.Text(nltk.word_tokenize(all_files_concat))] ) ) def process_raw(raw): tokens = nltk.word_tokenize("".join(raw)) txt = nltk.Text(tokens) words = [word.lower() for word in txt] fdist = nltk.FreqDist(words) return fdist #Generate distribution for full corpus text. all_lines = "".join([line for f in all_files for line in f]) stats = {} names_copy = list(rw_cards)
" Documentos Relevantes: " + str(len(datos_binario[1])) + "/" + str(len(qrels[1])), font=dict(family='Arial', size=12, color='rgb(50,50,50)'), showarrow=False)) layout['annotations'] = annotations fig = dict(data=data, layout=layout) name = name + ".html" plot(fig, filename=name) #------------------------------------------------------------------ doc = nltk.Text( nltk.regexp_tokenize( open( "/Users/gabriel/Documents/INAOE/2do Cuatrimestre/Recuperacion/corpus/cacm/cacm.all", "r").read(), "[A-Za-z'.]+")) palabras = [] titulos = [] titulo = [] flag = 0 for word in doc: if word == ".T": flag = 1 if (word == ".B" and flag == 1) or (word == ".W" and flag == 1) or (word == ".A" and flag == 1): flag = 0 titulos.append(titulo) titulo = []
def process_raw(raw):
    tokens = nltk.word_tokenize("".join(raw))
    txt = nltk.Text(tokens)
    words = [word.lower() for word in txt]
    fdist = nltk.FreqDist(words)
    return fdist
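For reference, a small usage example of process_raw; the input text here is made up for illustration.

sample = ["The Fool walks toward the cliff.", " The Fool carries a white rose."]
fdist = process_raw(sample)
print(fdist.most_common(5))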
def features(sentence): words = sentence.lower().split() return dict(('contains(%s)' % w, True) for w in words) # takes in: line of text made of WORDs # returns: dictionary in form {'contains(WORD)': None} def notfeatures(sentence): words = sentence.lower().split() return dict(('contains(%s)' % w, None) for w in words) #reading wikipedia articles on apples and Apple fruitText = open("apple-fruit.txt").read() fruitTokens = re.split(r'\W+', fruitText) fruitWords = nltk.Text(fruitTokens) computersText = open("apple-computers.txt").read() computersTokens = re.split(r'\W+', computersText) computersWords = nltk.Text(computersTokens) #calculating frequency distributions of words in wiki articles #this should be followed by other processes like getting rid of common words like "the" or "and" fruitWordsFreq = nltk.FreqDist(w.lower() for w in fruitWords) computersWordsFreq = nltk.FreqDist(w.lower() for w in computersWords) #choosing arbitrlly only a subset of words for training the model fruitFeatures = list(map(features, fruitWordsFreq.keys()[2:100])) computersFeatures = list(map(notfeatures, computersWordsFreq.keys()[2:2000])) #creating and training Positive Naive Bayes model with features from wiki articles
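The snippet ends just before the training call. A hedged sketch of how NLTK's PositiveNaiveBayesClassifier is typically trained on positive vs. unlabeled feature sets like the ones built above; the exact call and test sentences in the original script are not shown, and the commented outputs are expectations, not guaranteed results.

# Sketch only: train on fruit-sense (positive) vs. unlabeled computer-sense features.
from nltk.classify import PositiveNaiveBayesClassifier

classifier = PositiveNaiveBayesClassifier.train(fruitFeatures, computersFeatures)
print(classifier.classify(features("I baked an apple pie with cinnamon")))      # expected: True (fruit sense)
print(classifier.classify(features("Apple released a new MacBook yesterday")))  # expected: False (computer sense)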
return review_words words = [] review_num = 0 for review in korean_review_df["document"]: words += review_to_wordlist(review) review_num += 1 if review_num % 5000 == 0: print("review_num: %d" % (review_num)) len(words) pprint(words[:10]) import nltk words_nltk = nltk.Text(words, name="words in movie reviews") pprint(words_nltk.vocab().most_common(10)) import matplotlib matplotlib.rc('font', family='AppleGothic') # matplotlib inline words_nltk.plot(40) Non_Stop_words = [ "Noun", "Verb", "Adjective", "Adverb", "Excalmation", "koreanParticle" ] words_new = [w for w in words if w.split("/")[-1] in Non_Stop_words] pprint(words_new[:10]) words_nltk_new = nltk.Text(words_new, name="new words in movie reviews") pprint(words_nltk_new.vocab().most_common(10))
okt = Okt() def tokenize(doc): # norm은 정규화, stem은 근어로 표시하기를 나타냄 return ['/'.join(t) for t in okt.pos(doc, norm=True, stem=True)] with open('./train_docs.json', encoding="utf-8") as f: train_docs = json.load(f) with open('./test_docs.json', encoding="utf-8") as f: test_docs = json.load(f) # train_doc에 존재하는 모든 태깅된 문자열을 담는다 tokens = [t for d in train_docs for t in d[0]] import nltk text = nltk.Text(tokens, name='NMSC') print(text) selected_size = 4000 selected_words = [f[0] for f in text.vocab().most_common(selected_size)] def term_frequency(doc): return [doc.count(word) for word in selected_words] def load_model(): # 저장된 모델이 있다면 그 모델 사용 model = None if os.path.isfile('./review_model.json') and os.path.isfile('./review_model_weight.h5'): json_file = open("./review_model.json", "r") model_json = json_file.read() json_file.close()
# PLEASE STAY LOW! text = [t.lower() for t in text][:-10] # Remove 'chapter i' strings regexp = re.compile(r'chapter \d') text = [t for t in text if not re.match(regexp, t)] # combine all the text together raw = ' '.join(text) print('type of the raw text' + str(type(raw))) # Here's the magic tokens = [t for t in nltk.word_tokenize(raw) if t not in (',', '“', '”', '"')] #tokens = [t for t in jieba.cut(raw) if t not in (',', '“', '”', '"')] test_ndarr = np.array(tokens) # a list of tokens print('current tokens size is ' + str(test_ndarr.shape)) distinct_tokens = set(tokens) lexical_richness = len(distinct_tokens) / len(tokens) from pylab import mpl mpl.rcParams['font.sans-serif'] = ['SimHei'] ntext = nltk.Text(tokens) # draw the picture of word/ offset # 典型的词分布图像 ntext.dispersion_plot( ['乐视', '资金', '变革', '生态', '布局', '硬件', '用户', '承诺', '责任', '质疑', '窒息', '歉意'])
nltk.download() # stopwords collection은 usually 텍스트마이닝에서 제외하는 단어를 모아놨다. # nltk lemmatization 은 worknet을 기반으로 분석한다. from nltk.corpus import gutenberg ids = gutenberg.fileids() # 옛것이라 라이센스 없어서 괜찮음 ㅎ text = gutenberg.open(ids[0]).read() # emma 로 분석을 시작해보자. nltk.download('punkt') from nltk import word_tokenize tokens = word_tokenize(text) tokens[:100] en = nltk.Text(tokens) #tokens = en.tokens # 모든 character를 나눈다. nltk.Text에 text를 넣으면. dic = en.vocab() en.plot(50) lower_tokens = [x.lower() for x in tokens] # 모든 character를 lower case로. en_lw = nltk.Text(lower_tokens) dic_lw = en_lw.vocab() words = list(dic_lw.keys()) # practice page: 9 en.concordance('Emma', lines=5) # concordance는 그 용어가 사용된 곳을 보여주는 용어 색인이다. en.similar('Emma') # frequency로 판별 앞뒤 맥락을 이용하여 en.collocations() # default값으로 몇개가 출력되는지 설정되어있다. (20)
""" Assignment 2 for "Applied Text Mining in Python" from University of Michigan on Coursera Part 1 - Analyzing Moby Dick """ import nltk # If you would like to work with the raw text you can use 'moby_raw' with open('moby.txt', 'r') as f: moby_raw = f.read() # If you would like to work with the novel in nltk.Text format you can use 'text1' moby_tokens = nltk.word_tokenize(moby_raw) text1 = nltk.Text(moby_tokens) """ Example 1 How many tokens (words and punctuation symbols) are in text1? """ def totalTokens(): return len(nltk.word_tokenize(moby_raw)) # or alternatively len(text1) totalTokens() """ Example 2 How many unique tokens (unique words and punctuation) does text1 have? This function should return an integer. """
from konlpy.tag import Okt okt = Okt() token_ko = okt.nouns(ko_con_text) # 불용어(stopword) : 빈도 수에 상관없이 분석에서 배제할 단어들 stop_word_file = 'stopword.txt' stop_file = open(stop_word_file, 'rt', encoding='utf-8') stop_words = [ word.strip() for word in stop_file.readlines()] # print(stop_words) token_ko = [each_word for each_word in token_ko if each_word not in stop_words] # nltk : national language toolkit # token : 작은 절편 import nltk ko = nltk.Text(tokens=token_ko) wordlist = list() # 튜플(단어, 빈도수)를 저장할 리스트 # 가장 빈도수가 많은 500개만 추출 data = ko.vocab().most_common(500) # print(data) for word, count in data : if (count >= 50 and len(word) >= 2): wordlist.append((word, count)) visual = Visualization(wordlist) visual.makeWorCloud() visual.makeBarChart() print('finished')
# pip install twython # 환경 변수 java_home 설정 해야함 # python 과 java 의 bit가 동일 해야 함 doc1 = """배우 남궁민(39)이 SBS TV 드라마 '조작'(극본 김현정, 연출 이정흠)에 출연 확정했다고 소속사 935엔터테인먼트가 20일 밝혔다. '조작'은 사회 부조리를 파헤치는 기자들의 이야기를 그린다. 남궁민은 사고뭉치 기자 '한무영'을 맡는다. 기자였던 형이 비리를 고발하다 억울하게 죽는 모습을 본 후 복수를 위해 직접 기자가 된 인물이다. 소속사는 "전작 '김과장'이 많은 사랑을 받아 차기작을 결정하는 데 많은 고민이 있었다. '조작'은 '김과장' 때와 달리 남궁민의 진지하고 카리스마 넘치는 매력을 보여줄 드라마"라고 말했다. 한편 '조작'은 2015년 방송된 SBS 2부작 드라마 '너를 노린다'에서 호흡을 맞춘 이정흠 PD와 김현정 작가가 다시 한번 의기투합한 작품이다. 드라마는 '엽기적인 그녀' 후속으로 7월 방송 예정이다. """ from konlpy.corpus import kobill from konlpy.tag import Twitter t = Twitter() tokens_doc = t.morphs(doc1) import nltk ko = nltk.Text(tokens_doc, name='뉴스') #print(len(ko.tokens)) #print(len(set(ko.tokens))) print(list(set(ko.tokens))) keys = list(set(ko.tokens)) ko.vocab() #ko.plot(50) #print(ko.count(str('드라마'))) #print(len(tokens_doc)) count_arr = [] for to in tokens_doc: m = {} m[to] = str(ko.count(str(to))) count_arr.append(m) count_arr = set(count_arr) print(count_arr)