from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from tqdm import tqdm


def remove_rare_often_word(texts, low_value, high_value):
    # Remove words whose tf-idf score falls below low_value or above high_value.
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]
    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [
            id for id, value in sent_tfidf
            if (value < low_value) or (value > high_value)
        ]
    # Build the token -> tf-idf map before filtering: filter_tokens() compacts
    # the dictionary and remaps token ids, so lookups must use the original ids.
    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf
        for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }
    dictionary.filter_tokens(bad_ids=bad_words)
    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])
    return {
        'texts': out_corpus,
        'dict_tfidf': dict_tfidf,
        'dictionary': dictionary
    }
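A minimal usage sketch for the function above (not part of the original source); the sample documents and thresholds are invented for illustration, and the gensim/tqdm imports shown above are assumed.

# Hypothetical example call; the thresholds are illustrative, not tuned.
sample_texts = [
    "the breakfast was great and the staff was friendly",
    "great location but the room was noisy",
    "noisy street, friendly staff, great breakfast",
]
filtered = remove_rare_often_word(sample_texts, low_value=0.1, high_value=0.9)
print(filtered['texts'])             # tokenized docs with out-of-range words removed
print(filtered['dict_tfidf'])        # surviving token -> tf-idf score
print(len(filtered['dictionary']))   # size of the pruned gensim Dictionary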
from gensim.corpora import Dictionary
from gensim.models import TfidfModel


def getNotImportantTokens(careersCorpus, jobsCorpus):
    careerPostFilter, jobPostFilter = [], []
    dictionary = Dictionary([careersCorpus, jobsCorpus])
    corpus = [dictionary.doc2bow(line) for line in [careersCorpus, jobsCorpus]]
    tfidf = TfidfModel(corpus)
    careersVector = tfidf[corpus[0]]
    jobsVector = tfidf[corpus[1]]
    sorted_careersVector = sorted(careersVector, key=lambda w: w[1], reverse=True)
    sorted_jobsVector = sorted(jobsVector, key=lambda w: w[1], reverse=True)
    for word_id, word_count in sorted_careersVector[:30]:
        careerPostFilter.append(dictionary.get(word_id))
    for word_id, word_count in sorted_jobsVector[:30]:
        jobPostFilter.append(dictionary.get(word_id))
    return careerPostFilter, jobPostFilter
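A hedged usage sketch for getNotImportantTokens; the two token lists below are invented to show the expected input shape (each corpus argument is a single pre-tokenized document).

careers_tokens = "engineer remote salary team python deadline".split()
jobs_tokens = "hiring interview salary benefits python onsite".split()
career_filter, job_filter = getNotImportantTokens(careers_tokens, jobs_tokens)
print(career_filter)  # up to 30 highest tf-idf tokens from the careers document
print(job_filter)     # up to 30 highest tf-idf tokens from the jobs document
# Tokens shared by both documents (e.g. "salary", "python") get an idf of 0,
# so gensim drops them from the tf-idf vectors and they never reach the filters.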
def produce(self):
    print('Getting src docs')
    docs = []
    doctokens = []  # aka Gensim's "text"
    stopwords = nltk.corpus.stopwords.words('english')
    for doc in self.src_doc_generator():
        (doc_id, doc_label, doc_str) = doc
        docs.append(doc)
        doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
        if len(docs) % 1000 == 0:
            print(len(docs))

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    #dictionary.compactify()
    #dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating WORD')  # aka Gensim's "dictionary"
        db.create_table('word')
        for word_id, word_str in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', (word_id, word_str))

        print('Creating DOC and DOCWORD')
        db.create_table('doc')
        db.create_table('docword')
        for doc_idx, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc (doc_index, doc_id, doc_label, doc_str) VALUES (?,?,?,?)',
                           (doc_idx, doc[0], doc[1], doc[2]))
            doc_id = doc[0]
            for word_id, word_count in dictionary.doc2bow(doctokens[doc_idx]):
                word_str = dictionary.get(word_id)  # Is this valid? I believe it is.
                db.cur.execute('INSERT INTO docword (doc_index, doc_id, word_id, word_str, word_count) VALUES (?,?,?,?,?)',
                               (doc_idx, doc_id, word_id, word_str, word_count))
def create_tf_idf(corporaPath):
    """
    Compute the TF-IDF scores based on the entire corpus
    """
    docs = []
    ids = []
    with open(corporaPath, "r") as csvfile:
        for line in csvfile:
            line = line.replace("\n", " ")
            parts = line.split(",")
            if (len(parts) == 5):
                id = parts[0]
                url = parts[1]
                comments = parts[2]
                if comments is not None:
                    comments = clean_text(comments, tknzr_strip_users)
                else:
                    comments = []
                caption = parts[3]
                if caption is not None:
                    caption = clean_text(caption, tknzr_strip_users)
                else:
                    caption = []
                tags = parts[4]
                if tags is not None:
                    tags = clean_text(tags, tknzr_strip_users)
                else:
                    tags = []
                docs.append(comments + caption + tags)
                ids.append(id)
    idx_to_id = {}
    for i, id in enumerate(ids):
        idx_to_id[i] = id
    dct = Dictionary(docs)
    corpus = [dct.doc2bow(line) for line in docs]
    model = TfidfModel(corpus)
    tfidf_factors = {}
    for i, doc in enumerate(corpus):
        temp = {}
        for word_id, value in model[doc]:
            word = dct.get(word_id)
            temp[word] = value
        tfidf_factors[idx_to_id[i]] = temp
    return tfidf_factors
def make_tf_time_series(tweets_time_series, keep_only_common_words=True):
    tweets_time_series = break_up_sentences(tweets_time_series)
    tweets_dict = Dictionary(tweets_time_series)
    bow_time_series = [
        tweets_dict.doc2bow(tweets) for tweets in tweets_time_series
    ]
    tf_time_series = [
        make_term_frequency(time_step) for time_step in bow_time_series
    ]
    tf_time_series = [[(tweets_dict.get(tup[0]), tup[1]) for tup in time_step]
                      for time_step in tf_time_series]
    if keep_only_common_words:
        tweets_dict.filter_extremes(no_below=len(tweets_time_series), no_above=1)
        tf_time_series = [[
            tup for tup in time_step
            if tweets_dict.doc2idx([tup[0]])[0] != -1
        ] for time_step in tf_time_series]
    return tf_time_series
import itertools
from collections import defaultdict

import pandas as pd
from gensim.corpora import Dictionary


def get_tokens_frequency_df(series):
    """
    Count how many times each word appears in the series and return a DataFrame
    """
    corpus_lists = [doc for doc in series.dropna() if doc]
    dictionary = Dictionary(corpus_lists)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus_lists]
    token_freq_bow = defaultdict(int)
    for token_id, token_sum in itertools.chain.from_iterable(corpus_bow):
        token_freq_bow[token_id] += token_sum
    return pd.DataFrame(
        list(token_freq_bow.items()),
        columns=['token_id', 'token_count']
    ).assign(
        token=lambda df1: df1.apply(lambda df2: dictionary.get(df2.token_id), axis=1),
        doc_appeared=lambda df1: df1.apply(lambda df2: dictionary.dfs[df2.token_id], axis=1)
    ).reindex(
        labels=['token_id', 'token', 'token_count', 'doc_appeared'], axis=1
    ).set_index('token_id')
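A small usage sketch for get_tokens_frequency_df (not from the original source), assuming the imports above; the Series of token lists is invented for illustration.

import pandas as pd

token_series = pd.Series([
    ["red", "blue", "red"],
    ["blue", "green"],
    None,   # dropped by series.dropna()
    [],     # dropped by the `if doc` filter
])
freq_df = get_tokens_frequency_df(token_series)
print(freq_df)  # columns: token, token_count, doc_appeared (indexed by token_id)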
import itertools
from collections import defaultdict

from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize


def ner_post_parsing(classification, blogs):
    tokenized_docs = [word_tokenize(doc) for doc in blogs]
    dictionary = Dictionary(tokenized_docs)
    bag_of_words_corpus = [
        dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_docs
    ]
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(bag_of_words_corpus):
        total_word_count[word_id] += word_count

    # Create a sorted list from the defaultdict: sorted_word_count
    sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

    # Print the top word across all documents alongside its count
    for word_id, word_count in sorted_word_count[:1]:
        print(classification + " is talking about '" + str(dictionary.get(word_id)) +
              "', with " + str(word_count) + " occurrences")
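A usage sketch for ner_post_parsing; the classification label and blog texts below are made up, and nltk's 'punkt' tokenizer data is assumed to be installed.

# nltk.download('punkt') may be needed once before word_tokenize will work.
sample_blogs = [
    "The market rallied as investors bought tech stocks.",
    "Analysts expect the market to stay volatile this quarter.",
]
ner_post_parsing("finance", sample_blogs)
# Prints the single most frequent token across both documents and its count.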
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all = []
        '''for i in range(1790):
            print('i=' + str(i))
            print("limit= 10000")
            print("offset= " + str(10000 * i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset ' + str(10000 * i) + ';'
            results = [list(x) for x in dbo.execute(query)]
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok = []
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter, spell, nlp_wrapper,), )
                        c = pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        #print('pool close')
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                    except TimeoutError:
                        print(str(doc) + " caused Exception")
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                        c = [None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= ' + str(len(corpus_tok)))
            corpus_tok_all += corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i % 100 == 0 and i != 0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                    file.close()
                corpus_tok_all = []
        '''
        '''
        corpus_tok_all = []
        i = 0
        kk = set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                #if i%10000==0:break
                ar = ((row[0].replace('[', '')).replace(']', '')).split(',')
                if ar[1][-1] != "'":  # France, Metro.
                    ar[1] = ar[1] + ',' + ar[2]
                    for j in range(2, len(ar) - 1):
                        ar[j] = ar[j + 1]
                    del ar[len(ar) - 1]
                ar[1] = ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev = ''.join(ar[3:])
                revlist = ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[', '')).replace("'", '')).replace(" ", '')).split(',')
                r = (revlist, tokens)
                k = ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
            file.close()
        corpus_tok = corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc = len(corpus_tok)
        print("corpus_tok len = " + str(lenc))
        for idx in range(lenc):
            if idx % 100000 == 0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(corpus_tok)
            file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''
        '''all_set = set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword == 'pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'), additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]), tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok = [(list(x[0]), list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")
            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)
            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
                f.close()
            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1
            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                            toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
                file.close()
        '''
        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)
        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(),
                       initargs=(counter, corpustokonly, dictionary, lencorpus, alltok), )
        print("pool initialized")
        corpustokonly = None
        alltok = None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary = None
        del dictionary
        global ctonly, dic, alltoks
        ctonly = None
        dic = None
        alltoks = None
        del ctonly, dic, alltoks
        print("frequence list len= " + str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i % 10000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i = 0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i += 1
                if i % 10000 == 0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
            f.close()'''
        corpus_tok = []
        i = 0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                corpus_tok.append(row)
            file.close()
        print("len corpus_tok= " + str(len(corpus_tok)))
        freq = []
        i = 0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i == 501:
                    break
                freq.append(row)
            file.close()
        for i in range(len(freq)):
            freq[i] = freq[i][0]
            freq[i] = freq[i].replace("'", '')
            freq[i] = freq[i].replace('"', '')
            freq[i] = freq[i].replace('(', '')
            freq[i] = freq[i].replace(')', '')
            freq[i] = freq[i].replace(' ', '')
            freq[i] = freq[i].split(',')
            freq[i] = tuple(freq[i])
        for i in range(len(corpus_tok)):
            if i % 100000 == 0:
                print(i)
            corpus_tok[i][0] = corpus_tok[i][0].replace('[', '')
            corpus_tok[i][0] = corpus_tok[i][0].replace(']', '')
            det = (corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:  # São Tomé and PrÃ\xadncipe
                det[1] = ' ' + 'São Tomé and PrÃ\xadncipe' + ' '
            if det[1][-1] != "'":  # France, Metro
                if 'Ivoire' in det[1]:  # Cote d'Ivoire
                    det[1] = det[1].replace('\\', '')
                    det[2] = det[2][1:]
                else:
                    det[1] = det[1] + ',' + det[2]
                    for j in range(2, len(det) - 1):
                        det[j] = det[j + 1]
                    del det[len(det) - 1]
            det = det[:3]
            desc = (corpus_tok[i][0].split(','))[-1]
            det[0] = det[0][1:-1]
            det[1] = det[1][2:-1]
            det[2] = det[2][2:-1]
            desc = desc[3:-1]
            det.append(desc)
            corpus_tok[i][0] = det
            corpus_tok[i][1] = corpus_tok[i][1].replace("'", '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(' ', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace('[', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(']', '')
            corpus_tok[i][1] = corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= " + str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i % 100000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= " + str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j = 0
        print("corpus_bow_len " + str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j += 1
                if j % 100000 == 0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        print("over")
    else:
        print("not all")
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion == 'Good' and keyword == 'cleaning':  # cleaning good
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all = []
                    #if not os.path.isfile('/resources/cleaning_test.csv'):
                    #    open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):  # 400
                        print(str(i))
                        offset = i * 1000
                        limit = 1000
                        print("starting reading")
                        print("limit=" + str(limit))
                        print("offset=" + str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'),
                            additionaldetails=True, limit=limit, offset=offset)
                        #corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        #raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        #raw_corpus_half_two=raw_corpus[int(len(raw_corpus)/2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter, spell, nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok = []
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter, spell, nlp_wrapper,), )
                                    c = pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    #print('pool close')
                                    pool.close()
                                    #print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target = thread_function_row_only, args = (doc))
                                    thread.start()
                                    thread.join()
                                    c=que.get()'''
                                except TimeoutError:
                                    print(str(doc) + " caused Exception")
                                    c = [None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced = [r for r in corpus_tok if r != None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                            file.close()'''
                        corpus_tok_all += corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok = []
                    s = 0
                    for doc in corpus:
                        newdoc = False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc = True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc, properties={'annotators': 'lemma, pos', 'outputFormat': 'json', })['sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    #print("beginning removal of sents with contrast")
                    corpus_tok = corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok) > 0:
                        corpustokonly = [r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")
                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)
                        alltok = []
                        freq = []
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus = len(corpus_tok)
                        print("len dictionary = " + str(len(dictionary.keys())))
                        i = 0
                        for t in dictionary:
                            i += 1
                            if i % 1000 == 0:
                                print("analyzing token " + str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent += 1
                            freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                                         alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i] = tuple(list(freq[i]) + [i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/' + keyword + '_' + emotion.lower() + '.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item) + '\n')
                            f.close()
                        print("writing bow file")
                        top_tokens = [f[1] for f in freq[:500]]
                        lentoptok = len(top_tokens)
                        corpus_bow = {}
                        toplen = 0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i] = [0] * lentoptok
                            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)] = 1
                        with open('resources/bow/' + keyword + '_' + emotion.lower() + '.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            writer.writerow([''] * toplen + top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
                            file.close()
                        print('------------------------------------------------------')
                        print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
                        f.close()
topics_str = '{' + ','.join(topics) + '}'
print('@ATTRIBUTE class {}'.format(topics_str))
print('@DATA')

for topic in topics:
    for doc in docs[topic]:
        total_words = 0
        doc_vec = np.zeros(dim_vec)
        # Calculate the weights
        tfidf_vector = model[dct.doc2bow(doc.split(' '))]
        for wid, weight in tfidf_vector:
            try:
                word = dct.get(wid)
                doc_vec += np.array(word2vec.wv[word] * weight)
                total_words += weight
            except:
                pass
        # Now divide the vector by the total weight to normalize the values
        doc_vec = doc_vec / total_words
        # At this point we have the document vector
        attributes = doc_vec.tolist()
        attributes.append(topic)
        print(','.join(str(x) for x in attributes))
def create_dictionary(self):
    YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get("SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
    SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))

    if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY and SAVE_DICTIONARY_DIR
            and SAVE_BAG_OF_WORDS_DIR and SAVE_DICTIONARY_DIR):
        print("config keys are not set correctly in the config file: socialconfig.py")
        exit(0)

    SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR, "Unfiltered")

    if not os.path.exists(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY) and not os.path.isdir(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        # raise needs an exception instance, not a bare string
        raise IOError("Directory {d} does not exist".format(d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY))
    if not (os.path.exists(SAVE_BAG_OF_WORDS_DIR) and os.path.isdir(SAVE_BAG_OF_WORDS_DIR)):
        os.makedirs(SAVE_BAG_OF_WORDS_DIR)
    if not (os.path.exists(SAVE_UNFILTERED_DICTIONARY_DIR) and os.path.isdir(SAVE_UNFILTERED_DICTIONARY_DIR)):
        os.makedirs(SAVE_UNFILTERED_DICTIONARY_DIR)

    for pardir, sub_dirs, files in os.walk(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        if len(files) > 0:
            error_count = 0
            review_docs = []
            negative_docs = []
            positive_docs = []
            doc_count = 0
            docs_per_file = SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE
            file_num = str((doc_count / docs_per_file) + 1)
            for file in files:
                if "yelp_reviews_" in file and "category" in pardir:
                    reviews = get_reviews_iterable(os.path.join(pardir, file))
                    yelp_category = pardir.split('/')[-1]
                    CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR = os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category)
                    if not (os.path.exists(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR) and os.path.isdir(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)):
                        os.makedirs(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)
                    fname = os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category,
                                         "{cat}_file_{file_num}.txt".format(cat=yelp_category, file_num=file_num))
                    bow_file = open(fname, 'w')
                    print("Writing docs (in bag of words form) for {cat} to directory: {d}".format(
                        cat=yelp_category, d=os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category)))
                    for review in reviews:
                        try:
                            review_dict = ujson.loads(review)
                        except:
                            error_count += 1
                            continue  # skip reviews that cannot be parsed
                        adjs = review_dict.get("adjectives", None)
                        rating = int(review_dict.get("rating", -1))
                        if adjs:
                            doc_count += 1
                            bow_file.write(ujson.dumps(adjs.encode("utf-8")) + "\n")
                            review_docs.append(adjs.strip().split())
                            if (doc_count % docs_per_file) == 0:
                                if bow_file:
                                    bow_file.close()
                                file_num = str((doc_count / docs_per_file) + 1)
                                fname = os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category,
                                                     "{cat}_file_{file_num}.txt".format(cat=yelp_category, file_num=file_num))
                                bow_file = open(fname, 'w')
                            if rating:
                                if rating > 3:
                                    positive_docs.append(adjs.strip().split())
                                elif rating < 3:
                                    negative_docs.append(adjs.strip().split())
                                else:
                                    pass
            print("Wrote {total} docs in {cat} category".format(total=str(doc_count), cat=yelp_category))
            dictionary = Dictionary(review_docs)
            CATEGORY_SPECIFIC_DICT_DIR = os.path.join(SAVE_UNFILTERED_DICTIONARY_DIR, yelp_category)
            POSITIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR, "positive")
            NEGATIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR, "negative")
            if not (os.path.exists(CATEGORY_SPECIFIC_DICT_DIR) and os.path.isdir(CATEGORY_SPECIFIC_DICT_DIR)):
                os.makedirs(CATEGORY_SPECIFIC_DICT_DIR)
                os.makedirs(POSITIVE_SUB_DIR)
                os.makedirs(NEGATIVE_SUB_DIR)
            dictionary.save(os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                         "{yelp_category}_dict.dict".format(yelp_category=yelp_category)))
            dictionary.save_as_text(os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                                 "{yelp_category}_dict.txt".format(yelp_category=yelp_category)))
            sorted_doc_freqs = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
            # print("Will save file in:\n " + os.path.join(CATEGORY_SPECIFIC_DICT_DIR,"{yelp_category}_dict.txt".format(yelp_category=yelp_category)))
            with open(os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                   "{yelp_category}_words_doc_frequencies.txt".format(yelp_category=yelp_category)), 'w') as df_file:
                for (token_id, doc_freq) in sorted_doc_freqs:
                    df_file.write(str(dictionary.get(token_id, "Unknown").encode('utf-8')) + " " + str(doc_freq) + "\n")
            del dictionary
            del review_docs
            del sorted_doc_freqs
            pos_dictionary = Dictionary(positive_docs)
            del positive_docs
            neg_dictionary = Dictionary(negative_docs)
            del negative_docs
            pos_dictionary.save(os.path.join(POSITIVE_SUB_DIR,
                                             "{yelp_category}_pos_dict.dict".format(yelp_category=yelp_category)))
            pos_dictionary.save_as_text(os.path.join(POSITIVE_SUB_DIR,
                                                     "{yelp_category}_pos_dict.txt".format(yelp_category=yelp_category)))
            sorted_pos_doc_freqs = sorted(pos_dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
            with open(os.path.join(POSITIVE_SUB_DIR,
                                   "{yelp_category}_pos_words_doc_frequencies.txt".format(yelp_category=yelp_category)), 'w') as df_file:
                for (token_id, doc_freq) in sorted_pos_doc_freqs:
                    df_file.write(str(pos_dictionary.get(token_id, "Unknown").encode('utf-8')) + " " + str(doc_freq) + "\n")
            del pos_dictionary
            del sorted_pos_doc_freqs
            neg_dictionary.save(os.path.join(NEGATIVE_SUB_DIR,
                                             "{yelp_category}_neg_dict.dict".format(yelp_category=yelp_category)))
            neg_dictionary.save_as_text(os.path.join(NEGATIVE_SUB_DIR,
                                                     "{yelp_category}_neg_dict.txt".format(yelp_category=yelp_category)))
            sorted_neg_doc_freqs = sorted(neg_dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
            with open(os.path.join(NEGATIVE_SUB_DIR,
                                   "{yelp_category}_neg_words_doc_frequencies.txt".format(yelp_category=yelp_category)), 'w') as df_file:
                for (token_id, doc_freq) in sorted_neg_doc_freqs:
                    df_file.write(str(neg_dictionary.get(token_id, "Unknown").encode('utf-8')) + " " + str(doc_freq) + "\n")
            del neg_dictionary
            del sorted_neg_doc_freqs
            print("{count} {cat} reviews were discarded because of parsing errors".format(count=error_count, cat=yelp_category))
            print("Created dictionary for {cat} tokens".format(cat=yelp_category))
#creating dictionary and tfidf model
tokenized_corpus = RedditCorpus(path)
documents = list(itertools.chain(tokenized_corpus))
dct = Dictionary(documents)  # fit dictionary
corpus = [dct.doc2bow(doc) for doc in documents]  # convert dataset to BoW format
model = TfidfModel(corpus, id2word=dct)  # fit model
vector = model[corpus]

#write scores to files
subjects = []
file_glob = os.path.join(path, '*.txt')
subjects.extend(gfile.Glob(file_glob))
for step, subject in enumerate(subjects):  # enumerate so each subject maps to its document index
    doc = model[corpus[step]]
    d = {dct.get(token_id): tfidf for token_id, tfidf in doc}
    output_file = open(output_path + "\\" + subject.split("\\")[-1], "w")
    output_file.write(json.dumps(d))
    output_file.close()

#creating tf-idf matrix (num_terms must be known before building the dense matrix)
num_terms = len(model[corpus].obj.idfs)
tfidf_dense = matutils.corpus2dense(model[corpus], num_terms).T

#creating random forest classifier for extracting important features
X_train, X_test, y_train, y_test = train_test_split(tfidf_dense, labels, test_size=0.4, random_state=0)
clf = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=1)
clf.fit(X_train, y_train)
importances = clf.feature_importances_
sorted_i = sorted(importances)
y_pred = clf.predict(X_test)