def saveweightedtopspersent(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            path = ('resources/gensim/noadj/not_cleaned/' + keyword + '_' + emotion.lower()
                    + '/' + keyword + '_' + emotion.lower())
            try:
                lda = LdaModel.load(path)
                raw_corpus = helper.getRawCorpus(
                    csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                                  mode='r', encoding="utf8", newline='\n'),
                    id_and_country=True,
                    additionaldetails=True)
                stopwords = getStopwords(stopset)
                stwfromtfidf = list(TfidfVectorizer(stop_words='english').get_stop_words())
                stopwords = set(list(stopwords) + stwfromtfidf)
                for w in negationstopset:
                    stopwords.add(w)
                bow, dictionary, corpus, raw_corpus = documentprocessor.fullpreprocessrawcorpustobow(
                    raw_corpus, stopwords, min_count_bigrams=20)
                outdir = ('resources/gensim/noadj/outputtopsdocs/' + keyword + '_'
                          + emotion.lower() + '/')
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                csv_file = open(outdir + keyword + '_' + emotion.lower() + '.csv',
                                mode='w', encoding="utf8", newline='\n')
                writer = csv.writer(csv_file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                # One row per document: original review details, preprocessed text,
                # and the per-document topic distribution from the loaded LDA model.
                for i, val in enumerate(lda.get_document_topics(bow)):
                    writer.writerow(raw_corpus[i] + [corpus[i], val])
                csv_file.close()
            except Exception as e:
                print(e)
def dividebynation(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            nationcluster = {}
            try:
                basedir = ('resources/gensim/noadj/outputtopsdocs/' + keyword + '_'
                           + emotion.lower() + '/')
                csv_file = open(basedir + keyword + '_' + emotion.lower() + '.csv',
                                mode='r', encoding="utf8", newline='\n')
                reader = csv.reader(csv_file, delimiter='|', quotechar='"')
                # Group rows by the nationality column (index 1).
                for row in reader:
                    nat = row[1]
                    if nat not in nationcluster:
                        nationcluster[nat] = []
                    nationcluster[nat].append(row)
                csv_file.close()
                if not os.path.exists(basedir + 'bycountry/'):
                    os.makedirs(basedir + 'bycountry/')
                for nat in nationcluster:
                    csv_file = open(basedir + 'bycountry/' + nat + '.csv',
                                    mode='w', encoding="utf8", newline='\n')
                    writer = csv.writer(csv_file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    for r in nationcluster[nat]:
                        writer.writerow(r)
                    csv_file.close()
            except Exception as e:
                print(e)
def do(self, originfile, tf, includingkeword, negation):
    keywords = helper.getKeywords(originfile)
    # Build the stopword list: the custom stopset plus sklearn's English stopwords.
    stopwords = self.getStopwords(self.stopset)
    stwfromtfidf = list(TfidfVectorizer(stop_words='english').get_stop_words())
    stopwords = set(list(stopwords) + stwfromtfidf)
    if negation == 'withnegation':
        for w in self.negationstopset:
            stopwords.add(w)
    elif negation == 'nonegation':
        for w in self.negationstopset:
            stopwords.discard(w)  # no-op if the word is already absent
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in list(keywords.keys()):
            print(keyword)
            raw_corpus = helper.getRawCorpus(
                csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                              mode='r', encoding="utf8", newline='\n'),
                id_and_country=True)
            raw_corpus, corpus = helper.preprocessRawCorpus(
                raw_corpus, thresholdcountpernation=100)
            self.doKaggle(corpus, stopwords, keyword, emotion,
                          tfname=tf, includingkeywordname=includingkeword,
                          negationname=negation)
            # self.doBasicGensim(originfile, corpus)
            # self.doTWds(originfile, corpus)
def filterallsep(originfile, toptokens=False, all=False):
    keywords = helper.getKeywords(originfile)
    old_cont_index = indexmanager.get_hotel_country_index()
    old_tok_index = indexmanager.get_token_index()
    if not all:
        lkwds = list(keywords.keys())
        frequencecell = 9
        topnrange = range(10, 51)
    else:
        lkwds = ['all']
        frequencecell = 7
        topnrange = range(100, 101)
        res_file = 'resources/bow/tourist_hotel_country_freq/diff/all.csv'
        target_dir = 'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
    if toptokens:
        for keyword in lkwds:
            start_time = time.time()
            for topn in topnrange:
                if not all:
                    # These paths depend on the current keyword and topn, so they
                    # are built inside the loop.
                    res_file = ('./resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                                + keyword + '/' + keyword + '_top_' + str(topn) + '_tokens.csv')
                    target_dir = ('resources/bow/tourist_hotel_country_freq/diff/filtered/'
                                  'all_separetely/topntokens/' + keyword + '/')
                    target_file = target_dir + keyword + '_top_' + str(topn) + '_tokens.csv'
                goforward = True
                lines = []
                tokens = set()
                try:
                    with open(res_file) as csv_file:
                        csv_reader = csv.reader(csv_file, delimiter='|')
                        next(csv_reader)  # skip header
                        for row in csv_reader:
                            lines.append([row[0], row[2], row[4], row[5], row[frequencecell]])
                            tokens.add(row[5])
                except Exception:
                    goforward = False
                if goforward:
                    # Re-index the surviving tokens densely from 1, keeping a map
                    # from the old token index to the new one.
                    tokens = list(tokens)
                    token_index = dict()
                    old_tok_to_new = dict()
                    for i in range(1, len(tokens) + 1):
                        token_index[i] = old_tok_index['index_to_token'][int(tokens[i - 1])]
                        old_tok_to_new[int(tokens[i - 1])] = i
                    if not os.path.exists(target_dir):
                        os.makedirs(target_dir)
                    with open(target_file, mode='w') as file:
                        writer = csv.writer(file, delimiter='|', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                        writer.writerow(['country_origin_index', 'country_destination_index',
                                         'number unique reviews', 'token_index',
                                         'frequence_difference'])
                        for line in lines:
                            writer.writerow([int(line[0]), int(line[1]), int(line[2]),
                                             old_tok_to_new[int(line[3])], line[4]])
                    if not all:
                        target_file_token_index = (target_dir + keyword + '_top_' + str(topn)
                                                   + '_tokens_token_index.csv')
                        with open(target_file_token_index, mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                quoting=csv.QUOTE_MINIMAL)
                            writer.writerow(['token_index', 'token'])
                            for key in token_index:
                                writer.writerow([key, token_index[key]])
                print('------------------------------------------------------')
                print(str(time.time() - start_time) + ' seconds to filter ' + keyword
                      + ', top ' + str(topn) + ' tokens')
    else:
        for keyword in lkwds:
            lines = []
            combs = dict()
            origins_to_dect = dict()
            goforward = True
            start_time = time.time()
            tokens = set()
            try:
                with open(res_file) as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    next(csv_reader)  # skip header
                    for row in csv_reader:
                        if (int(row[4]) >= 0 and row[1] != ''
                                and row[1] != 'no_country' and row[3] != 'no_country'):
                            lines.append([row[0], row[2], row[4], row[5], row[frequencecell]])
                            if (row[0], row[2]) not in combs:
                                combs[(row[0], row[2])] = set()
                            if row[0] not in origins_to_dect:
                                origins_to_dect[row[0]] = set()
                            origins_to_dect[row[0]].add(row[2])
                            combs[(row[0], row[2])].add(row[5])
                            tokens.add(row[5])
            except Exception:
                goforward = False
            if goforward:
                # Dense 1-based re-indexing for both countries and tokens.
                countries = set(origins_to_dect.keys()).union(
                    set.union(*[x for x in origins_to_dect.values()]))
                countries = list(countries)
                country_index = dict()
                old_cont_to_new = dict()
                for i in range(1, len(countries) + 1):
                    country_index[i] = old_cont_index['index_to_country'][int(countries[i - 1])]
                    old_cont_to_new[int(countries[i - 1])] = i
                tokens = list(tokens)
                token_index = dict()
                old_tok_to_new = dict()
                for i in range(1, len(tokens) + 1):
                    token_index[i] = old_tok_index['index_to_token'][int(tokens[i - 1])]
                    old_tok_to_new[int(tokens[i - 1])] = i
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                print("starting writing csv")
                with open(target_dir + keyword + '.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['country_origin_index', 'country_destination_index',
                                     'number unique reviews', 'token_index',
                                     'frequence_difference'])
                    print("len lines= " + str(len(lines)))
                    for i, line in enumerate(lines, start=1):
                        if i % 100000 == 0:
                            print(i)
                        writer.writerow([old_cont_to_new[int(line[0])],
                                         old_cont_to_new[int(line[1])],
                                         int(line[2]),
                                         old_tok_to_new[int(line[3])],
                                         line[4]])
                print("starting writing country index")
                with open(target_dir + keyword + '_country_index.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['country_index', 'country'])
                    for key in country_index:
                        writer.writerow([key, country_index[key]])
                print("starting writing token index")
                with open(target_dir + keyword + '_token_index.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['token_index', 'token'])
                    for key in token_index:
                        writer.writerow([key, token_index[key]])
                print('------------------------------------------------------')
                print(str(time.time() - start_time) + ' seconds to filter ' + keyword)
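
# The dense re-indexing idiom above (and in filter() further down) maps the
# surviving sparse ids to 1..N while keeping both directions of the mapping.
# A minimal standalone sketch; the names and toy data here are invented for
# illustration only:
def reindex_example():
    old_index_to_label = {7: 'pool', 42: 'breakfast', 99: 'wifi'}
    surviving = {42, 7, 99}
    new_index = {}   # new id -> label
    old_to_new = {}  # old id -> new id
    # sorted() is used only to make this toy example deterministic; the
    # functions above iterate over a list built from a set instead.
    for i, old_id in enumerate(sorted(surviving), start=1):
        new_index[i] = old_index_to_label[old_id]
        old_to_new[old_id] = i
    assert old_to_new[42] == 2 and new_index[2] == 'breakfast'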
def build_association_count_list(originfile):
    lines = []
    lines_reduced = []
    ass = dict()
    keywords = helper.getKeywords(originfile)
    combs = dict()
    ass_reduced = dict()
    combs_reduced = dict()
    for keyword in list(keywords.keys()):
        if keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
            ass[keyword] = dict()
            combs[keyword] = dict()
            ass_reduced[keyword] = dict()
            combs_reduced[keyword] = dict()
            with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
                      + keyword + '.csv') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter='|')
                next(csv_reader)
                for row in csv_reader:
                    if row[0] not in ass[keyword]:
                        ass[keyword][row[0]] = set()
                    ass[keyword][row[0]].add(row[1])
                    if (row[0], row[1]) not in combs[keyword]:
                        combs[keyword][(row[0], row[1])] = set()
                    combs[keyword][(row[0], row[1])].add(row[2])
            with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
                      + keyword + '.csv') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter='|')
                next(csv_reader)
                for row in csv_reader:
                    if row[0] not in ass_reduced[keyword]:
                        ass_reduced[keyword][row[0]] = set()
                    ass_reduced[keyword][row[0]].add(row[1])
                    if (row[0], row[1]) not in combs_reduced[keyword]:
                        combs_reduced[keyword][(row[0], row[1])] = set()
                    combs_reduced[keyword][(row[0], row[1])].add(row[2])
    # Check that every concept sees the same set of origin countries.
    b = True
    v = set([[k for k in ass[keyword].keys()] for keyword in ass.keys()][0])
    for o in [[k for k in ass[keyword].keys()] for keyword in ass.keys()]:
        if set(o) != v:
            b = False
            break
    lines.append('all possible origins are the same number and the same = ' + str(b) + '\n')
    # Check that every origin sees the same destination set across concepts.
    v = [[ass[keyword][k] for k in ass[keyword].keys()] for keyword in ass.keys()][0][0]
    b = True
    for d in [[ass[keyword][k] for k in ass[keyword].keys()] for keyword in ass.keys()]:
        for dd in d:
            if dd != v:
                b = False
                break
    lines.append('all possible destinations are the same number and the same = ' + str(b) + '\n')
    lines.append('all origins are: ' + str(set([k for k in ass['breakfast'].keys()])) + '\n')
    lines.append('all destinations are: '
                 + str([ass['breakfast'][k] for k in ass['breakfast'].keys()][0]) + '\n')
    for keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
        b = True
        toksetz = [combs[keyword][c] for c in combs[keyword].keys()][0]
        for tokset in [combs[keyword][c] for c in combs[keyword].keys()]:
            if tokset != toksetz:
                b = False
                break
        lines.append("for concept " + keyword
                     + ', for every combination origin/destination, all the tokens are the same = '
                     + str(b) + '\n')
        lines.append("for concept " + keyword
                     + ' the length of the list of tokens for the first combination'
                       ' origin/destination is ' + str(len(toksetz)) + '\n')
    # Same checks on the reduced tables.
    b = True
    v = set([[k for k in ass_reduced[keyword].keys()] for keyword in ass_reduced.keys()][0])
    for o in [[k for k in ass_reduced[keyword].keys()] for keyword in ass_reduced.keys()]:
        if set(o) != v:
            b = False
            break
    lines_reduced.append('all possible origins are the same number and the same = '
                         + str(b) + '\n')
    v = [[ass_reduced[keyword][k] for k in ass_reduced[keyword].keys()]
         for keyword in ass_reduced.keys()][0][0]
    b = True
    for d in [[ass_reduced[keyword][k] for k in ass_reduced[keyword].keys()]
              for keyword in ass_reduced.keys()]:
        for dd in d:
            if dd != v:
                b = False
                break
    lines_reduced.append('all possible destinations are the same number and the same = '
                         + str(b) + '\n')
    lines_reduced.append('all origins are: '
                         + str(set([k for k in ass_reduced['breakfast'].keys()])) + '\n')
    lines_reduced.append('all destinations are: '
                         + str([ass_reduced['breakfast'][k]
                                for k in ass_reduced['breakfast'].keys()][0]) + '\n')
    for keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
        b = True
        toksetz = [combs_reduced[keyword][c] for c in combs_reduced[keyword].keys()][0]
        for tokset in [combs_reduced[keyword][c] for c in combs_reduced[keyword].keys()]:
            if tokset != toksetz:
                b = False
                break
        lines_reduced.append("for concept " + keyword
                             + ', for every combination origin/destination, all the tokens'
                               ' are the same = ' + str(b) + '\n')
        lines_reduced.append("for concept " + keyword
                             + ' the length of the list of tokens for the first combination'
                               ' origin/destination is ' + str(len(toksetz)) + '\n')
    with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/report.txt',
              'w') as file:
        file.writelines(lines)
    with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/report.txt',
              'w') as file:
        file.writelines(lines_reduced)
def filter(originfile):
    keywords = helper.getKeywords(originfile)
    countries = dict()
    tokens = dict()
    countries['origin'] = dict()
    countries['destination'] = dict()
    lines_dict = dict()
    lines_reduced_dict = dict()
    intersect_tokens = set()
    intersect_countries_origin = set()
    intersect_countries_dest = set()
    validkeywords = []
    ass_count_count = dict()
    ass_count_count['origin_tourist'] = dict()
    ass_count_count['destination_hotel'] = dict()
    # Per-concept k: an origin must reach at least k destinations to count.
    k_values = dict()
    k_values['breakfast'] = 6
    k_values['bedroom'] = 5
    k_values['bathroom'] = 4
    k_values['location'] = 13
    for keyword in list(keywords.keys()):
        if keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
            ass_count_count['origin_tourist'][keyword] = dict()
            ass_count_count['destination_hotel'][keyword] = dict()
            countries['origin'][keyword] = set()
            countries['destination'][keyword] = set()
            tokens[keyword] = set()
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            lines = []
            lines_reduced = []
            try:
                with open('resources/bow/tourist_hotel_country_freq/diff/' + keyword
                          + '.csv') as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    next(csv_reader)
                    for row in csv_reader:
                        if (int(row[4]) >= 100 and row[1] != ''
                                and row[1] != 'no_country' and row[3] != 'no_country'):
                            lines.append([row[0], row[2], row[5], row[9]])
                            countries['origin'][keyword].add(row[0])
                            countries['destination'][keyword].add(row[2])
                            tokens[keyword].add(row[5])
                        if (int(row[4]) >= 20 and row[1] != ''
                                and row[1] != 'no_country' and row[3] != 'no_country'):
                            lines_reduced.append([row[0], row[2], row[5], row[9]])
            except Exception:
                goforward = False
            if goforward:
                validkeywords.append(keyword)
                lines_dict[keyword] = lines
                lines_reduced_dict[keyword] = lines_reduced
                if len(list(intersect_tokens)) == 0:
                    intersect_tokens = tokens[keyword]
                if len(list(intersect_countries_origin)) == 0:
                    intersect_countries_origin = countries['origin'][keyword]
                if len(list(intersect_countries_dest)) == 0:
                    intersect_countries_dest = countries['destination'][keyword]
                intersect_tokens = intersect_tokens.intersection(tokens[keyword])
                intersect_countries_origin = intersect_countries_origin.intersection(
                    countries['origin'][keyword])
                intersect_countries_dest = intersect_countries_dest.intersection(
                    countries['destination'][keyword])
                ass_sep = dict()
                for line in lines:
                    if line[0] not in ass_sep:
                        ass_sep[line[0]] = set()
                    ass_sep[line[0]].add(line[1])
                k = k_values[keyword]
                # Destinations reached by every origin that reaches at least k
                # destinations; then the origins covering all of them (set >= is
                # the superset test); then the destinations those origins share.
                destinations_sep = set.intersection(
                    *[ass_sep[key] for key in ass_sep.keys() if len(ass_sep[key]) >= k])
                origins_sep = set(
                    [key for key in ass_sep.keys() if ass_sep[key] >= destinations_sep])
                newdestinations_sep = set.intersection(*[ass_sep[o] for o in origins_sep])
                target = ('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
                          'concept_separetely/')
                if not os.path.exists(target):
                    os.makedirs(target)
                with open(target + keyword + '.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['country_origin_index', 'country_destination_index',
                                     'token_index', 'frequence_difference'])
                    for line in lines_dict[keyword]:
                        if line[0] in origins_sep and line[1] in newdestinations_sep:
                            writer.writerow(line)
                print('------------------------------------------------------')
                print(str(time.time() - start_time) + ' seconds to filter ' + keyword)
    if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/filtered/'):
        os.makedirs('resources/bow/tourist_hotel_country_freq/diff/filtered/')
    if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'):
        os.makedirs('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/')
    if not os.path.exists(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'):
        os.makedirs('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/')
    lines = [[line for line in lines_dict[keyword]] for keyword in validkeywords]
    lines_reduced = [[line for line in lines_reduced_dict[keyword]] for keyword in validkeywords]
    ass = dict()
    ass_reduced = dict()
    tokens = set()
    # ass maps each concept position to {origin: set(destinations)}.
    for idx, line in enumerate(lines):
        ass[idx] = dict()
        for l in line:
            if l[0] not in ass[idx]:
                ass[idx][l[0]] = set()
            ass[idx][l[0]].add(l[1])
    tokens_reduced = set()
    for idx, line in enumerate(lines_reduced):
        ass_reduced[idx] = dict()
        for l in line:
            if l[0] not in ass_reduced[idx]:
                ass_reduced[idx][l[0]] = set()
            ass_reduced[idx][l[0]].add(l[1])
    k = 7
    destinations = set.intersection(*[
        set.intersection(*[ass[kw][key] for key in ass[kw].keys() if len(ass[kw][key]) >= k])
        for kw in ass.keys()
    ])
    origins = set.intersection(*[
        set([key for key in ass[kw].keys() if ass[kw][key] >= destinations])
        for kw in ass.keys()
    ])
    newdestinations = set.intersection(*[
        set.intersection(*[ass[kw][o] for o in origins]) for kw in ass.keys()
    ])
    for keyword in lines_dict.keys():
        for line in lines_dict[keyword]:
            if line[0] in origins and line[1] in newdestinations:
                tokens.add(line[2])
    k = 12
    destinations_reduced = set.intersection(*[
        set.intersection(*[ass_reduced[kw][key] for key in ass_reduced[kw].keys()
                           if len(ass_reduced[kw][key]) >= k])
        for kw in ass_reduced.keys()
    ])
    origins_reduced = set.intersection(*[
        set([key for key in ass_reduced[kw].keys()
             if ass_reduced[kw][key] >= destinations_reduced])
        for kw in ass_reduced.keys()
    ])
    newdestinations_reduced = set.intersection(*[
        set.intersection(*[ass_reduced[kw][o] for o in origins_reduced])
        for kw in ass.keys()
    ])
    for keyword in lines_reduced_dict.keys():
        for line in lines_reduced_dict[keyword]:
            if line[0] in origins_reduced and line[1] in newdestinations_reduced:
                tokens_reduced.add(line[2])
    token_index = dict()
    country_index = dict()
    token_index_reduced = dict()
    country_index_reduced = dict()
    old_cont_index = indexmanager.get_hotel_country_index()
    old_tok_index = indexmanager.get_token_index()
    country_list = list(newdestinations.union(origins))
    old_cont_to_new = dict()
    old_tok_to_new = dict()
    old_cont_to_new_reduced = dict()
    old_tok_to_new_reduced = dict()
    tokenlist = list(tokens)
    tokenlist_reduced = list(tokens_reduced)
    country_list_reduced = list(newdestinations_reduced.union(origins_reduced))
    for i in range(1, len(country_list) + 1):
        country_index[i] = old_cont_index['index_to_country'][int(country_list[i - 1])]
        old_cont_to_new[int(country_list[i - 1])] = i
    for i in range(1, len(tokenlist) + 1):
        token_index[i] = old_tok_index['index_to_token'][int(tokenlist[i - 1])]
        old_tok_to_new[int(tokenlist[i - 1])] = i
    for i in range(1, len(country_list_reduced) + 1):
        country_index_reduced[i] = old_cont_index['index_to_country'][
            int(country_list_reduced[i - 1])]
        old_cont_to_new_reduced[int(country_list_reduced[i - 1])] = i
    for i in range(1, len(tokenlist_reduced) + 1):
        token_index_reduced[i] = old_tok_index['index_to_token'][int(tokenlist_reduced[i - 1])]
        old_tok_to_new_reduced[int(tokenlist_reduced[i - 1])] = i
    with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/token_index.csv',
              mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for k in sorted(list(token_index.keys())):
            writer.writerow([k, token_index[k]])
    with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/country_index.csv',
              mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for k in sorted(list(country_index.keys())):
            writer.writerow([k, country_index[k]])
    for keyword in validkeywords:
        with open('resources/bow/tourist_hotel_country_freq/diff/filtered/' + keyword + '.csv',
                  mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['country_origin_index', 'country_destination_index',
                             'token_index', 'frequence_difference'])
            for line in lines_dict[keyword]:
                if (line[0] in intersect_countries_origin
                        and line[1] in intersect_countries_dest):
                    writer.writerow(line)
        with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
                  + keyword + '.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['country_origin_index', 'country_destination_index',
                             'token_index', 'frequence_difference'])
            for line in lines_dict[keyword]:
                if line[0] in origins and line[1] in newdestinations:
                    writer.writerow([old_cont_to_new[int(line[0])],
                                     old_cont_to_new[int(line[1])],
                                     old_tok_to_new[int(line[2])],
                                     line[3]])
        with open('resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
                  + keyword + '.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['country_origin_index', 'country_destination_index',
                             'token_index', 'frequence_difference'])
            for line in lines_reduced_dict[keyword]:
                if line[0] in origins_reduced and line[1] in newdestinations_reduced:
                    writer.writerow(line)
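
# filter() leans on Python set algebra: `a >= b` is the superset test. A toy
# sketch of the origin/destination selection above, with data invented purely
# for illustration:
def set_filter_example():
    ass = {'IT': {'FR', 'DE', 'ES'}, 'UK': {'FR', 'DE'}, 'US': {'FR'}}
    k = 2
    # Destinations reached by every origin that reaches at least k of them.
    destinations = set.intersection(*[d for d in ass.values() if len(d) >= k])
    # Origins whose destination set covers all of those destinations.
    origins = {o for o, d in ass.items() if d >= destinations}
    assert destinations == {'FR', 'DE'} and origins == {'IT', 'UK'}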
def do(originfile, all=False, common_tokens=True):
    if all:
        tokenset = set()
        alltable = read_table_all('resources/bow/tourist_hotel_country_freq/all.csv')
        diff_table = {}
        for countries in alltable.keys():
            diff_table[countries] = {}
            diff_table[countries]['tokens'] = {}
            diff_table[countries]['unique_reviews'] = alltable[countries]['unique_reviews']
            diff_table[countries]['count_rev'] = len(
                list(diff_table[countries]['unique_reviews']))
            for tok in alltable[countries]['tokens'].keys():
                tokenset.add(tok)
                diff_table[countries]['tokens'][tok] = {}
                diff_table[countries]['tokens'][tok]['diff'] = alltable[countries]['tokens'][tok]
        indexmanager.update_token_index(tokenset)
        print("start writing difference matrix for all matrix")
        country_ind = indexmanager.get_hotel_country_index()
        if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/'):
            os.makedirs('resources/bow/tourist_hotel_country_freq/diff/')
        with open('resources/bow/tourist_hotel_country_freq/diff/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(['Tourist_Country_Index', 'Tourist_Country', 'Hotel_Country_Index',
                             'Hotel_Country', 'Total number of unique reviews', 'Token_Index',
                             'Token', 'Token_Frequence'])
            token_index = indexmanager.get_token_index()
            print("num_comb_countries= " + str(len(diff_table.keys())))
            i = 0
            for countries in diff_table.keys():
                i += 1
                if i % 1000 == 0:
                    print(str(i) + ' '
                          + str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())))
                for tok in diff_table[countries]['tokens'].keys():
                    writer.writerow([
                        country_ind['country_to_index'][countries[0]], countries[0],
                        country_ind['country_to_index'][countries[1]], countries[1],
                        diff_table[countries]['count_rev'],
                        token_index['token_to_index'][tok], tok,
                        "{:.15f}".format(diff_table[countries]['tokens'][tok]['diff'])
                    ])
        print("over. written difference file")
    else:
        keywords = helper.getKeywords(originfile)
        tokenset = set()
        diff_tables = {}
        diff_tables_topntokens = {}
        validkeywords = []
        for keyword in keywords.keys():
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            try:
                good_tab = read_table('resources/bow/tourist_hotel_country_freq/'
                                      + keyword + '_good.csv')
                bad_table = read_table('resources/bow/tourist_hotel_country_freq/'
                                       + keyword + '_bad.csv')
            except Exception:
                goforward = False
            if goforward:
                validkeywords.append(keyword)
                # diff_table = get_diff_table(good_tab, bad_table, tokenset,
                #                             common_tokens=common_tokens)
                # diff_tables[keyword] = diff_table
                diff_tables_topntokens[keyword] = {}
                for topntokens in range(10, 51):
                    diff_tables_topntokens[keyword][topntokens] = get_diff_table(
                        good_tab, bad_table, tokenset, common_tokens=True,
                        topntokens=topntokens)
                print('------------------------------------------------------')
                print(str(time.time() - start_time)
                      + ' seconds to build the difference table for ' + keyword)
        print("start writing difference matrices")
        # indexmanager.build_token_index(tokenset)
        token_index = indexmanager.get_token_index()
        if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/topntokens/'):
            os.makedirs('resources/bow/tourist_hotel_country_freq/diff/topntokens/')
        for keyword in validkeywords:
            start_time = time.time()
            print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            # country_tourist_ind = indexmanager.get_tourist_country_index()
            country_ind = indexmanager.get_hotel_country_index()
            if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/'):
                os.makedirs('resources/bow/tourist_hotel_country_freq/diff/')
            if not os.path.exists('resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                                  + keyword + '/'):
                os.makedirs('resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                            + keyword + '/')
            '''with open('resources/bow/tourist_hotel_country_freq/diff/' + keyword + '.csv',
                      mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(['Tourist_Country_Index', 'Tourist_Country',
                                 'Hotel_Country_Index', 'Hotel_Country',
                                 'Total number of unique reviews', 'Token_Index', 'Token',
                                 'Token_Frequence_in_Good', 'Token_Frequence_in_Bad',
                                 'Difference'])
                for countries in diff_tables[keyword].keys():
                    for tok in diff_tables[keyword][countries]['tokens'].keys():
                        goodval = diff_tables[keyword][countries]['tokens'][tok]['good']
                        if goodval != 'N/A':
                            goodval = "{:.15f}".format(goodval)
                        badval = diff_tables[keyword][countries]['tokens'][tok]['bad']
                        if badval != 'N/A':
                            badval = "{:.15f}".format(badval)
                        writer.writerow([country_ind['country_to_index'][countries[0]],
                                         countries[0],
                                         country_ind['country_to_index'][countries[1]],
                                         countries[1],
                                         diff_tables[keyword][countries]['count_rev'],
                                         token_index['token_to_index'][tok], tok,
                                         goodval, badval,
                                         "{:.15f}".format(
                                             diff_tables[keyword][countries]['tokens'][tok]['diff'])])
            file.close()'''
            for topntokens in diff_tables_topntokens[keyword].keys():
                with open('resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                          + keyword + '/' + keyword + '_top_' + str(topntokens)
                          + '_tokens.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['Tourist_Country_Index', 'Tourist_Country',
                                     'Hotel_Country_Index', 'Hotel_Country',
                                     'Total number of unique reviews', 'Token_Index', 'Token',
                                     'Token_Frequence_in_Good', 'Token_Frequence_in_Bad',
                                     'Difference'])
                    table = diff_tables_topntokens[keyword][topntokens]
                    for countries in table.keys():
                        for tok in table[countries]['tokens'].keys():
                            goodval = table[countries]['tokens'][tok]['good']
                            if goodval != 'N/A':
                                goodval = "{:.15f}".format(goodval)
                            badval = table[countries]['tokens'][tok]['bad']
                            if badval != 'N/A':
                                badval = "{:.15f}".format(badval)
                            writer.writerow([
                                country_ind['country_to_index'][countries[0]], countries[0],
                                country_ind['country_to_index'][countries[1]], countries[1],
                                table[countries]['count_rev'],
                                token_index['token_to_index'][tok], tok, goodval, badval,
                                "{:.15f}".format(table[countries]['tokens'][tok]['diff'])
                            ])
            print(str(time.time() - start_time)
                  + ' seconds to write the difference matrix for ' + keyword)
import csv
import os
import time

import db
import helper

conn = db.db_connection()
conn.connect()
dbo = db.db_operator(conn)
keywords = helper.getKeywords('booking_keywords.txt')
diff_tables = {}
validkeywords = []
cd = os.getcwd()
'''query = 'CREATE TABLE masterthesis.' + keyword + '_diff_filtered_intersection_only ' + \
        '(Country_of_origin VARCHAR(45) NOT NULL, Country_of_destination VARCHAR(45) NOT NULL, ' \
        'Token_index SMALLINT NOT NULL, Frequence_difference VARCHAR(45),' \
        ' PRIMARY KEY (Country_of_origin, Country_of_destination,Token_index));'
dbo.execute(query)
#with open('resources/bow/tourist_hotel_country_freq/diff/filtered' + keyword + '_'+emotion+'.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter='|')
firstrow = next(csv_reader)
csv_file.close()
firstrow = firstrow[3:]'''
'''for field in firstrow:'''
def do(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            start_time = time.time()
            print(keyword)
            raw_corpus = helper.getRawCorpus(
                csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                              mode='r', encoding="utf8", newline='\n'),
                id_and_country=True)
            print("starting preprocessing")
            stopwords = getStopwords(stopset)
            stwfromtfidf = list(TfidfVectorizer(stop_words='english').get_stop_words())
            stopwords = set(list(stopwords) + stwfromtfidf)
            for w in negationstopset:
                stopwords.add(w)
            bow, dictionary, corpus, raw_corpus = documentprocessor.fullpreprocessrawcorpustobow(
                raw_corpus, stopwords, min_count_bigrams=20)
            ###############################################################################
            # Let's see how many tokens and documents we have to train on.
            print('Number of unique tokens: %d' % len(dictionary))
            print('Number of documents: %d' % len(bow))
            ###############################################################################
            # Training
            # --------
            #
            # First, the elephant in the room: how many topics do I need? There is no
            # easy answer; it depends on both your data and your application. You might
            # not need to interpret all your topics, in which case a large number of
            # topics (for example 100) can work; here we search a small range and keep
            # the best-scoring model.
            #
            # ``chunksize`` controls how many documents are processed at a time during
            # training. Increasing it speeds up training as long as the chunk of
            # documents fits into memory. ``chunksize = 2000`` is more than the number
            # of documents here, so all the data is processed in one go. Chunksize can
            # influence model quality, as discussed in Hoffman et al. [2], but the
            # difference was not substantial in this case.
            #
            # ``passes`` controls how often we train on the entire corpus (think
            # "epochs"). ``iterations`` controls how often we repeat the inner loop
            # over each document. It is important to set both high enough. One way to
            # choose them: enable logging, set ``eval_every = 1`` in ``LdaModel``, and
            # look for log lines like::
            #
            #   2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations
            #
            # With ``passes = 20`` that line appears 20 times; by the final passes most
            # documents should have converged.
            #
            # ``alpha = 'auto'`` and ``eta = 'auto'`` make the model learn these two
            # priors automatically instead of requiring explicit values.
            #
            # Train LDA model.
            from gensim.models import LdaModel
            bestacc = -1
            bestmodel = None
            cc = []  # average topic coherence per topic count, kept for inspection
            if len(bow) > 0:
                print("starting training and checking with different number of topics")
                for numt in range(2, 21):
                    # Set training parameters.
                    num_topics = numt
                    chunksize = 2000
                    passes = 20
                    iterations = 400
                    eval_every = None  # Don't evaluate model perplexity; takes too much time.
                    # Make an index-to-word dictionary.
                    temp = dictionary[0]  # This is only to "load" the dictionary.
                    id2word = dictionary.id2token
                    model = LdaModel(corpus=bow, id2word=id2word, chunksize=chunksize,
                                     alpha='auto', eta='auto', iterations=iterations,
                                     num_topics=num_topics, passes=passes,
                                     eval_every=eval_every)
                    ###############################################################################
                    # We can compute the topic coherence of each topic; below we track the
                    # average topic coherence. Note that we use the "UMass" topic coherence
                    # measure (see :py:func:`gensim.models.ldamodel.LdaModel.top_topics`);
                    # gensim also implements the "AKSW" coherence measure (see the blog post
                    # at http://rare-technologies.com/what-is-topic-coherence/).
                    #
                    # The topics are not without flaws: there can be substantial overlap
                    # between some topics, others are hard to interpret, and most have at
                    # least some out-of-place terms. See also
                    # http://rare-technologies.com/lda-training-tips/ .
                    top_topics = model.top_topics(bow)  # , num_words=20)
                    acc = computetopacc(top_topics)
                    if acc > bestacc:
                        print("found better model with number of topics: "
                              + str(model.num_topics))
                        bestacc = acc
                        bestmodel = copy.deepcopy(model)
                    # Average topic coherence is the sum of topic coherences of all
                    # topics, divided by the number of topics.
                    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
                    cc.append(avg_topic_coherence)
                    print('Average topic coherence: %.4f.' % avg_topic_coherence)
                savemodel(bestmodel, keyword, emotion, bow)
            print(str(time.time() - start_time) + ' seconds to compute '
                  + keyword + ' ' + emotion)
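
# computetopacc() and savemodel() are defined elsewhere in the project. As a
# rough, purely illustrative stand-in (not the original implementation):
# LdaModel.top_topics returns (topic, coherence) pairs, so a candidate model
# could be scored by its mean topic coherence.
def computetopacc_sketch(top_topics):
    if not top_topics:
        return float('-inf')
    return sum(coherence for _, coherence in top_topics) / len(top_topics)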
def do(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in list(keywords.keys()):
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            try:
                with open('resources/bow/' + keyword + '_' + emotion.lower()
                          + '.csv') as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    tokens, country_cluster_hotel, country_cluster_tourist = cluster(csv_reader)
            except Exception:
                goforward = False
            if goforward:
                if not os.path.exists('resources/bow/country_freq/byhotelcountry/'):
                    os.makedirs('resources/bow/country_freq/byhotelcountry/')
                if not os.path.exists('resources/bow/country_freq/bytouristcountry/'):
                    os.makedirs('resources/bow/country_freq/bytouristcountry/')
                with open('resources/bow/country_freq/byhotelcountry/' + keyword + '_'
                          + emotion.lower() + '.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([''] * 2 + tokens)
                    for country in country_cluster_hotel.keys():
                        writer.writerow([country]
                                        + [country_cluster_hotel[country]['count_rev']]
                                        + list(map("{:.15f}".format,
                                                   country_cluster_hotel[country]['rel_freq'])))
                with open('resources/bow/country_freq/bytouristcountry/' + keyword + '_'
                          + emotion.lower() + '.csv', mode='w') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([''] * 2 + tokens)
                    for country in country_cluster_tourist.keys():
                        writer.writerow([country]
                                        + [country_cluster_tourist[country]['count_rev']]
                                        + list(map("{:.15f}".format,
                                                   country_cluster_tourist[country]['rel_freq'])))
                print('------------------------------------------------------')
                print(str(time.time() - start_time) + ' seconds to compute '
                      + keyword + ' ' + emotion)
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    # Start the Stanford CoreNLP server, killing anything already bound to :9000.
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = ('java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer '
           '-annotators "tokenize,ssplit,pos,lemma,parse,sentiment" '
           '-port 9000 -timeout 10000000000000 &')
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all = []
        '''for i in range(1790):
            print('i=' + str(i))
            print("limit= 10000")
            print("offset= " + str(10000 * i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset ' + str(10000 * i) + ';'
            results = [list(x) for x in dbo.execute(query)]
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                               initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all,
                                            [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok = []
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                       initargs=(counter, spell, nlp_wrapper,), )
                        c = pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        pool.close()
                        pool.terminate()
                        pool.join()
                    except TimeoutError:
                        print(str(doc) + " caused Exception")
                        pool.close()
                        pool.terminate()
                        pool.join()
                        c = [None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= ' + str(len(corpus_tok)))
            corpus_tok_all += corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i % 100 == 0 and i != 0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                corpus_tok_all = []'''
        '''corpus_tok_all = []
        i = 0
        kk = set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                # if i % 10000 == 0: break
                ar = ((row[0].replace('[', '')).replace(']', '')).split(',')
                if ar[1][-1] != "'":  # France, Metro.
                    ar[1] = ar[1] + ',' + ar[2]
                    for j in range(2, len(ar) - 1):
                        ar[j] = ar[j + 1]
                    del ar[len(ar) - 1]
                ar[1] = ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev = ''.join(ar[3:])
                revlist = ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[', '')).replace("'", ''))
                          .replace(" ", '')).split(',')
                r = (revlist, tokens)
                k = ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        corpus_tok = corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc = len(corpus_tok)
        print("corpus_tok len = " + str(lenc))
        for idx in range(lenc):
            if idx % 100000 == 0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(corpus_tok)
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")'''
        '''all_set = set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword == 'pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                                      mode='r', encoding="utf8", newline='\n'),
                        additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only,
                                                [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    # (bigram explanation: see the identical comment in the active
                    # branch below)
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]), tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute '
                          + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok = [(list(x[0]), list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")
            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)
            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok),
                             freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1
            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1]
                                    + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1]))
                                    + corpus_bow[i])'''
        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)
        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(),
                       initargs=(counter, corpustokonly, dictionary, lencorpus, alltok), )
        print("pool initialized")
        corpustokonly = None
        alltok = None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer,
                              [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary = None
        del dictionary
        global ctonly, dic, alltoks
        ctonly = None
        dic = None
        alltoks = None
        del ctonly, dic, alltoks
        print("frequence list len= " + str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i % 10000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i = 0'''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i += 1
                if i % 10000 == 0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')'''
        # Reload the tokenized corpus and the 500 most frequent tokens from disk.
        corpus_tok = []
        i = 0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                corpus_tok.append(row)
        print("len corpus_tok= " + str(len(corpus_tok)))
        freq = []
        i = 0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i == 501:
                    break
                freq.append(row)
        # The frequency file stores stringified tuples; strip the punctuation back off.
        for i in range(len(freq)):
            freq[i] = freq[i][0]
            freq[i] = freq[i].replace("'", '')
            freq[i] = freq[i].replace('"', '')
            freq[i] = freq[i].replace('(', '')
            freq[i] = freq[i].replace(')', '')
            freq[i] = freq[i].replace(' ', '')
            freq[i] = freq[i].split(',')
            freq[i] = tuple(freq[i])
        # Undo the stringified-list serialization of each row, with special cases
        # for country names containing commas or mis-encoded characters.
        for i in range(len(corpus_tok)):
            if i % 100000 == 0:
                print(i)
            corpus_tok[i][0] = corpus_tok[i][0].replace('[', '')
            corpus_tok[i][0] = corpus_tok[i][0].replace(']', '')
            det = corpus_tok[i][0].split(',')
            if 'São Tomé' in det[1]:  # São Tomé and Príncipe (mis-encoded in the data)
                det[1] = ' ' + 'São Tomé and PrÃ\xadncipe' + ' '
            if det[1][-1] != "'":  # France, Metro
                if 'Ivoire' in det[1]:  # Cote d'Ivoire
                    det[1] = det[1].replace('\\', '')
                    det[2] = det[2][1:]
                else:
                    det[1] = det[1] + ',' + det[2]
                    for j in range(2, len(det) - 1):
                        det[j] = det[j + 1]
                    del det[len(det) - 1]
            det = det[:3]
            desc = (corpus_tok[i][0].split(','))[-1]
            det[0] = det[0][1:-1]
            det[1] = det[1][2:-1]
            det[2] = det[2][2:-1]
            desc = desc[3:-1]
            det.append(desc)
            corpus_tok[i][0] = det
            corpus_tok[i][1] = corpus_tok[i][1].replace("'", '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(' ', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace('[', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(']', '')
            corpus_tok[i][1] = corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= " + str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i % 100000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= " + str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j = 0
        print("corpus_bow_len " + str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j += 1
                if j % 100000 == 0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(corpus_tok[i][0] + corpus_tok[i][1]
                                + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1]))
                                + corpus_bow[i])
        print("over")
    else:
        print("not all")
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion == 'Good' and keyword == 'cleaning':  # cleaning good
                    start_time = time.time()
                    print(keyword + ' ---- '
                          + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all = []
                    # if not os.path.isfile('/resources/cleaning_test.csv'):
                    #     open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):  # 400 batches of 1000 rows
                        print(str(i))
                        offset = i * 1000
                        limit = 1000
                        print("starting reading")
                        print("limit=" + str(limit))
                        print("offset=" + str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_'
                                          + emotion.lower() + '.csv',
                                          mode='r', encoding="utf8", newline='\n'),
                            additionaldetails=True, limit=limit, offset=offset)
                        # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        # raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        # raw_corpus_half_two = raw_corpus[int(len(raw_corpus) / 2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals,
                                       processes=mp.cpu_count() * 2,
                                       initargs=(counter, spell, nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(
                                thread_function_row_only,
                                [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            # The batch timed out: retry document by document so a
                            # single pathological review cannot stall the whole run.
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok = []
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals,
                                                   processes=mp.cpu_count() * 2,
                                                   initargs=(counter, spell, nlp_wrapper,), )
                                    c = pool.map_async(thread_function_row_only,
                                                       [doc]).get(timeout=30)
                                    pool.close()
                                    pool.join()
                                    '''thread = threading.Thread(target=thread_function_row_only,
                                                              args=(doc))
                                    thread.start()
                                    thread.join()
                                    c = que.get()'''
                                except TimeoutError:
                                    print(str(doc) + " caused Exception")
                                    c = [None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced = [r for r in corpus_tok if r != None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)'''
                        corpus_tok_all += corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''corpus_tok = []  # old single-process tokenizer, kept for reference
                    s = 0
                    for doc in corpus:
                        newdoc = False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc = True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma'])
                                    for tok in nlp_wrapper.annotate(doc, properties={
                                        'annotators': 'lemma, pos',
                                        'outputFormat': 'json',
                                    })['sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    # print("beginning removal of sents with contrast")
                    corpus_tok = corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent
                    # words. Using bigrams we can get phrases like "machine_learning" in
                    # our output (spaces are replaced with underscores); without bigrams
                    # we would only get "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to
                    # the original data, because we would like to keep the words
                    # "machine" and "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large datasets can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok) > 0:
                        corpustokonly = [r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams to docs (only ones that appear often enough).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")
                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)
                        alltok = []
                        freq = []
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus = len(corpus_tok)
                        print("len dictionary = " + str(len(dictionary.keys())))
                        i = 0
                        for t in dictionary:
                            i += 1
                            if i % 1000 == 0:
                                print("analyzing token " + str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent += 1
                            freq.append((t, dictionary.get(t),
                                         alltok.count(dictionary.get(t)),
                                         alltok.count(dictionary.get(t)) / len(alltok),
                                         freqsent, freqsent / lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i] = tuple(list(freq[i]) + [i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/' + keyword + '_'
                                  + emotion.lower() + '.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item) + '\n')
                        print("writing bow file")
                        top_tokens = [f[1] for f in freq[:500]]
                        lentoptok = len(top_tokens)
                        corpus_bow = {}
                        toplen = 0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i] = [0] * lentoptok
                            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)] = 1
                        with open('resources/bow/' + keyword + '_' + emotion.lower()
                                  + '.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                quoting=csv.QUOTE_MINIMAL)
                            writer.writerow([''] * toplen + top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(
                                    corpus_tok[i][0] + corpus_tok[i][1]
                                    + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1]))
                                    + corpus_bow[i])
                        print('------------------------------------------------------')
                        print(str(time.time() - start_time) + ' seconds to compute '
                              + keyword + ' ' + emotion)
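
# A minimal gensim Phrases example matching the bigram step above; min_count
# and threshold are toy values chosen so the merge triggers on this tiny
# corpus (the exact score depends on the gensim version's default scorer):
def phrases_example():
    from gensim.models import Phrases
    docs = [['hot', 'water'], ['hot', 'water'], ['cold', 'room']]
    bigram = Phrases(docs, min_count=1, threshold=0.1)
    merged = bigram[docs[0]]  # e.g. ['hot_water'] once the pair scores above threshold
    # The pipeline above keeps the original tokens and appends only the '_' merges:
    return [tok for tok in merged if '_' in tok]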
def do(originfile, all=False):
    if all:
        start_time = time.time()
        print('all ----- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
        with open('resources/bow/all.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter='|')
            tokens, cluster_tourist_hotel = cluster_all(csv_reader)
        if not os.path.exists('resources/bow/tourist_hotel_country_freq/'):
            os.makedirs('resources/bow/tourist_hotel_country_freq/')
        print("got cluster of all, start writing")
        with open('resources/bow/tourist_hotel_country_freq/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * 2 + ['unique IDs'] + tokens)
            del tokens
            for country in cluster_tourist_hotel.keys():
                writer.writerow([country[0], country[1]]
                                + [cluster_tourist_hotel[country]['count_rev']]
                                + list(map("{:.15f}".format,
                                           cluster_tourist_hotel[country]['rel_freq']))
                                + list(cluster_tourist_hotel[country]['unique_reviews']))
        print('------------------------------------------------------')
        print(str(time.time() - start_time) + ' seconds to compute all')
    else:
        keywords = helper.getKeywords(originfile)
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                start_time = time.time()
                goforward = True
                print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                try:
                    with open('resources/bow/' + keyword + '_' + emotion.lower()
                              + '.csv') as csv_file:
                        csv_reader = csv.reader(csv_file, delimiter='|')
                        tokens, cluster_tourist_hotel = cluster(csv_reader)
                except Exception:
                    goforward = False
                if goforward:
                    if not os.path.exists('resources/bow/tourist_hotel_country_freq/'):
                        os.makedirs('resources/bow/tourist_hotel_country_freq/')
                    with open('resources/bow/tourist_hotel_country_freq/' + keyword + '_'
                              + emotion.lower() + '.csv', mode='w') as file:
                        writer = csv.writer(file, delimiter='|', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([''] * 2 + ['unique IDs'] + tokens)
                        for country in cluster_tourist_hotel.keys():
                            writer.writerow([country[0], country[1]]
                                            + [cluster_tourist_hotel[country]['count_rev']]
                                            + list(map("{:.15f}".format,
                                                       cluster_tourist_hotel[country]['rel_freq']))
                                            + list(cluster_tourist_hotel[country]['unique_reviews']))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute '
                          + keyword + ' ' + emotion)
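
# cluster() and cluster_all() live in another module. From their use above they
# are assumed to return a token list plus a dict keyed by the
# (tourist_country, hotel_country) pair; the shape below is illustrative only,
# with invented data:
def cluster_result_shape_example():
    tokens = ['clean', 'staff', 'location']
    cluster_tourist_hotel = {
        ('Italy', 'France'): {
            'count_rev': 2,                    # number of unique reviews for the pair
            'rel_freq': [0.5, 0.0, 1.0],       # one relative frequency per token
            'unique_reviews': {'id1', 'id2'},  # review IDs behind count_rev
        },
    }
    return tokens, cluster_tourist_hotel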