def filter(self):
    """Write filtered variants of every unfiltered category dictionary.

    Walks ``<SAVE_DICTIONARY_DIR>/Unfiltered`` (skipping any directory whose
    path contains ``positive`` or ``negative``). For every file whose name
    contains ``_dict.dict`` and for every entry of
    ``FILTERED_DICTINARIES_CONF`` that defines truthy ``keep_n`` and
    ``no_below`` values, the following files are written to
    ``<SAVE_DICTIONARY_DIR>/Filtered/<category>/<conf_name>/``:

    * ``<category>_filtered_top_<keep_n>_tokens_dict.dict``
    * ``<category>_filtered_top_<keep_n>_tokens_dict.txt``
    * ``<category>_top_<keep_n>_token_doc_freqs.txt`` (one
      ``<token> <doc_freq>`` pair per line, most frequent first)

    The category's positive and negative seed words are passed to
    ``filter_extremes`` as ``keep_tokens`` so they survive the pruning.

    Raises:
        IOError: if the ``Unfiltered`` directory does not exist.

    Calls ``exit`` after printing a message when ``SAVE_DICTIONARY_DIR`` or
    ``FILTERED_DICTINARIES_CONF`` is missing from ``config``.
    """
    # Local import so the block stays self-contained; io.open accepts an
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    FILTERED_DICTINARIES_CONF = config.get("FILTERED_DICTINARIES_CONF", None)
    if not SAVE_DICTIONARY_DIR:
        print(
            "config keys - SAVE_DICTIONARY_DIR is not set correctly in the config file: socialconfig.py"
        )
        exit(0)
    if not FILTERED_DICTINARIES_CONF:
        print(
            "Configuration for filtering dictionaries is not defined in socialconfig.py"
        )
        exit()

    SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                  "Unfiltered")
    if not os.path.isdir(SAVE_UNFILTERED_DICTIONARY_DIR):
        # The original raised a bare string, which is itself a TypeError.
        raise IOError("Directory {d} does not exist".format(
            d=SAVE_UNFILTERED_DICTIONARY_DIR))

    SAVE_FILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                "Filtered")
    if not os.path.isdir(SAVE_FILTERED_DICTIONARY_DIR):
        os.makedirs(SAVE_FILTERED_DICTIONARY_DIR)

    # Category -> factory returning a NEW list of seed words. The lambdas keep
    # the lookup of each seed constant lazy (only the matching category is
    # resolved) and the concatenation never mutates the shared module-level
    # seed lists, unlike the previous in-place ``extend``.
    seed_factories = {
        "food": lambda: list(FOOD_POSITIVE_SEEDS) + list(FOOD_NEGATIVE_SEEDS),
        "bars": lambda: list(BARS_POSITIVE_SEEDS) + list(BARS_NEGATIVE_SEEDS),
        "leisure": lambda: list(LEISURE_POSITIVE_SEEDS) + list(LEISURE_NEGATIVE_SEEDS),
        "grooming": lambda: list(GROOMING_POSITIVE_SEEDS) + list(GROOMING_NEGATIVE_SEEDS),
        "health": lambda: list(HEALTH_POSITIVE_SEEDS) + list(HEALTH_NEGATIVE_SEEDS),
        "learn": lambda: list(LEARN_POSITIVE_SEEDS) + list(LEARN_NEGATIVE_SEEDS),
        "municipal": lambda: list(MUNICIPAL_POSITIVE_SEEDS) + list(MUNICIPAL_NEGATIVE_SEEDS),
        "sports": lambda: list(SPORTS_POSITIVE_SEEDS) + list(SPORTS_NEGATIVE_SEEDS),
        "planning": lambda: list(PLANNING_POSITIVE_SEEDS) + list(PLANNING_NEGATIVE_SEEDS),
        "services": lambda: list(SERVICES_POSITIVE_SEEDS) + list(SERVICES_NEGATIVE_SEEDS),
        "shopping": lambda: list(SHOPPING_POSITIVE_SEEDS) + list(SHOPPING_NEGATIVE_SEEDS),
    }

    for bdir, _subdirs, files in os.walk(SAVE_UNFILTERED_DICTIONARY_DIR):
        if "negative" in bdir or "positive" in bdir:
            continue
        for file_name in files:
            if "_dict.dict" not in file_name:
                print("{file} ignored, not of type dict".format(
                    file=file_name))
                continue

            dict_path = os.path.join(bdir, file_name)
            yelp_category = file_name.split("_dict.dict")[0]
            seed_factory = seed_factories.get(yelp_category)
            # filter_extremes expects token *strings* in keep_tokens and
            # ignores any that are absent from the dictionary, so the seed
            # words are passed through as-is.
            keep_tokens = seed_factory() if seed_factory else []

            CATEGORY_SPECIFIC_FILTERED_DICT_DIR = os.path.join(
                SAVE_FILTERED_DICTIONARY_DIR, yelp_category)
            if not os.path.isdir(CATEGORY_SPECIFIC_FILTERED_DICT_DIR):
                os.makedirs(CATEGORY_SPECIFIC_FILTERED_DICT_DIR)

            for conf_name, conf in FILTERED_DICTINARIES_CONF.items():
                keep_n = conf.get("keep_n", None)
                no_below = conf.get("no_below", None)
                if not (keep_n and no_below):
                    print(
                        "Dictionary config: `{configuration}` does not define values for keep_n and no_below, skipping."
                        .format(configuration=conf_name))
                    continue

                save_in_dir = os.path.join(
                    CATEGORY_SPECIFIC_FILTERED_DICT_DIR, conf_name)
                if not os.path.isdir(save_in_dir):
                    os.makedirs(save_in_dir)

                save_type_dict_file_name = os.path.join(
                    save_in_dir,
                    "{cat}_filtered_top_{keepn}_tokens_dict.dict".format(
                        cat=yelp_category, keepn=keep_n))
                save_type_text_file_name = os.path.join(
                    save_in_dir,
                    "{cat}_filtered_top_{keepn}_tokens_dict.txt".format(
                        cat=yelp_category, keepn=keep_n))
                save_type_token_docfreqs_file_name = os.path.join(
                    save_in_dir,
                    "{cat}_top_{keepn}_token_doc_freqs.txt".format(
                        cat=yelp_category, keepn=keep_n))

                # filter_extremes prunes in place, so every configuration
                # starts from a fresh copy of the unfiltered dictionary;
                # otherwise later configurations would only see what earlier
                # ones left behind.
                dictionary = Dictionary.load(dict_path)
                dictionary.filter_extremes(
                    keep_n=keep_n, no_below=no_below, keep_tokens=keep_tokens)
                dictionary.save(save_type_dict_file_name)
                dictionary.save_as_text(save_type_text_file_name)

                sorted_doc_freqs = sorted(
                    dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
                # Text-mode UTF-8 file: the tokens are written as text rather
                # than as the repr of an encoded byte string.
                with io.open(save_type_token_docfreqs_file_name, 'w',
                             encoding="utf-8") as socialfile:
                    for token_id, doc_freq in sorted_doc_freqs:
                        socialfile.write(u"{0} {1}\n".format(
                            dictionary.get(token_id, u"Unknown"), doc_freq))

                print(
                    "Saved filtered dictionary ({conf_type}) and token document frequencies for tokens in {category}-reviews in directory: {d}"
                    .format(
                        category=yelp_category,
                        d=CATEGORY_SPECIFIC_FILTERED_DICT_DIR,
                        conf_type=conf_name))
from socialsent.polarity_induction_methods import graph_propagate from socialsent.polarity_induction_methods import dist from socialsent.evaluate_methods import binary_metrics from socialsent.representations.representation_factory import create_representation from textblob import TextBlob as tb from textblob import Word as wd import operator import re import traceback from gensim.models import Word2Vec from gensim.models import KeyedVectors if __name__ == "__main__": SAVE_WORD_EMBEDDINGS_DIR = config.get("SAVE_WORD_EMBEDDINGS_DIR", None) SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None) SAVE_POLARITIES_DIR = config.get("SAVE_POLARITIES_DIR", None) if not (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR and SAVE_POLARITIES_DIR): print( "Keys (SAVE_WORD_EMBEDDINGS_DIR|SAVE_DICTIONARY_DIR|SAVE_POLARITIES_DIR) not set in the config file socialconfig.py" ) exit() SAVE_FILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR, "Filtered") if not (os.path.exists(SAVE_WORD_EMBEDDINGS_DIR) and os.path.isdir(SAVE_WORD_EMBEDDINGS_DIR)):
def process(self):
    """Split the Yelp reviews file into files of 50,000 input lines each.

    Reads ``YELP_DATASET_DIR/YELP_REVIEWS_FILE_NAME`` line by line (each line
    is parsed with ``ujson.loads``) and writes one JSON object per review,
    with the keys ``business_id``, ``review_text``, ``review_id`` and
    ``rating``, into ``SAVE_REVIEWS_DIRECTORY``. The first output file is
    ``yelp_reviews_0_50000.json``; after every 50,000th input line a new file
    ``yelp_reviews_<i+1>_<i+50000>.json`` is started.

    Input lines that lack any of the four fields are skipped, because the
    written records are expected to carry all of them.

    Calls ``exit`` after printing a message when the configuration is
    incomplete, the dataset directory is missing, or the reviews file cannot
    be opened.
    """
    # Locations of the Yelp dataset and of the split review files.
    YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
    YELP_REVIEWS_FILE_NAME = config.get("YELP_REVIEWS_FILE_NAME", None)
    SAVE_REVIEWS_DIRECTORY = config.get("SAVE_REVIEWS_DIRECTORY", None)
    if not (YELP_DATASET_DIR and YELP_REVIEWS_FILE_NAME
            and SAVE_REVIEWS_DIRECTORY):
        print(
            "config keys are not set correctly in the config file: socialconfig.py"
        )
        exit(0)
    if not os.path.isdir(YELP_DATASET_DIR):
        print(
            "Either Yelp Dataset directory path is not set correctly in the socialconfig.py file \nOR\n"
            "The directory does not exist. Please make sure you have downloaded the yelp dataset(in JSON format) and copied the `yelp_academic_dataset_review.json` and `yelp_academic_dataset_business.json` files into the yelp_dataset sub-directory of your project directory(socialsentRun)"
        )
        exit()

    YELP_REVIEWS_ABS_FILE_PATH = os.path.join(YELP_DATASET_DIR,
                                              YELP_REVIEWS_FILE_NAME)
    # Fail early, with a readable message, if the reviews file is unreadable.
    try:
        with open(YELP_REVIEWS_ABS_FILE_PATH, 'r'):
            pass
    except IOError:
        print("Error opening file: {}".format(YELP_REVIEWS_FILE_NAME))
        print(traceback.format_exc())
        exit()

    if not os.path.isdir(SAVE_REVIEWS_DIRECTORY):
        os.makedirs(SAVE_REVIEWS_DIRECTORY)
        print(
            "Created directory: {d} to save yelp reviews in buckets of 50,000 reviews per file"
            .format(d=SAVE_REVIEWS_DIRECTORY))

    current_file = os.path.join(SAVE_REVIEWS_DIRECTORY,
                                'yelp_reviews_0_50000.json')
    wfile = open(current_file, 'w')
    i = 0
    print("Reading Yelp Reviews")
    try:
        for line in get_reviews_iterable(YELP_REVIEWS_ABS_FILE_PATH):
            i += 1
            json_dict = ujson.loads(line)
            review_text = json_dict.get("text", None)
            business_id = json_dict.get("business_id", None)
            review_id = json_dict.get("review_id", None)
            review_rating = json_dict.get("stars", None)
            # Skip the record if ANY field is missing; the previous
            # ``not (a or b or c or d)`` only skipped when all were missing.
            if not (review_id and business_id and review_text
                    and review_rating):
                continue

            write_dict = {
                'business_id': business_id,
                'review_text': review_text,
                'review_id': review_id,
                'rating': review_rating,
            }
            wfile.write(ujson.dumps(write_dict) + "\n")

            if i % 100 == 0:
                print("{num}00 reviews processed".format(
                    num=str(int(i / 100))))
            if i % 50000 == 0:
                new_file = os.path.join(
                    SAVE_REVIEWS_DIRECTORY,
                    'yelp_reviews_{from_num}_{to_num}.json'.format(
                        from_num=(i + 1), to_num=(i + 50000)))
                print("Changing file from {old_file} to {new_file}".format(
                    old_file=current_file, new_file=new_file))
                # Close explicitly so buffered data is flushed before the
                # next file is opened.
                wfile.close()
                current_file = new_file
                wfile = open(new_file, 'w')
    finally:
        wfile.close()
    print("Processed {total} reviews".format(total=str(i)))
print("Excluded Category encountered: {cat}". format(cat=cat)) if review_counter % 100 == 0: print("{num}00 reviews processed".format( num=(str(int(review_counter / 100))))) if review_counter >= PROCESS_N_REVIEWS_ONLY: break print("{count} Reviews processed".format(count=review_counter)) if __name__ == "__main__": SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY = config.get( "SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY", None) if not SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY: print( "Cannot load business_id to business_category mappings as the location(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY) of the files that hold those mappings is not specified correctly in the config file socialconfig.py" ) exit() if not (os.path.exists(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY) and os.path.isdir(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)): print( "{SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY} is either not a valid directory or it does not exist" .format(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY= SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)) exit() # initialise class ps = segregate_reviews_by_category()
def process(self):
    """Copy reviews into per-category files, annotated with their adjectives.

    For every entry of ``SAVE_REVIEWS_DIRECTORY`` that is not a directory and
    whose name contains ``yelp_reviews_``, each line is parsed with
    ``ujson.loads``; the adjectives (TextBlob tags ``JJ``/``JJR``/``JJS``) of
    ``review_text`` are cleaned with ``self.strip_special_chars``, lemmatized
    and stored, space-joined, under the ``adjectives`` key. The record is
    then appended to the file of every category that
    ``self.business_to_cat`` lists for the review's ``business_id``;
    categories outside the known set go to ``other``.

    Output lives in ``SAVE_REVIEWS_BY_CATEGORY_DIRECTORY/<category>/`` as
    ``yelp_reviews_<category>_<n>.txt``; a new file is started after every
    25,000 records written for a category. Processing stops once
    ``PROCESS_N_REVIEWS_ONLY`` reviews (default 1000000) have been read.

    Calls ``exit`` after printing a message when the directory keys are
    missing from ``config``.
    """
    SAVE_REVIEWS_DIRECTORY = config.get("SAVE_REVIEWS_DIRECTORY", None)
    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
        "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
    PROCESS_N_REVIEWS_ONLY = int(
        config.get("PROCESS_N_REVIEWS_ONLY", 1000000))
    print(
        "Will process only {num} reviews as per the directive in the socialconfig.py"
        .format(num=str(PROCESS_N_REVIEWS_ONLY)))
    if not (SAVE_REVIEWS_DIRECTORY and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        print(
            "config keys are not set correctly in the config file: socialconfig.py"
        )
        exit(0)
    if not os.path.isdir(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        os.makedirs(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY)

    # "other" is the catch-all and must stay last; every other entry is a
    # category that gets its own output files.
    categories = ("bars", "food", "grooming", "learn", "leisure",
                  "municipal", "planning", "services", "shopping", "sports",
                  "health", "other")
    known_categories = frozenset(categories[:-1])
    reviews_per_file = 25000
    adjective_tags = ("JJ", "JJR", "JJS")

    category_dirs = {}
    current_paths = {}
    for category in categories:
        category_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                    category)
        if not os.path.isdir(category_dir):
            os.makedirs(category_dir)
        category_dirs[category] = category_dir
        current_paths[category] = os.path.join(
            category_dir, "yelp_reviews_{cat}_1.txt".format(cat=category))

    written = dict.fromkeys(categories, 0)
    out_files = {}
    review_counter = 0
    limit_reached = False
    try:
        for category in categories:
            out_files[category] = open(current_paths[category], 'w')

        for file_name in os.listdir(SAVE_REVIEWS_DIRECTORY):
            abs_file_path = os.path.join(SAVE_REVIEWS_DIRECTORY, file_name)
            # Test the full path: a bare name would be resolved against the
            # current working directory instead of the reviews directory.
            if os.path.isdir(abs_file_path):
                continue
            if "yelp_reviews_" not in file_name:
                continue

            for review in get_iterable(abs_file_path):
                review_counter += 1
                review_dict = ujson.loads(review)
                business_id = review_dict.get("business_id", None)
                review_text = review_dict.get("review_text", None)

                # A record without text yields no adjectives instead of
                # crashing TextBlob.
                tags = tb(review_text or "").tags
                cleaned_adjectives = [
                    self.strip_special_chars(word.lower())
                    for word, tag in tags
                    if tag in adjective_tags and len(word) > 2
                ]
                adj_words_list = [
                    wd(fword).lemmatize("a") for fword in cleaned_adjectives
                    if len(fword) > 2
                ]
                review_dict["adjectives"] = " ".join(adj_words_list)
                write_line = ujson.dumps(review_dict)

                for cat in self.business_to_cat.get(business_id, []):
                    excluded = cat not in known_categories
                    category = "other" if excluded else cat

                    out_files[category].write(write_line + "\n")
                    written[category] += 1
                    if written[category] % reviews_per_file == 0:
                        # Always rotate inside the category's OWN directory
                        # (leisure files used to be created under "learn").
                        new_path = os.path.join(
                            category_dirs[category],
                            "yelp_reviews_{cat}_{c}.txt".format(
                                cat=category,
                                c=str(written[category] // reviews_per_file
                                      + 1)))
                        # Only the bars message is upper-cased; kept as-is so
                        # the log output stays the same.
                        label = "BARS" if category == "bars" else category
                        print(
                            "25000 Reviews collected in `{label}` category, creating a new file to keep file sizes manageable"
                            .format(label=label))
                        print('Changing file from : {old_file} to {new_file}'
                              .format(old_file=current_paths[category],
                                      new_file=new_path))
                        # Close explicitly so buffered records are flushed.
                        out_files[category].close()
                        out_files[category] = open(new_path, 'w')
                        current_paths[category] = new_path
                    if excluded:
                        print("Excluded Category encountered: {cat}".format(
                            cat=cat))

                if review_counter % 100 == 0:
                    print("{num}00 reviews processed".format(
                        num=(str(int(review_counter / 100)))))
                if review_counter >= PROCESS_N_REVIEWS_ONLY:
                    limit_reached = True
                    break
            # Leave the file loop too; a bare break would let each remaining
            # file contribute one more review past the limit.
            if limit_reached:
                break
    finally:
        for handle in out_files.values():
            handle.close()
    print("{count} Reviews processed".format(count=review_counter))
def process(self):
    """Index Yelp businesses by classified category and by state.

    Reads ``YELP_DATASET_DIR/YELP_BUSINESSES_FILE_NAME`` line by line (each
    line is parsed with ``ujson.loads``). A business's ``categories`` are
    matched against the sub-category collections in ``category_dict``; for
    every matching classified category, and for the business's state when it
    has one, a line is appended to

    * ``SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY/yelp_businesses_in_category_<category>.csv``
    * ``SAVE_BUSINESSES_BY_STATE_DIRECTORY/yelp_businesses_in_state_<state>.csv``

    Each line is the ``ujson.dumps`` encoding of the space-joined string
    ``"<business_id> <state> <city> <review_count>"``. Files are opened in
    append mode, so existing content is kept.

    Businesses without an id or without categories are skipped.
    ``categories`` may be a comma-separated string or a list of names.

    Calls ``exit`` after printing a message when the configuration is
    incomplete, the dataset directory is missing, or the businesses file
    cannot be opened.
    """
    # Locations of the Yelp dataset and of the generated index files.
    YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
    YELP_BUSINESSES_FILE_NAME = config.get("YELP_BUSINESSES_FILE_NAME", None)
    SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY = config.get(
        "SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY", None)
    SAVE_BUSINESSES_BY_STATE_DIRECTORY = config.get(
        "SAVE_BUSINESSES_BY_STATE_DIRECTORY", None)
    if not (YELP_DATASET_DIR and YELP_BUSINESSES_FILE_NAME
            and SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY
            and SAVE_BUSINESSES_BY_STATE_DIRECTORY):
        print(
            "config keys are not set correctly in the config file: socialconfig.py"
        )
        exit(0)
    if not os.path.isdir(YELP_DATASET_DIR):
        print(
            "Either Yelp Dataset directory path is not set correctly in the socialconfig.py file \nOR\n"
            "The directory does not exist. Please make sure you have downloaded the yelp dataset(in JSON format) and copied the `yelp_academic_dataset_review.json` and `yelp_academic_dataset_business.json` files into the yelp_dataset sub-directory of your project directory(socialsentRun)"
        )
        exit()

    YELP_BUSINESSES_ABS_FILE_PATH = os.path.join(YELP_DATASET_DIR,
                                                 YELP_BUSINESSES_FILE_NAME)
    # Fail early, with a readable message, if the file is unreadable.
    try:
        with open(YELP_BUSINESSES_ABS_FILE_PATH, 'r'):
            pass
    except IOError:
        print("Error opening file: {f}".format(
            f=YELP_BUSINESSES_ABS_FILE_PATH))
        print(traceback.format_exc())
        exit()

    for out_dir in (SAVE_BUSINESSES_BY_STATE_DIRECTORY,
                    SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    i = 0
    for line in get_businesses_iterable(YELP_BUSINESSES_ABS_FILE_PATH):
        i += 1
        json_dict = ujson.loads(line)
        bus_id = json_dict.get("business_id", None)
        state = json_dict.get("state", None)
        city = json_dict.get("city", None)
        sub_cats = json_dict.get("categories", None)
        rev_count = json_dict.get("review_count", None)
        if not (sub_cats and bus_id):
            continue

        # Accept both a comma-separated string and a list of names.
        if hasattr(sub_cats, "split"):
            sub_cat_names = sub_cats.split(",")
        else:
            sub_cat_names = sub_cats

        classified_categories = set()
        for sub_cat in sub_cat_names:
            sub_cat = sub_cat.strip()
            for classified_category in category_dict:
                if sub_cat in category_dict[classified_category]:
                    classified_categories.add(classified_category)

        write_files = [
            os.path.join(
                SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
                "yelp_businesses_in_category_{final_cat}.csv".format(
                    final_cat=final_cat))
            for final_cat in classified_categories
        ]
        # Only businesses with a real state get a per-state entry; the
        # placeholder below is used solely inside the written line.
        if state:
            write_files.append(
                os.path.join(
                    SAVE_BUSINESSES_BY_STATE_DIRECTORY,
                    "yelp_businesses_in_state_{state}.csv".format(
                        state=state)))

        if not state:
            state = "NoStateName_EncodingIssue"
        if not city:
            city = "NoCityName_EncodingIssue"
        if not rev_count:
            rev_count = 0
        write_str = ujson.dumps(' '.join(
            [bus_id, state, city, str(rev_count)]))

        for out_path in write_files:
            # Append mode creates the file when it does not exist yet.
            with open(out_path, 'a') as wfile:
                try:
                    wfile.write(write_str + "\n")
                except Exception:
                    # Best effort: report and move on to the next file, but
                    # let KeyboardInterrupt/SystemExit propagate.
                    print("Cannot write business_id:{bus_id} to {file}".
                          format(bus_id=bus_id, file=out_path))
                    print(traceback.format_exc())
                    continue

        if i % 100 == 0:
            print("{num}00 businesses processed".format(
                num=str(int(i / 100))))
    print("Processed {num} businesses".format(num=str(i)))
"!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~")) return word.rstrip("@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~").lstrip( "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~") else: return "" def has_special_chars(self, word): if word[0] in "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~" or word[ -1] in "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~": return True return False if __name__ == "__main__": SAVE_WORD_EMBEDDINGS_DIR = config.get("SAVE_WORD_EMBEDDINGS_DIR", None) SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None) SAVE_WORD2VEC_MODELS_DIR = config.get("SAVE_WORD2VEC_MODELS_DIR", None) REVIEWS_CATEGORIES_LIST = config.get("REVIEWS_CATEGORIES_LIST", None) WORD2VEC_MODELS_CONF = config.get("WORD2VEC_MODELS_CONF", None) if not (REVIEWS_CATEGORIES_LIST and len(REVIEWS_CATEGORIES_LIST) > 0): print( "Categories List is not defined in socialconfig.py, cannot determine the paths to category-specific word2vec stored models" ) exit() if not (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR and SAVE_WORD2VEC_MODELS_DIR): print( "config keys (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR and SAVE_WORD2VEC_MODELS_DIR) are not set correctly in the config file: socialconfig.py"
import numpy as np import ujson from sklearn.preprocessing import MinMaxScaler, StandardScaler import re import sys from textblob import TextBlob as tb from collections import defaultdict from socialconfig import config from textblob import TextBlob as tb from textblob import Word as wd scaler = MinMaxScaler(feature_range=(-1, 1)) SAVE_POLARITIES_DIR = config.get("SAVE_POLARITIES_DIR", None) SAVE_REVIEWS_DIRECTORY = config.get("SAVE_REVIEWS_DIRECTORY", None) EVALUATE_SENTIMENTS_DIR = config.get("EVALUATE_SENTIMENTS_DIR", None) if not SAVE_POLARITIES_DIR: print( "SAVE_POLARITIES_DIR is not defined in socialconfig.py, can not load embeddings" ) exit() if not SAVE_REVIEWS_DIRECTORY: print( "SAVE_REVIEWS_DIRECTORY is not defined in socialconfig.py, can not load documents to evaluate polarities" ) exit() for category_entity in os.listdir(SAVE_POLARITIES_DIR):
def create_dictionary(self):
    """Build bag-of-words files and token dictionaries for every category.

    Walks SAVE_REVIEWS_BY_CATEGORY_DIRECTORY. For each directory whose path
    contains "category" and that holds "yelp_reviews_*" files, it:

    * writes the "adjectives" field of each review, one JSON string per
      line, to ``<SAVE_BAG_OF_WORDS_DIR>/<category>/<category>_file_<n>.txt``
      (a new file every SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE documents);
    * builds a ``Dictionary`` over all documents, one over reviews rated
      above 3 ("positive") and one over reviews rated 1-2 ("negative"),
      and stores them under ``<SAVE_DICTIONARY_DIR>/Unfiltered/<category>``.

    Reviews that cannot be parsed are counted and skipped. Reviews without
    a usable rating still feed the main dictionary but are left out of the
    positive/negative ones.

    Raises:
        OSError: if SAVE_REVIEWS_BY_CATEGORY_DIRECTORY is not a directory.
    """
    YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
        "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
    SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(
        config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))

    if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY
            and SAVE_DICTIONARY_DIR and SAVE_BAG_OF_WORDS_DIR):
        print(
            "config keys are not set correctly in the config file: socialconfig.py"
        )
        # Exit status kept as-is; the sibling filter() step bails out the
        # same way on bad configuration.
        exit(0)

    SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                  "Unfiltered")

    # isdir() is False for missing paths too, so one check covers both
    # "does not exist" and "exists but is a regular file".
    if not os.path.isdir(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        raise OSError("Directory {d} does not exist".format(
            d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY))

    for out_dir in (SAVE_BAG_OF_WORDS_DIR, SAVE_UNFILTERED_DICTIONARY_DIR):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    docs_per_file = SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE

    for pardir, _, files in os.walk(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
        review_files = [
            name for name in files
            if "yelp_reviews_" in name and "category" in pardir
        ]
        if not review_files:
            continue

        # normpath drops a trailing separator so basename is never empty.
        yelp_category = os.path.basename(os.path.normpath(pardir))
        category_bow_dir = os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category)
        if not os.path.isdir(category_bow_dir):
            os.makedirs(category_bow_dir)

        error_count = 0
        doc_count = 0
        review_docs = []
        positive_docs = []
        negative_docs = []

        # The output file is opened lazily and shared by all input files of
        # the category, so a second input file cannot truncate what the
        # first one wrote.
        bow_file = None
        try:
            for review_file in review_files:
                print(
                    "Writing docs (in bag of words form) for {cat} to directory: {d}"
                    .format(cat=yelp_category, d=category_bow_dir))
                reviews = get_reviews_iterable(
                    os.path.join(pardir, review_file))
                for review in reviews:
                    try:
                        review_dict = ujson.loads(review)
                    except (ValueError, TypeError):
                        error_count += 1
                        continue
                    if not isinstance(review_dict, dict):
                        error_count += 1
                        continue

                    adjs = review_dict.get("adjectives", None)
                    if not adjs:
                        continue

                    if bow_file is None:
                        # Floor division keeps the index an int on
                        # Python 3 ("1", not "1.0").
                        file_num = str((doc_count // docs_per_file) + 1)
                        bow_file = open(
                            os.path.join(
                                category_bow_dir,
                                "{cat}_file_{file_num}.txt".format(
                                    cat=yelp_category, file_num=file_num)),
                            'w')

                    # NOTE(review): the encode() call is kept from the
                    # original; confirm ujson accepts bytes on the Python
                    # version this pipeline runs on.
                    bow_file.write(
                        ujson.dumps(adjs.encode("utf-8")) + "\n")
                    doc_count += 1

                    tokens = adjs.strip().split()
                    review_docs.append(tokens)

                    if doc_count % docs_per_file == 0:
                        # Current file is full; the next doc opens a new one.
                        bow_file.close()
                        bow_file = None

                    try:
                        rating = int(review_dict.get("rating", 0))
                    except (TypeError, ValueError):
                        rating = 0
                    if rating > 3:
                        positive_docs.append(tokens)
                    elif 0 < rating < 3:
                        # Missing/invalid ratings must not count as negative.
                        negative_docs.append(tokens)
        finally:
            if bow_file is not None:
                bow_file.close()

        print("Wrote {total} docs in {cat} category".format(
            total=str(doc_count), cat=yelp_category))

        category_dict_dir = os.path.join(SAVE_UNFILTERED_DICTIONARY_DIR,
                                         yelp_category)
        positive_sub_dir = os.path.join(category_dict_dir, "positive")
        negative_sub_dir = os.path.join(category_dict_dir, "negative")
        # Each directory is checked on its own so a partially created tree
        # from an earlier run is completed instead of failing on save.
        for out_dir in (category_dict_dir, positive_sub_dir,
                        negative_sub_dir):
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)

        # Document lists are dropped as soon as their dictionary is on disk
        # to keep peak memory down.
        self._write_dictionary_files(review_docs, category_dict_dir,
                                     yelp_category)
        del review_docs
        self._write_dictionary_files(positive_docs, positive_sub_dir,
                                     yelp_category + "_pos")
        del positive_docs
        self._write_dictionary_files(negative_docs, negative_sub_dir,
                                     yelp_category + "_neg")
        del negative_docs

        print(
            "{count} {cat} reviews were discarded because of parsing errors"
            .format(count=error_count, cat=yelp_category))
        print("Created dictionary for {cat} tokens".format(
            cat=yelp_category))

def _write_dictionary_files(self, docs, out_dir, stem):
    """Build a Dictionary from *docs* and write its three files to *out_dir*.

    Writes ``<stem>_dict.dict`` (binary), ``<stem>_dict.txt`` (text) and
    ``<stem>_words_doc_frequencies.txt`` (one "token frequency" line per
    token, highest document frequency first).
    """
    dictionary = Dictionary(docs)
    dictionary.save(os.path.join(out_dir, stem + "_dict.dict"))
    dictionary.save_as_text(os.path.join(out_dir, stem + "_dict.txt"))

    sorted_doc_freqs = sorted(dictionary.dfs.items(),
                              key=lambda item: item[1],
                              reverse=True)
    freq_path = os.path.join(out_dir, stem + "_words_doc_frequencies.txt")
    with open(freq_path, 'w') as df_file:
        for token_id, doc_freq in sorted_doc_freqs:
            # NOTE(review): str(x.encode('utf-8')) is kept from the
            # original; on Python 3 it writes the b'...' repr — confirm the
            # target interpreter before changing it.
            df_file.write(
                str(dictionary.get(token_id, "Unknown").encode('utf-8')) +
                " " + str(doc_freq) + "\n")
"!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~")) return word.rstrip("@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~").lstrip( "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~") else: return "" def has_special_chars(self, word): if word[0] in "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~" or word[ -1] in "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~": return True return False if __name__ == "__main__": SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None) SAVE_WORD2VEC_MODELS_DIR = config.get("SAVE_WORD2VEC_MODELS_DIR", None) PROCESS_N_REVIEWS_ONLY_PER_CATEGORY = int( config.get("PROCESS_N_REVIEWS_ONLY_PER_CATEGORY", 25000)) SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int( config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000)) FILTERED_DICTINARIES_CONF = config.get("FILTERED_DICTINARIES_CONF", {}) FILTER_DICTIONARY_TOP_N_WORDS_PER_CATEGORY = int( config.get("FILTER_DICTIONARY_TOP_N_WORDS_PER_CATEGORY", 10000)) SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None) WORD2VEC_MODELS_CONF = config.get("WORD2VEC_MODELS_CONF", None) if not (WORD2VEC_MODELS_CONF and isinstance(WORD2VEC_MODELS_CONF, dict) and len(WORD2VEC_MODELS_CONF)): print( "No Model Configurations specified in the socialconfig.py file for training the Word2vec models"