# Reconstructed from a whitespace-mangled one-line paste (the lost
# newlines had turned the whole script into a single comment line).

# First generate the pickles with all the data in python format.
# Although this could be skipped to generate the relevant data directly,
# having the files can be nice for future expansion of the data.
pickles_from_json(NUM_PARTITIONS)

# We need business data to filter reviews from outside of the US (only English)
business_data = get_business_data()

# Accumulate review data from every partition created above.
review_texts = []
useful_votes = []
funny_votes = []
cool_votes = []
review_stars = []
for partition in range(1, NUM_PARTITIONS + 1):
    texts, useful, funny, cool, stars = get_reviews_data(
        (partition, ), business_data,
        not_include_states=["EDH", "QC", "BW"])
    review_texts.extend(texts)
    useful_votes.extend(useful)
    funny_votes.extend(funny)
    cool_votes.extend(cool)
    review_stars.extend(stars)

# Generate dataset of funny reviews (balanced around a 3-vote threshold)
reviews, labels = give_balanced_classes(review_texts, funny_votes,
                                        votes_threshold=3)
result = create_data_sets(reviews, labels, write_to_pickle=True,
                          problem="funny")
(train_reviews, train_labels, dev_reviews, dev_labels,
 test_reviews, test_labels) = result

# Generate dataset of useful reviews
reviews, labels = give_balanced_classes(review_texts, useful_votes,
                                        votes_threshold=3)
result = create_data_sets(reviews, labels, write_to_pickle=True,
                          problem="useful")
""" Compute WordVectors using Yelp Data """
from gensim.models.word2vec import Word2Vec

from util.language import detect_language, tokenize_text
from data_handling import get_reviews_data

# NOTE(review): the source contained two copies of this script, the first
# truncated mid-statement; the truncated duplicate has been removed.

# Set to True to zero in on English reviews. Makes the process much slower.
FILTER_ENGLISH = True
# Name for output w2v model file
OUTPUT_MODEL_FILE = "w2v_yelp_100_alpha_0.025_window_4"
# Prefix of the pickled review partitions on disk
PICKLED_DATA = "/home/alfredo/deep-nlp/data/reviews.pickle."
NUM_PARTITIONS = 2

# Use all data.
# FIX: range(1, NUM_PARTITIONS) skipped the last partition (range is
# half-open), contradicting the "use all data" intent; the sibling
# dataset-generation script iterates range(1, NUM_PARTITIONS + 1).
reviews_texts, _, _, _, _ = get_reviews_data(range(1, NUM_PARTITIONS + 1),
                                             PICKLED_DATA)

# Each review will be considered one sentence, i.e. one list of tokens.
sentences = []
for num, text in enumerate(reviews_texts):
    if num % 10000 == 0:
        print("%d out of %d reviews read" % (num, len(reviews_texts)))
    # Skip non-English reviews when filtering is enabled.
    if FILTER_ENGLISH and detect_language(text) != u"english":
        continue
    # FIX: the original appended the raw string when FILTER_ENGLISH was
    # False. Word2Vec iterates each sentence as a sequence of tokens, so a
    # raw string would be consumed character by character; tokenize always.
    sentences.append(tokenize_text(text))

# Build a w2v model and persist it to disk.
w2v = Word2Vec(sentences=sentences, size=100, alpha=0.025, window=4,
               min_count=2, sample=1e-5, workers=4, negative=10)
w2v.save(OUTPUT_MODEL_FILE)