def parse_user():
    """Load users

    output: user_friend.pkl, user_profile.pkl
    """
    user_profile = {}
    user_friend = {}
    make_dir(PREPROCESS_DIR)

    print("\t[parse user] load user list")
    users_list = load_pkl(PREPROCESS_DIR + "users_list.pkl")
    users_list = set(users_list)

    print("\t[parse user] building user profiles")
    with open(DATA_DIR + "user.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            user_id = data['user_id']
            if user_id not in users_list:
                # discard users outside the kept cities
                continue
            user_friend[user_id] = data['friends'].split(", ")
            del data['friends']
            del data['user_id']
            user_profile[user_id] = data

    # dump the user adjacency and profile dictionaries separately
    print("\t[parse user] dumping user-friendship and user-profile information ...")
    dump_pkl(PREPROCESS_DIR + "user_friend.pkl", user_friend)
    dump_pkl(PREPROCESS_DIR + "user_profile.pkl", user_profile)
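# The preprocessing functions here rely on a few small I/O helpers
# (`make_dir`, `load_pkl`, `dump_pkl`) defined elsewhere in the repo. Below is
# a minimal sketch of what they are assumed to look like, inferred from the
# yelp-style call sites above. Note: the video-feature snippets further down
# call dump_pkl(obj, name) with the arguments reversed and resolve the bare
# name to a path internally, so this sketch only matches the yelp-style calls.

import os
import pickle


def make_dir(path):
    # create the directory if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)


def dump_pkl(path, obj):
    # serialize `obj` to a pickle file at `path`
    with open(path, "wb") as fout:
        pickle.dump(obj, fout)


def load_pkl(path):
    # load a previously pickled object from `path`
    with open(path, "rb") as fin:
        return pickle.load(fin)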
def compute_user_avg_loc(city):
    """compute average latitude and longitude of businesses each user visited

    Arg:
        city - the city
    """
    print("\t[user] computing location features")
    # df = pd.read_csv(TRNTST_DIR + "{}/train_pos.csv".format(city))
    df = pd.read_csv(INPUT_DIR + "{}/user_business_interaction.csv".format(city))
    bus_profile = load_pkl(INPUT_DIR + "{}/city_business_profile.pkl".format(city))

    # df.assign(business_latitude=lambda x: bus_profile[x.business]["latitude"])
    # df.assign(business_longitude=lambda x: bus_profile[x.business]["longitude"])
    b_lat_dict = dict([(k, v["latitude"]) for k, v in bus_profile.items()])
    b_long_dict = dict([(k, v["longitude"]) for k, v in bus_profile.items()])
    df = df.assign(bus_lat=df.business.map(b_lat_dict))
    df = df.assign(bus_long=df.business.map(b_long_dict))

    # "ll": latitude and longitude
    print("\t[user] aggregating location (lat and long) by user")
    df_loc = df.groupby("user").agg({"bus_lat": ['max', 'min', 'mean'],
                                     "bus_long": ['max', 'min', 'mean']})

    # rename the max/min/mean columns to max_lat, min_lat, and mean_lat
    # (same for `long`) while keeping `user` available as the key
    user_lat = df_loc.bus_lat.reset_index()
    user_long = df_loc.bus_long.reset_index()
    # merge on the shared `user` column; suffixes disambiguate max/min/mean.
    # (The original used DataFrame.join with lsuffix/rsuffix, which joins
    # against the right frame's integer index and also suffixes the `user`
    # column itself, breaking the set_index("user") below.)
    user_loc = user_lat.merge(user_long, on="user", how="outer",
                              suffixes=("_lat", "_long"))
    user_loc = user_loc.fillna(user_loc.mean())

    # now `user` is a column
    user_loc_dict = user_loc.set_index("user").to_dict(orient="index")
    dump_pkl(OUTPUT_DIR + "{}/city_user_loc.pkl".format(city), user_loc_dict)
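# A toy run of the aggregation above (hypothetical data, for illustration
# only): groupby/agg produces MultiIndex columns, and merging the flattened
# lat/long frames with suffixes yields max_lat, min_lat, mean_lat, etc.

import pandas as pd

toy = pd.DataFrame({"user": ["u1", "u1", "u2"],
                    "bus_lat": [36.1, 36.3, 43.7],
                    "bus_long": [-115.1, -115.3, -79.4]})
toy_loc = toy.groupby("user").agg({"bus_lat": ['max', 'min', 'mean'],
                                   "bus_long": ['max', 'min', 'mean']})
toy_lat = toy_loc.bus_lat.reset_index()    # columns: user, max, min, mean
toy_long = toy_loc.bus_long.reset_index()
toy_merged = toy_lat.merge(toy_long, on="user", suffixes=("_lat", "_long"))
print(toy_merged.columns.tolist())
# ['user', 'max_lat', 'min_lat', 'mean_lat', 'max_long', 'min_long', 'mean_long']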
def gather_genre(genre, limit_videos=100):
    print "Gathering features for", genre,
    genreFeatures = gather_videos(genre, limit_videos)
    print "OK."
    print genreFeatures.shape
    dump_pkl(genreFeatures, genre + str(limit_videos))
def gather_optical_flow_features(genre, limit_videos=None):
    genre_OF_features = []
    videoPaths = glob(os.path.join(video_resource, 'train', genre) + '/*')[:limit_videos]
    for videoPath in videoPaths:
        videoFeatures = optical_flow(videoPath)
        print videoFeatures.shape
        genre_OF_features.append(videoFeatures)
        print "*" * 90
    dump_pkl(genre_OF_features, genre + "_ultimate_OF")
def generate_precision_recall_text(mode='val'):
    model = load_moviescope_model('text')
    yTrue, plotFeatures = gather_features(mode, return_video=False, reverse=True)
    plotFeatures = np.array(map(list, zip(*plotFeatures)))
    yPreds = model.predict([plotFeatures[0], plotFeatures[1]])
    dump_pkl((yTrue, yPreds), mode + '_pred_text')
    return
def train_classifier_video(trainVideoFeatures, trainLabels,
                           valVideoFeatures=None, valLabels=None):
    input_dim = 4096
    trainingLabels, trainingFeatures = augment_labels_lstm(
        trainLabels, trainVideoFeatures, number_of_frames)
    print trainingLabels.shape
    print trainingFeatures.shape

    """Initialize the model"""
    visInput = Input(shape=(number_of_frames, input_dim), dtype='float32')
    model = vis_model(visInput, number_of_classes, return_top=True)
    plot(model, to_file='vis_model.png', show_shapes=True)
    sgd = SGD(lr=0.01, decay=0.000001, momentum=0.9, nesterov=True)
    # the video model uses SGD; the text and merged models are optimized with Adam
    model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])

    """Start training"""
    batch_size = 63
    nb_epoch = 50
    checkpoint = ModelCheckpoint(filepath='./data/models/wiki_im_video_sgd.h5',
                                 monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint, remote]
    if valLabels is not None:
        valLabels, valFeatures = augment_labels_lstm(valLabels, valVideoFeatures,
                                                     number_of_frames)
        hist = model.fit(trainingFeatures, trainingLabels,
                         validation_data=(valFeatures, valLabels),
                         nb_epoch=nb_epoch, batch_size=batch_size,
                         callbacks=callbacks_list)
    else:
        hist = model.fit(trainingFeatures, trainingLabels,
                         nb_epoch=nb_epoch, batch_size=batch_size,
                         callbacks=callbacks_list)
    model.save('data/models/video_sgd.h5')
    histDict = hist.history
    dump_pkl(histDict, 'hist_video_sgd')
    return model
def generate_precision_recall_video(mode='val'):
    model = load_moviescope_model('wiki_im_video_sgd')
    yTrue, videoFeatures = gather_features(mode, return_plot=False)
    _, videoFeatures = augment_labels_lstm(yTrue, videoFeatures, 200)
    yPreds = model.predict(videoFeatures)
    dump_pkl((yTrue, yPreds), mode + '_pred_video_sgd')
    return
def get_raw_data(mode='val'):
    rawData = []
    # include dictionaries containing plot, genre labels, and movie_id
    allData = load_pkl(baseName + mode)
    for data in allData:
        movie_id = data['movie_id']
        plot = data['plot']
        genreLabel = data['newGenreLabels']
        rawData.append({'movie_id': movie_id, 'plot': plot,
                        'newGenreLabels': genreLabel})
    dump_pkl(rawData, 'raw_data_' + mode)
def _get_raw_data(mode='val'):
    rawData = []
    # include dictionaries containing trailer path, plot, genre labels, and movie_id
    allData = load_pkl(baseName + mode)
    for data in allData:
        movie_id = data['movie_id']
        path = glob(video_resource + str(movie_id) + '.*')[0]
        plot = data['plot']
        genreLabel = data['genreLabel']
        rawData.append({'movie_id': movie_id, 'plot': plot,
                        'path': path, 'genreLabel': genreLabel})
    dump_pkl(rawData, 'raw_data_' + mode)
def generate_precision_recall_vislang(mode='val', merge_mode='sum'):
    if merge_mode == 'bilinear':
        model = vislang_model(merge_mode)
        model.load_weights('data/weights/weights_min_loss_%s.h5' % merge_mode)
    else:
        model = load_moviescope_model('eq_VisLang_%s' % merge_mode)
    yTrue, plotFeatures, videoFeatures = gather_features(mode, reverse=True)
    plotFeatures = np.array(map(list, zip(*plotFeatures)))
    _, videoFeatures = augment_labels_lstm(yTrue, videoFeatures, 200)
    yPreds = model.predict([videoFeatures, plotFeatures[0], plotFeatures[1]])
    dump_pkl((yTrue, yPreds), mode + '_pred_eq_vislang_' + merge_mode)
def train_from_scratch(config, state, channel):
    # Model options
    save_model_dir = config[config.model].save_model_dir
    if save_model_dir == 'current':
        config[config.model].save_model_dir = './'
        save_model_dir = './'
        # to facilitate the use of a cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_dir + 'model_config.pkl'
    print 'current save dir ', save_model_dir
    utils.create_dir_if_not_exist(save_model_dir)

    reload_ = config[config.model].reload_
    if reload_:
        print 'preparing reload'
        save_dir_backup = config[config.model].save_model_dir
        from_dir_backup = config[config.model].from_dir
        # never restart training in the same folder
        assert save_dir_backup != from_dir_backup
        print 'save dir ', save_dir_backup
        print 'from_dir ', from_dir_backup
        print 'setting current model config with the old one'
        model_config_old = utils.load_pkl(from_dir_backup + '/model_config.pkl')
        utils.set_config(config, model_config_old)
        config[config.model].save_model_dir = save_dir_backup
        config[config.model].from_dir = from_dir_backup
        config[config.model].reload_ = True
    if config.erase_history:
        print 'erasing everything in ', save_model_dir
        os.system('rm %s/*' % save_model_dir)

    # for stdout file logging
    # sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print 'saving model config into %s' % save_path
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])
    model_type = config.model
    print 'Model Type: %s' % model_type
    print 'Command: %s' % ' '.join(sys.argv)

    t0 = time.time()
    print 'training an attention model'
    train(**state.attention)
    if channel:
        channel.save()
    print 'training time in total %.4f sec' % (time.time() - t0)
def train_classifier_word_embedding(trainPlotFeatures, trainLabels,
                                    valPlotFeatures=None, valLabels=None):
    sequence_input = Input(shape=(3000,), dtype='int32')
    sequence_input_reverse = Input(shape=(3000,), dtype='int32')
    textModel = good_text_model(sequence_input, sequence_input_reverse,
                                use_embedding=True, trainable=False)
    textModel.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
    checkpoint = ModelCheckpoint(filepath='./data/models/text_checkpoint.h5',
                                 monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')
    checkpoint_loss = ModelCheckpoint(
        filepath='./data/models/text_checkpoint_loss.h5',
        monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint, remote, checkpoint_loss]
    if valLabels is not None:
        hist = textModel.fit(
            [trainPlotFeatures[0], trainPlotFeatures[1]], trainLabels,
            validation_data=([valPlotFeatures[0], valPlotFeatures[1]], valLabels),
            nb_epoch=50, batch_size=63, callbacks=callbacks_list)
    else:
        hist = textModel.fit([trainPlotFeatures[0], trainPlotFeatures[1]],
                             trainLabels, nb_epoch=100, batch_size=63,
                             callbacks=callbacks_list)
    textModel.save('data/models/_text.h5')
    print "Model saved at: data/models/_text.h5"
    histDict = hist.history
    dump_pkl(histDict, 'hist_text')
def gather_training_data(genre, model_name=default_model_name):
    trainPath = os.path.join(video_resource, 'train', genre)
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print(videoPath, ":")
        frames = list(get_frames(videoPath, time_step=1000))
        print(len(frames))
        if len(frames) == 0:
            print("corrupt.")
            continue
        videoFeatures = get_features_batch(frames, model_name)
        print(videoFeatures.shape)
        genreFeatures.append(videoFeatures)
    outPath = genre + "_ultimate_" + model_name
    dump_pkl(genreFeatures, outPath)
def _get_features_data(mode='val'):
    """Deprecated with the old dataset.

    Includes every sample with plotFeatures, videoFeatures, movie_id,
    and genreLabel.
    """
    featureData = []
    allData = load_pkl(baseName + mode)
    plots = []

    # process plot vectors
    for data in allData:
        movie_id = data['movie_id']
        plot = data['plot']
        plots.append(plot)
    if mode == 'train':
        textObj = Text()
        plotFeatures_all = textObj.fit_transform(plots)
        dump_pkl(textObj, 'plot_object_train')
    else:
        try:
            textObj = load_pkl('plot_object_train')
            plotFeatures_all = textObj.transform(plots).toarray()
        except:
            print "Please train the plots first."
            return

    plotIndex = -1
    for data in allData:
        plotIndex += 1
        movie_id = data['movie_id']
        path = glob(video_resource + str(movie_id) + '.*')[0]
        plot = data['plot']
        genreLabel = data['genreLabel']
        print plotIndex, "out of ", len(allData)
        print "Gathering features for", movie_id
        try:
            frames = list(get_frames(path, start_time=1000, end_time=200000,
                                     time_step=1000))
            videoFeatures = get_features_batch(frames, 'vgg16')
        except Exception as e:
            print e
            continue  # omit the movie if one of its features is bad
        # videoFeatures = None
        plotFeatures = plotFeatures_all[plotIndex]
        featureData.append({'videoFeatures': videoFeatures,
                            'plotFeatures': plotFeatures,
                            'movie_id': movie_id,
                            'genreLabel': genreLabel})
    dump_pkl(featureData, 'feature_data_' + mode)
def gather_genre(genre, limit_videos=100):
    print "Gathering features for", genre
    videoPaths = glob(video_resource + genre + '/*')[:limit_videos]
    genreFeatures = []
    for videoPath in videoPaths:
        videoFeatures = []
        print "extracting features for", videoPath
        for frame in get_frames(videoPath, time_step=1000):
            frameFeatures = get_features(frame)
            videoFeatures.append(frameFeatures)
        videoFeatures = np.array(videoFeatures)
        genreFeatures.append(videoFeatures)
    genreFeatures = np.array(genreFeatures)
    print "OK."
    print genreFeatures.shape
    dump_pkl(genreFeatures, genre + str(limit_videos))
def return_confident_results(mode='val'):
    model = load_moviescope_model('wiki_im_VisLang')
    genrePredictionDict = dict((i, []) for i in range(26))
    textObj = load_pkl('plot_object_train')
    labels, plotFeatures, videoFeatures, movieIds = gather_features(
        mode, return_id=True)
    _, videoFeatures = augment_labels_lstm(labels, videoFeatures, 200)
    predictionScores = model.predict([videoFeatures, plotFeatures])
    for index in range(len(predictionScores)):
        for i in range(26):
            genrePredictionDict[i].append(
                (predictionScores[index][i], movieIds[index]))
    dump_pkl(genrePredictionDict, 'genrePredictionDict_' + mode)
    for i in range(26):
        print sorted(genrePredictionDict[i], reverse=True)[:10]
    return
def parse_interactions():
    """draw interactions from `review.json`

    output: user_business_interact.csv, users_list.pkl
    """
    # business_profile only contains businesses in the candidate cities
    # (Las Vegas, Toronto, and Phoenix)
    print("\t[parse interactions] loading business_profile pickle...")
    business_profile = load_pkl(PREPROCESS_DIR + "business_profile.pkl")

    users, businesses, cities = [], [], []
    timestamps = []

    # create records as (user, business, city) tuples
    print("\t[parse interactions] loading review.json ...")
    with open(DATA_DIR + "review.json", "r") as fin:
        for ln in fin:
            data = json.loads(ln)
            _bid = data['business_id']
            if _bid not in business_profile:
                # only keep businesses in the candidate cities
                continue
            users.append(data['user_id'])
            businesses.append(_bid)
            cities.append(business_profile[_bid]["city"])
            timestamps.append(data['date'])

    interactions = pd.DataFrame({'user': users,
                                 'business': businesses,
                                 "city": cities,
                                 "timestamp": timestamps})
    interactions.to_csv(PREPROCESS_DIR + "user_business_interact.csv", index=False)

    # keep the remaining users for parse_user
    user_remained = interactions["user"].unique().tolist()
    dump_pkl(PREPROCESS_DIR + "users_list.pkl", user_remained)
def gather_histogram_data(genre, mode='train'):
    """Driver function to collect frame features for a genre"""
    trainPath = os.path.join(video_resource, mode, genre)
    print trainPath
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print videoPath, ":",
        frames = list(get_frames(videoPath, time_step=1000))
        print len(frames),
        if len(frames) == 0:
            print "corrupt."
            continue
        videoFeatures = np.array([get_histogram(frame) for frame in frames])
        print videoFeatures.shape
        genreFeatures.append(videoFeatures)
    outPath = genre + "_histogram_" + mode
    dump_pkl(genreFeatures, outPath)
def gather_training_data(genre, model_name=default_model_name):
    """Driver function to collect frame features for a genre"""
    trainPath = os.path.join(video_resource, 'train', genre)
    print trainPath
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print videoPath, ":",
        frames = list(get_frames(videoPath, time_step=1000))
        print len(frames),
        if len(frames) == 0:
            print "corrupt."
            continue
        videoFeatures = get_features_batch(frames, model_name)
        print videoFeatures.shape
        genreFeatures.append(videoFeatures)
    outPath = genre + "_ultimate_" + model_name
    dump_pkl(genreFeatures, outPath)
def train_mode(genres=['action', 'drama', 'horror', 'romance']):
    trainingData, trainingLabels = [], []
    for genreIndex, genre in enumerate(genres):
        try:
            genreFeatures = np.array(load_pkl(genre + '_histogram_train'))
        except Exception as e:
            print e
            return
        for videoFeatures in genreFeatures:
            for feature in videoFeatures:
                trainingData.append(feature)
                trainingLabels.append(genreIndex)
    trainingData = np.array(trainingData)
    trainingLabels = np.array(trainingLabels)
    print trainingData.shape
    print trainingLabels.shape
    print "Training..."
    model = RF(n_estimators=15, n_jobs=-1).fit(trainingData, trainingLabels)
    dump_pkl(model, "RF_histogram")
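# A sketch of how the dumped random forest might be used at inference time.
# This helper is hypothetical (not part of the original code); it assumes the
# same load_pkl/get_frames/get_histogram helpers as above and takes a majority
# vote over per-frame predictions.

import numpy as np


def predict_genre(videoPath, genres=['action', 'drama', 'horror', 'romance']):
    model = load_pkl("RF_histogram")
    frames = list(get_frames(videoPath, time_step=1000))
    histograms = np.array([get_histogram(frame) for frame in frames])
    framePreds = model.predict(histograms)             # one label per frame
    counts = np.bincount(framePreds, minlength=len(genres))
    return genres[int(np.argmax(counts))]              # majority vote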
def get_features_data(mode='val'):
    """Includes every sample with plotFeatures, videoFeatures, movie_id,
    and genreLabel.
    """
    featureData = []
    allData = load_pkl(baseName + mode)
    plots = []

    # process plot vectors
    for data in allData:
        movie_id = data['movie_id']
        plot = data['plot']
        plots.append(plot)
    if mode == 'train':
        textObj = WordEmbeddings()
        plotFeatures_all = textObj.fit_transform(plots)
        dump_pkl(textObj, 'plot_object_train')
    else:
        try:
            textObj = load_pkl('plot_object_train')
            plotFeatures_all = textObj.transform(plots, reverse=False)
        except Exception as e:
            print e
            print "Please train the plots first."
            return

    plotIndex = -1
    for data in allData:
        plotIndex += 1
        movie_id = data['movie_id']
        path = glob(video_resource + str(movie_id) + '.*')[0]
        plot = data['plot']
        genreLabel = data['newGenreLabels']
        print plotIndex, "out of ", len(allData)
        print "Gathering features for", movie_id
        videoFeatures = data['videoFeatures']
        plotFeatures = plotFeatures_all[plotIndex]
        featureData.append({'videoFeatures': videoFeatures,
                            'plotFeatures': plotFeatures,
                            'movie_id': movie_id,
                            'genreLabel': genreLabel})
    dump_pkl(featureData, 'feature_data_' + mode)
def process_record(cnn, data):
    if os.path.exists(os.path.join(FLAGS.feats_dir, data['feat_path'])):
        print "Already processed ... "
        return
    video_fullpath = os.path.join(FLAGS.videos_dir, data['video_path'])
    try:
        cap = cv2.VideoCapture(video_fullpath)
    except:
        ipdb.set_trace()

    frame_count = 0
    frame_list = []
    while True:
        # capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break
        frame_list.append(frame)
        frame_count += 1
    if frame_count == 0:
        return

    frame_list = np.array(frame_list)
    if frame_count > FLAGS.nr_frames:
        # sample a random window of 10 consecutive frames
        start = np.random.randint(0, frame_count - 10)
        frame_indices = np.arange(start, start + 10)
        frame_list = frame_list[frame_indices]
    cropped_frame_list = np.array(
        map(lambda x: cnn.preprocess_frame(FLAGS.cropping_sizes, x), frame_list))
    feats = cnn.get_features(cropped_frame_list)
    save_full_path = os.path.join(FLAGS.feats_dir, data['feat_path'])
    dump_pkl(feats, save_full_path)
def parse_business():
    """extract business information from business.json

    output: business_profile.pkl, city_business.pkl
    """
    city_business = {}       # dictionary of city: [business list]
    business_profiles = {}   # dictionary of business profiles

    # count businesses by location (city and state)
    print("\t[parse_business] preprocessing all business without selecting cities ...")
    with open(DATA_DIR + "business.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            city = data['city']
            if city not in CANDIDATE_CITY:  # only keep candidate cities
                continue
            business_id = data["business_id"]

            # removed fields: id, state, attributes, and hours
            # remaining fields: name, address, postal_code, latitude/longitude,
            #                   stars, review_count, is_open
            del data["business_id"], data["state"]
            del data["attributes"], data["hours"]
            business_profiles[business_id] = data

            # save business id to the city_business dictionary
            city_business[city] = city_business.get(city, [])
            city_business[city].append(business_id)

    # save the city-business mapping
    print("\t[parse business] dumping business_profile and city_business ...")
    dump_pkl(PREPROCESS_DIR + "business_profile.pkl", business_profiles)
    dump_pkl(PREPROCESS_DIR + "city_business.pkl", city_business)
def train_from_scratch(config, state, channel):
    model_type = config.model

    # set up some fields in config automatically
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784

    # manipulate the `state`: save the config file
    save_model_path = config.save_model_path
    if save_model_path == 'current':
        config.save_model_path = './'
        # to facilitate the use of a cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_path + 'model_config.pkl'
    utils.create_dir_if_not_exist(config.save_model_path)

    # for stdout file logging
    # sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print 'saving model config into %s' % save_path
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    print 'Model Type: %s' % model_type
    print 'Host: %s' % socket.gethostname()
    print 'Command: %s' % ' '.join(sys.argv)
    print 'initializing data engine'
    input_dtype = 'float32'
    target_dtype = 'int32'
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine, channel)
def gather_testing_data(genre, model_name=default_model_name):
    """Driver function to collect frame features for a genre"""
    testPath = os.path.join(video_resource, 'test', genre)
    print(testPath)
    videoPaths = glob(testPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print(videoPath, ":")
        frames = list(get_frames(videoPath, time_step=1000))
        print(len(frames))
        if len(frames) == 0:
            print("corrupt.")
            continue
        # pass model_name explicitly; the original call omitted it even though
        # the output name below depends on it
        videoFeatures = get_features_batch(frames, model_name)
        print(videoFeatures.shape)
        genreFeatures.append(videoFeatures)
    outPath = genre + "_test_" + model_name
    dump_pkl(genreFeatures, outPath)
if args.modality == "text":
    vocab_lim = "full" if args.vocab_lim is None else str(args.vocab_lim)
    vocab_fname = vocab_dict[str(vocab_lim)]
    assert exists(join(local_data_path, vocab_fname + ".p")), \
        "build vocabulary first"
    vocab = load_pkl(local_data_path, vocab_fname)
    trainset, valset = build_text_dataset(meta_dict, vocab, args.data_split)
    train_caps_fname = "train_seq_" + str(args.vocab_lim)
    val_caps_fname = "val_seq_" + str(args.vocab_lim)
    dump_pkl(trainset, local_data_path, train_caps_fname)
    dump_pkl(valset, local_data_path, val_caps_fname)
elif args.modality == "vis":
    build_vis_dataset(meta_dict)
elif args.modality == "full":
    vocab_lim = "full" if args.vocab_lim is None else str(args.vocab_lim)
    vocab_fname = vocab_dict[str(vocab_lim)]
    assert exists(join(local_data_path, vocab_fname + ".p")), \
        "build vocabulary first"
    vocab = load_pkl(local_data_path, vocab_fname)
    vis_dir = join(local_data_path, "crops")
def extract_user_attr(city):
    """extract user attributes

    Args:
        city - the city to process
    Save to disk:
        df_nonzero - ratio of non-zero values for each attribute
    Return:
        df_nonzero - as above.
    """
    print("\t[user] loading user interaction from {}...".format(city))
    user_profile = load_pkl(INPUT_DIR + "{}/city_user_profile.pkl".format(city))
    user_loc = load_pkl(INPUT_DIR + "{}/city_user_loc.pkl".format(city))

    user_data_csv = []
    user_data_pkl = {}

    # process users. NOTE: new user indices start at 1
    for uid, prof_dict in user_profile.items():
        # --- create feature area ---
        # after checking, each user has all attributes and a non-zero review_count
        tmp_entry = dict()
        u_elite = prof_dict.get('elite', [])
        tmp_entry['elite_count'] = len(u_elite)  # user elite
        tmp_entry['review_count'] = prof_dict.get('review_count', CNT_DFL)  # review count
        tmp_entry['fans_count'] = prof_dict.get('fans', CNT_DFL)  # fans
        tmp_entry['funny_score'] = prof_dict.get('funny', CNT_DFL)  # funny
        tmp_entry['cool_score'] = prof_dict.get('cool', CNT_DFL)  # cool
        tmp_entry['useful_score'] = prof_dict.get('useful', CNT_DFL)  # useful
        tmp_entry['avg_stars'] = prof_dict.get('average_stars', STAR_DFL)  # average stars
        tmp_entry['mean_lat'] = user_loc[uid]["mean_lat"]
        tmp_entry['mean_long'] = user_loc[uid]["mean_long"]

        # yelping years
        reg_yelp_date = prof_dict.get('yelping_since', DATE_DFL)
        delta_time = datetime.today() - parser.parse(reg_yelp_date)
        tmp_entry['yelping_years'] = delta_time.days // 365
        # --- end create feature area ---

        user_data_csv.append(tmp_entry)
        user_data_pkl[uid] = tmp_entry

    # create the data frame, prepending an all-default row at index 0
    empty_head_entry = pd.DataFrame({'elite_count': 0,
                                     "review_count": CNT_DFL,
                                     "fans_count": CNT_DFL,
                                     "funny_score": CNT_DFL,
                                     "cool_score": CNT_DFL,
                                     "useful_score": CNT_DFL,
                                     "avg_stars": STAR_DFL,
                                     "yelping_years": 0,
                                     "mean_lat": CNT_DFL,
                                     "mean_long": CNT_DFL}, index=[0])
    df_user_profile = pd.DataFrame(user_data_csv)
    assert empty_head_entry.shape[1] == df_user_profile.shape[1]
    df_user_profile = pd.concat([empty_head_entry, df_user_profile],
                                axis=0, sort=True).reset_index(drop=True)
    print("\t[user] length of `df_user_profile` {}".format(df_user_profile.shape[0]))

    # ratio of non-zero values per attribute
    df_nonzero = df_user_profile.fillna(0).astype(bool).sum(axis=0)
    df_nonzero = df_nonzero / len(df_user_profile)
    print("\t[user] non-zero terms in `df_user_profile`")
    print(df_nonzero)

    print("\t[user] saving dataframe to {}".format(OUTPUT_DIR))
    df_user_profile.to_csv(
        OUTPUT_DIR + "{}/processed_city_user_profile.csv".format(city), index=False)
    dump_pkl(path=OUTPUT_DIR + "{}/processed_city_user_profile.pkl".format(city),
             obj=user_data_pkl)
    return df_nonzero
def discretize_field_attr(city):
    """Discretize continuous fields. Feature indices start from 1 instead of 0.

    Args:
        city - the city
        [DEPRECATED] num_bkt - the number of buckets for embedding continuous
            values; >0 for `num_bkt` buckets, -1 for a total discretization,
            i.e., take integers as discrete values

    Columns: avg_stars, cool_score, elite_count, fans_count, funny_score,
        review_count, useful_score, yelping_years
    """
    col_configs = load_configs(city)

    print("\t[user] discretize - loading user attrs")
    df = pd.read_csv(INPUT_DIR + "{}/processed_city_user_profile.csv".format(city))

    cols_disc_info = dict()
    ft_idx_start = 1
    distinct_df_col_names, distinct_df_cols = [], []
    le = LabelEncoder()  # for transforming categorical features

    for col in df.columns:
        if col in col_configs['CATEGORICAL']:
            # treat the attribute as a discrete variable
            distinct_df_cols.append(
                pd.Series(le.fit_transform(df[col])) + ft_idx_start)
            distinct_df_col_names.append(col + "_d_dist")
            num_vals = len(le.classes_)
            vals_map_to = le.transform(le.classes_) + ft_idx_start
            vals_map = dict(zip(le.classes_, vals_map_to))
            entry = {"bucket": False, "value_map": vals_map, "count": num_vals,
                     "max_idx": max(vals_map_to), "min_idx": min(vals_map_to)}
            ft_idx_start += num_vals
        elif col in col_configs['NUMERICAL']:
            # treat the attribute as a continuous variable
            num_bkt = col_configs.getint("NUMERICAL", col)
            max_val, min_val = df[col].max(), df[col].min()
            distinct_df_col_names.append(col + "_c_dist")
            distinct_df_cols.append(
                pd.cut(df[col], num_bkt,
                       labels=range(ft_idx_start, ft_idx_start + num_bkt)))
            entry = {"bucket": True, "max_val": max_val, "min_val": min_val,
                     "count": num_bkt, "min_idx": ft_idx_start,
                     "max_idx": ft_idx_start + num_bkt - 1}
            ft_idx_start += num_bkt
        else:
            raise KeyError("{} is NOT configured in `columns_{}.ini`".format(col, city))
        cols_disc_info[col] = entry

    df_disc = pd.DataFrame(data=dict(zip(distinct_df_col_names, distinct_df_cols)))

    print("\t[user] discretize - saving dist. attr. and info to {}".format(OUTPUT_DIR))
    df_disc.to_csv(OUTPUT_DIR + "{}/processed_city_user_profile_dist.csv".format(city),
                   index=False)
    dump_pkl(OUTPUT_DIR + "{}/cols_disc_info.pkl".format(city), cols_disc_info)
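# To illustrate the bucketing above: a toy run of `pd.cut` with integer labels
# offset by `ft_idx_start` (hypothetical values, for illustration only).

import pandas as pd

stars = pd.Series([1.0, 2.5, 3.0, 4.8, 5.0])
ft_idx_start, num_bkt = 1, 4
# cut the range [1.0, 5.0] into 4 equal-width buckets labeled 1..4
buckets = pd.cut(stars, num_bkt,
                 labels=range(ft_idx_start, ft_idx_start + num_bkt))
print(buckets.tolist())  # [1, 2, 2, 4, 4]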
def parse_yelp(args):
    """draw reviews from `review.json`"""
    assert hasattr(args, "min_cat_num"), "Please set `min_cat_num` for yelp."
    assert hasattr(args, "k_core"), "Please set `k_core` for yelp."

    in_dir = "/local2/zyli/irs_fn/data/raw/yelp/"
    # in_dir = INPUT_DIR + "yelp/"
    out_dir = OUTPUT_DIR + "yelp/"

    print("[Yelp] processing yelp dataset ...")
    print("[Yelp] loading business ...")
    food_cats = load_yelp_categories()
    business_profiles = dict()
    with open(in_dir + "business.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            # an entry must have all of the fields below
            if not all([bool(data[x])
                        for x in ['business_id', 'review_count', 'categories']]):
                continue
            if data['review_count'] < args.k_core:
                continue
            categories = [x.strip().lower()
                          for x in data['categories'].strip().split(", ")]
            filter_cats_num = sum([x in food_cats for x in categories])
            # throw away a business in two cases:
            #   1. none of its categories appear in food_cats
            #   2. some categories are not in food_cats, and fewer than
            #      args.min_cat_num of them are in food_cats
            if (not filter_cats_num) or \
                    (len(categories) != filter_cats_num
                     and filter_cats_num < args.min_cat_num):
                continue
            bid = data['business_id']
            business_profiles[bid] = {'review_count': data['review_count'],
                                      'categories': categories}

    print("[Yelp] loading reviews ...")
    bid2idx, uid2idx = dict(), dict()
    uniq_bids, uniq_uids = [], []
    review_bids, review_uids = [], []
    review_set = dict()
    with open(in_dir + "review.json", "r") as fin:
        for ln in fin:
            data = json.loads(ln)
            # make sure all four fields are not `None`
            if not all([bool(data[x])
                        for x in ['business_id', 'user_id', 'stars', 'text']]):
                continue
            bid, uid = data['business_id'], data['user_id']
            if bid not in business_profiles:
                continue
            if bid not in bid2idx:
                new_bid = "b_" + str(len(uniq_bids))
                uniq_bids.append(bid)
                assert uniq_bids[-1] == uniq_bids[int(new_bid[2:])]
                bid2idx[bid] = new_bid
            else:
                new_bid = bid2idx[bid]
            if uid not in uid2idx:
                new_uid = "u_" + str(len(uniq_uids))
                uniq_uids.append(uid)
                assert uniq_uids[-1] == uniq_uids[int(new_uid[2:])]
                uid2idx[uid] = new_uid
            else:
                new_uid = uid2idx[uid]
            review_bids.append(new_bid)
            review_uids.append(new_uid)
            # NOTE: new_uid and new_bid are `u_[user_idx]` and `b_[bus_idx]`
            review_set[(new_uid, new_bid)] = {"user_id": new_uid,
                                              "item_id": new_bid,
                                              "rating": data['stars'],
                                              "review": data["text"]}
    assert len(review_bids) == len(review_uids)

    print("[Yelp] building k_core graph, k={} ...".format(args.k_core))
    G = nx.Graph()
    G.add_edges_from(zip(review_uids, review_bids))
    print("[Yelp]\t num of nodes [{}] and edges [{}] before k_core.".format(
        G.number_of_nodes(), G.number_of_edges()))
    G_kcore = nx.algorithms.core.k_core(G, k=args.k_core)

    # check that all edges are "ub" or "bu"
    assert all([x[0] + y[0] in ["bu", "ub"] for x, y in G_kcore.edges()]), \
        "NOT all edges are u-b or b-u!"

    # unify edges from "u-b" or "b-u" to "u-b" to query `review_set`
    G_kcore_edges = [(x, y) if x[0] == "u" else (y, x)
                     for x, y in G_kcore.edges()]
    kcore_dataset = [review_set[tp] for tp in G_kcore_edges]
    print("[Yelp]\t num of nodes [{}] and edges [{}] after k_core.".format(
        G_kcore.number_of_nodes(), G_kcore.number_of_edges()))

    # create a dataframe to save/view the k-core dataset
    kcore_df = pd.DataFrame(kcore_dataset)
    print("[Yelp] \t number of unique users [{}] and businesses [{}]".format(
        kcore_df["user_id"].nunique(), kcore_df['item_id'].nunique()))
    print("[Yelp] \t unique ratings {}".format(kcore_df['rating'].unique()))

    print("[Yelp] dumping data and four ref pickles ...")
    make_dir(out_dir)
    kcore_df.to_csv(out_dir + "data.csv", index=False,
                    columns=['user_id', 'item_id', 'rating', 'review'])
    dump_pkl(out_dir + "bid2idx.pkl", bid2idx)
    dump_pkl(out_dir + "uniq_bids.pkl", uniq_bids)
    dump_pkl(out_dir + "uid2idx.pkl", uid2idx)
    dump_pkl(out_dir + "uniq_uids.pkl", uniq_uids)
    print("[Yelp] preprocessing done, files saved to {}".format(out_dir))

    train, test = train_test_split(kcore_df, test_size=test_split_ratio)
    train.to_csv('sTrainData.csv')
    test.to_csv('sTestData.csv')
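# A toy illustration of the k-core filtering used above (not from the original
# code): nodes with degree < k are pruned iteratively, so a user with a single
# review disappears from the 2-core.

import networkx as nx

G_toy = nx.Graph()
G_toy.add_edges_from([("u_0", "b_0"), ("u_0", "b_1"),
                      ("u_1", "b_0"), ("u_1", "b_1"),
                      ("u_2", "b_0")])  # u_2 has only one review
core = nx.k_core(G_toy, k=2)
print(sorted(core.nodes()))  # ['b_0', 'b_1', 'u_0', 'u_1']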
def parse_goodreads(args):
    print("[Goodreads] processing goodreads dataset ...")
    json_data = []
    # replace "goodreads.json" with the path of your file
    with open('goodreads.json', 'r') as handle:
        for line in handle:
            json_data.append(json.loads(line))
    data_raw = pd.DataFrame(json_data)
    data = data_raw.drop(['review_id', 'date_added', 'date_updated', 'read_at',
                          'started_at', 'n_votes', 'n_comments'], axis=1)

    def normalizeNulls(a):
        a.replace('N/A', np.NaN, inplace=True)
        a.replace('Null', np.NaN, inplace=True)
        a.replace('NULL', np.NaN, inplace=True)
        a.replace('null', np.NaN, inplace=True)
        a.replace('', np.NaN, inplace=True)
        a.replace('None', np.NaN, inplace=True)
        a.replace('none', np.NaN, inplace=True)

    normalizeNulls(data)
    data.dropna(inplace=True)  # drop N/A or null values
    data.reset_index(drop=True, inplace=True)  # reindex after dropping rows

    # assign a unique id to each user and book
    # (vectorized rewrite of the original per-row loops, which would break on
    # the gappy index left by dropna)
    data['user_id'] = 'u_' + data.groupby('user_id').ngroup().astype(str)
    data['book_id'] = 'b_' + data.groupby('book_id').ngroup().astype(str)

    review_set = dict()
    for index, row in data.iterrows():
        review_set[(row['user_id'], row['book_id'])] = {
            "user_id": row['user_id'],
            "book_id": row['book_id'],
            "rating": row['rating'],
            "review": row['review_text']
        }

    # 5-core has to be applied here
    G = nx.Graph()
    # uids for users, bids for books
    G.add_edges_from(zip(data['user_id'], data['book_id']))
    print("[Goodreads]\t num of nodes [{}] and edges [{}] before k_core.".format(
        G.number_of_nodes(), G.number_of_edges()))
    G_kcore = nx.algorithms.core.k_core(G, k=5)

    # check that all edges are "ub" or "bu"
    assert all([x[0] + y[0] in ["bu", "ub"] for x, y in G_kcore.edges()]), \
        "NOT all edges are u-b or b-u!"

    # unify edges from "u-b" or "b-u" to "u-b" to query `review_set`
    G_kcore_edges = [(x, y) if x[0] == "u" else (y, x)
                     for x, y in G_kcore.edges()]
    kcore_dataset = [review_set[tp] for tp in G_kcore_edges]
    print("[Goodreads]\t num of nodes [{}] and edges [{}] after k_core.".format(
        G_kcore.number_of_nodes(), G_kcore.number_of_edges()))

    # create a dataframe to save/view the k-core dataset;
    # the main data now lives in `kcore_df`, not in `data`
    kcore_df = pd.DataFrame(kcore_dataset)
    print(kcore_df.head())

    def clean_text(unclean_text):
        cleaned = wc.clean_html(unclean_text)
        cleaned = wc.clean_str2(cleaned)
        stop_words = text.ENGLISH_STOP_WORDS
        cleaned = wc.remove_stopwords(cleaned, stop_words)
        cleaned = wc.lemmatized_string(cleaned)
        return cleaned

    # clean kcore_df's own reviews (the original applied the cleaner to the
    # pre-k-core `data` frame, whose index no longer lines up with kcore_df)
    kcore_df['review'] = kcore_df['review'].apply(clean_text)
    kcore_df.to_csv('cleanData.csv')

    def stem_text(unstemmed_text):
        return wc.stemmed_string(unstemmed_text)

    kcore_df['review_text'] = kcore_df['review'].apply(stem_text)

    # count users and books
    book_count = kcore_df['book_id'].nunique()
    user_count = kcore_df['user_id'].nunique()
    num_reviews = len(kcore_df)

    # save stats of the dataset in stats.json
    stats_dict = {'Number of books': str(book_count),
                  'Number of users': str(user_count),
                  'Number of reviews': str(num_reviews)}
    with open('stats.json', 'w') as fp:
        json.dump(stats_dict, fp)

    kcore_df.to_csv('stemmedCleanData.csv')
    train, test = train_test_split(kcore_df, test_size=test_split_ratio)
    train.to_csv('sTrainData.csv')
    test.to_csv('sTestData.csv')

    # `out_dir` was undefined in the original; defined here by analogy with
    # parse_yelp (assumed)
    out_dir = OUTPUT_DIR + "goodreads/"
    make_dir(out_dir)
    uniq_bids = kcore_df['book_id'].unique()
    uniq_uids = kcore_df['user_id'].unique()
    dump_pkl(out_dir + "uniq_bids.pkl", uniq_bids)
    dump_pkl(out_dir + "uniq_uids.pkl", uniq_uids)