def msgpack_assertMeta(filename, frames=None, redo=False):
    '''Asserts that the .meta file for a given .msg file exists and returns
    the data in the .meta file once it exists'''
    meta_out_file = filename.replace(".msg", ".meta")
    print(meta_out_file)
    meta_frames = None
    if (os.path.exists(meta_out_file) and not redo):
        #Need to check for latin encodings due to weird pandas default
        try:
            meta_frames = pd.read_msgpack(meta_out_file)
        except UnicodeDecodeError as e:
            meta_frames = pd.read_msgpack(meta_out_file, encoding='latin-1')

    if (meta_frames == None):
        if (frames == None):
            print("Bulk reading .msg for metaData assertion. Be patient, reading in slices not supported.")
            print(filename)
            #Need to check for latin encodings due to weird pandas default
            try:
                frames = pd.read_msgpack(filename)
            except UnicodeDecodeError as e:
                frames = pd.read_msgpack(filename, encoding='latin-1')
        meta_frames = {"NumValues": frames["NumValues"]}

    if (not os.path.exists(meta_out_file) or redo):
        pd.to_msgpack(meta_out_file, meta_frames)
    return meta_frames
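# Hedged usage sketch (not part of the original code): how msgpack_assertMeta
# might be called. The file name "events.msg" and the helper name
# _demo_msgpack_assertMeta are hypothetical. The first call bulk-reads the
# .msg file and writes the companion .meta file; later calls just reload it.
def _demo_msgpack_assertMeta():
    meta = msgpack_assertMeta("events.msg")
    return len(meta["NumValues"].index)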
def merge_order_df(start='2016-11-01', end='2016-11-30', use_cache=True,
                   remove_pool=False):
    """
    Concatenate order dataframes for given dates
    """
    if remove_pool:
        cache_path = os.path.join(CACHE_DIR, f'merged_orders_no_pool.msgpack')
    else:
        cache_path = os.path.join(CACHE_DIR, f'merged_orders.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        orders = pd.read_msgpack(cache_path)
    else:
        df_new_list = []
        date_str_list = get_date_list(start=start, end=end)
        for date in date_str_list:
            order = read_data('order', date=date, sample=1)
            df_new_list += [order.copy()]
        orders = pd.concat(df_new_list, sort=False)
        ##################################
        # Removing orders where the ride duration is greater than 180 minutes
        orders = orders[orders.ride_duration <= 180]
        orders.sort_values(['driver_id', 'ride_start_timestamp'], inplace=True)
        ##################################
        pd.to_msgpack(cache_path, orders)
        print(f'Dumping to {cache_path}')
    return orders
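# The read-cache-or-recompute pattern used in merge_order_df() recurs in most
# of the loaders below. A minimal sketch that factors it out, assuming
# module-level `os` and `pd` imports; the helper name cache_msgpack and its
# signature are assumptions, not part of the original code.
def cache_msgpack(cache_path, compute, use_cache=True):
    """Return the object cached at cache_path, or call compute(), store the
    result with pd.to_msgpack and return it."""
    if use_cache and os.path.exists(cache_path):
        print(f'{cache_path} exists')
        return pd.read_msgpack(cache_path)
    result = compute()
    pd.to_msgpack(cache_path, result)
    print(f'Dumping to {cache_path}')
    return result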
def predictProteins(self, recs, length=11, names=None,
                    alleles=[], save=False, label='', path=''):
    """Get predictions for a set of proteins and/or over multiple alleles
       recs: a pandas DataFrame with cds data
       returns a dataframe of predictions over multiple proteins"""

    if not isinstance(alleles, list):
        alleles = [alleles]
    self.length = length
    recs = sequtils.getCDS(recs)
    if names != None:
        recs = recs[recs.locus_tag.isin(names)]
    proteins = list(recs.iterrows())
    results = []
    for i, row in proteins:
        st = time.time()
        seq = row['translation']
        name = row['locus_tag']
        #print name
        res = []
        for a in alleles:
            #print a
            df = self.predict(sequence=seq, length=length,
                              allele=a, name=name)
            if df is not None:
                res.append(df)
        res = pd.concat(res)
        if save == True:
            fname = os.path.join(path, name + '.mpk')
            pd.to_msgpack(fname, res)
    print('predictions done for %s proteins' % len(proteins))
    return
def load_pandas(file_name='review.json', use_cache=True):
    cache_path = os.path.join(CACHE_PATH, f'load_pandas.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        ratings, user_counts, active_users = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        line_count = len(
            open(os.path.join(EXCEL_PATH, file_name),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates, text = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, file_name), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
                text += [blob["text"]]
        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "text": text,
            "date": dates
        })
        user_counts = ratings["user_id"].value_counts()
        active_users = user_counts.loc[user_counts >= 5].index.tolist()
        pd.to_msgpack(cache_path, (ratings, user_counts, active_users))
        print(f'Dumping to {cache_path}')
    return ratings, user_counts, active_users
def loadProject(self, filename=None, asksave=False):
    """Open project file"""

    w = True
    if asksave == True:
        w = self.closeProject()
    if w == None:
        return
    if filename == None:
        filename = filedialog.askopenfilename(defaultextension='.dexpl',
                                              initialdir=os.getcwd(),
                                              filetypes=[("project", "*.dexpl"),
                                                         ("All files", "*.*")],
                                              parent=self.main)
    if not filename:
        return
    if os.path.isfile(filename):
        #pb = self.progressDialog()
        #t = threading.Thread()
        #t.__init__(target=pd.read_msgpack, args=(filename))
        #t.start()
        data = pd.read_msgpack(filename)
        #create backup file before we change anything
        backupfile = filename + '.bak'
        pd.to_msgpack(backupfile, data, encoding='utf-8')
    else:
        print('no such file')
        data = None
    self.newProject(data)
    self.filename = filename
    self.main.title('%s - DataExplore' % filename)
    self.projopen = True
    return
def groupby_1_count(orders, use_cache=True, use_radial=False):
    """
    Grouping to get number of rides of a driver in each time bin
    :param orders: Orders dataframe with time bin columns
    :param use_cache: Use previous cache or not
    :return: Grouped pandas dataframe
    """
    if use_radial:
        group_col = 'pick_up_radial_zone'
    else:
        group_col = 'ride_start_timestamp_bin'
    cache_path = os.path.join(CACHE_DIR, f'groupby1.msgpack')
    if use_cache and os.path.exists(cache_path):
        temp1 = pd.read_msgpack(cache_path)
        print(f'Loading from {cache_path}')
    else:
        grouped_tmp = orders[[
            'driver_id', 'ride_start_timestamp_bin', 'order_id'
        ]].groupby(['driver_id', 'ride_start_timestamp_bin']).count() / orders[[
            'driver_id', 'ride_start_timestamp_bin', 'order_id'
        ]].groupby(['driver_id'])[['order_id']].count()
        temp1 = unstack_func(grouped_tmp)
        pd.to_msgpack(cache_path, temp1)
        print(f'Dumping to {cache_path}')
    return temp1
def get_spatial_features_hex(df, resolution=6, use_cache=True):
    print('Now creating spatial features')
    cache_path = os.path.join(CACHE_DIR, f'hex_spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        minlat = min(df.pickup_latitude)
        minlong = min(df.pickup_longitude)
        maxlat = max(df.pickup_latitude)
        maxlong = max(df.pickup_longitude)

        geoJson = {
            'type': 'Polygon',
            'coordinates': [[[minlat, minlong], [minlat, maxlong],
                             [maxlat, maxlong], [maxlat, minlong]]]
        }

        hexagons = list(h3.polyfill(geoJson, resolution))
        xy_pickup = utm.from_latlon(df.pickup_latitude.values,
                                    df.pickup_longitude.values)
        x_pickup = list(xy_pickup[0])
        y_pickup = list(xy_pickup[1])
        pickup_point = list(zip(x_pickup, y_pickup))

        poly_hex = dict()
        for i, hex in enumerate(hexagons):
            polygons = h3.h3_set_to_multi_polygon([hex], geo_json=False)
            a = np.array(polygons[0][0])
            b = utm.from_latlon(a[:, 0], a[:, 1])
            poly_hex[i] = list(zip(b[0], b[1]))

        pick_zone = np.zeros(len(df)) - 1
        for j, p in enumerate(pickup_point):
            point = Point(p)
            for i in range(len(poly_hex)):
                polygon = Polygon(poly_hex[i])
                if polygon.contains(point):
                    pick_zone[j] = int(i)
                    break

        df['pickup_zone'] = pick_zone
        grouped_tmp = df[[
            'driver_id', 'pickup_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pickup_zone']).count() / df[[
            'driver_id', 'pickup_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()
        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')
    return temp
def store(filepath, outputdir, rerun=False, storeType="hdf5"):
    filename = os.path.splitext(ntpath.basename(filepath))[0]
    if (storeType == "hdf5"):
        out_file = outputdir + filename + ".h5"
        print(out_file)
        store = pd.HDFStore(out_file)
        keys = store.keys()
        #print("KEYS:", set(keys))
        #print("KEYS:", set(["/"+key for key in OBJECT_TYPES+["NumValues"]]))
        #print("KEYS:", set(keys)==set(["/"+key for key in OBJECT_TYPES+["NumValues"]]))
        if (set(keys) != set(
                ["/" + key for key in OBJECT_TYPES + ["NumValues"]]) or rerun):
            #print("OUT",out_file)
            try:
                frames = delphes_to_pandas(filepath)
            except Exception as e:
                print(e)
                print("Failed to parse file %r. File may be corrupted." % filepath)
                return 0
            try:
                for key, frame in frames.items():
                    store.put(key, frame, format='table')
            except Exception as e:
                print(e)
                print("Failed to write to HDFStore %r" % out_file)
                return 0
        num = len(store.get('NumValues').index)
        store.close()
    elif (storeType == "msgpack"):
        out_file = outputdir + filename + ".msg"
        # meta_out_file = outputdir + filename + ".meta"
        print(out_file)
        if (not os.path.exists(out_file) or rerun):
            try:
                frames = delphes_to_pandas(filepath)
            except Exception as e:
                print(e)
                print("Failed to parse file %r. File may be corrupted." % filepath)
                return 0
            try:
                pd.to_msgpack(out_file, frames)
            except Exception as e:
                print(e)
                print("Failed to write msgpack %r" % out_file)
                return 0
            # pd.to_msgpack(meta_out_file, meta_frames)
            meta_frames = msgpack_assertMeta(out_file, frames)
        else:
            meta_frames = msgpack_assertMeta(out_file)

        num = len(meta_frames["NumValues"].index)
    # elif(not os.path.exists(meta_out_file)):
    #     print(".meta file missing creating %r" % meta_out_file)
    #     frames = pd.read_msgpack(out_file)
    #     meta_frames = {"NumValues" : frames["NumValues"]}
    #     pd.to_msgpack(meta_out_file, meta_frames)
    else:
        raise ValueError("storeType %r not recognized" % storeType)
    return num
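# Hedged sketch of the reading side for files written by store(). The loader
# name and behaviour are assumptions: for "hdf5" it pulls every key out of the
# HDFStore, for "msgpack" it relies on pd.read_msgpack returning the dict of
# frames written above.
def load_stored(out_file, storeType="hdf5"):
    if storeType == "hdf5":
        with pd.HDFStore(out_file) as s:
            return {key.lstrip('/'): s.get(key) for key in s.keys()}
    elif storeType == "msgpack":
        return pd.read_msgpack(out_file)
    raise ValueError("storeType %r not recognized" % storeType)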
def doSaveProject(self, filename):
    """Save sheets as dict in msgpack"""

    data = {}
    for i in self.sheets:
        data[i] = self.sheets[i].model.df
    pd.to_msgpack(filename, data, encoding='utf-8')
    return
def create_features(start='2016-11-01', end='2016-11-30', use_cache=True,
                    save_file=True):
    """
    Creates all features going into the linear model
    :param start: Start date of rides
    :param end: End date of rides
    :param use_cache: Use existing cache
    :return: Returns df with all features
    """
    cache_path = os.path.join(CACHE_DIR, f'features_orders.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        df_final = pd.read_msgpack(cache_path)
    else:
        orders = merge_order_df(start, end, use_cache=True)
        pool_rides(orders)
        get_start_end_bins(orders,
                           ['ride_start_timestamp', 'ride_stop_timestamp'])
        # breakpoint()
        print('a')
        import time
        a = time.time()
        temp1 = groupby_1_count(orders, use_cache=True)
        temp2 = groupby_2_sum(orders, use_cache=True)
        print(time.time() - a)
        df_new = orders.groupby(['driver_id']).agg({
            'order_id': 'count',
            'is_pool': 'sum'
        }).reset_index()
        print('c')
        df_new.rename(columns={
            'order_id': 'num_total_rides',
            'is_pool': 'num_pool_rides'
        }, inplace=True)
        df_new['% of pool rides'] = (
            df_new['num_pool_rides'] / df_new['num_total_rides'])
        print('d')
        print(f'Dumping to {cache_path}')
        df_final = pd.merge(df_new, temp1, on=['driver_id'], how='inner')
        # TODO check
        df_final = pd.merge(df_final, temp2, on=['driver_id'], how='inner',
                            suffixes=('_count', '_sum'))
        if save_file:
            pd.to_msgpack(cache_path, df_final)
    return df_final
def save_msgpack(self, filename=None):
    """Save as msgpack format - experimental"""

    if filename == None:
        filename = 'epit_%s_%s_%s.msg' % (label, self.name, self.length)
    print('saving as %s' % filename)
    meta = {'method': self.name, 'length': self.length}
    pd.to_msgpack(filename, meta)
    for i, g in self.data.groupby('name'):
        pd.to_msgpack(filename, g, append=True)
    return
def write_legacy_msgpack(output_dir):
    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: msgpack")

    pth = '{0}.msgpack'.format(platform_name())
    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data())

    print("created msgpack file: %s" % pth)
def doSaveProject(self, filename):
    """Save sheets as dict in msgpack"""

    self._checkTables()
    data = {}
    for i in self.sheets:
        table = self.sheets[i]
        data[i] = {}
        data[i]['table'] = table.model.df
        data[i]['meta'] = self.saveMeta(table)

    pd.to_msgpack(filename, data, encoding='utf-8')
    return
def _prepare_for_parallel(XY, serialize_flavor='feather'):
    """
    Utility function to set up the data for parallel processing with low
    memory overhead.

    Parameters
    ----------
    XY : pandas dataframe
        The combined independent variables/features and response/target
        variable.
    serialize_flavor : str
        Which serialization mode to use; currently supports 'feather' and
        'msgpack'. Unfortunately, as of pandas 0.25.0, msgpack is no longer
        supported. Note: using feather requires pyarrow.

    Returns
    -------
    save_name : str
        The name of the temporary data.
    save_path : str
        The name of the temporary file path.
    save_ext : str
        The name of the temporary file extension.
    num_chunks : int
        The number of chunks inside the pandas msgpack file (determined by
        dataframe size).
    """
    save_path = 'forward_tmp'
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    save_name = '/forward_run_' + str(int(mktime(datetime.now().timetuple())))
    save_ext = ''
    if serialize_flavor == 'msgpack':
        save_ext = '.msg'
        num_chunks = int(((XY.memory_usage(index=True).sum() / (1024**3)) / 2) + 1)
        pd.to_msgpack(save_path + save_name + save_ext, {
            'chunk_{0}'.format(i): chunk
            for i, chunk in enumerate(np.array_split(XY, num_chunks))
        })
    elif serialize_flavor == 'feather':
        save_ext = '.fth'
        num_chunks = None
        XY.to_feather(save_path + save_name + save_ext)
    else:
        raise ValueError("{0} is not a supported serialize method.".format(serialize_flavor))
    return save_name, save_path, save_ext, num_chunks
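# Hedged companion sketch: loading the temporary data written by
# _prepare_for_parallel back into one DataFrame. Reading a msgpack file that
# stores a dict of chunks should return that dict, so the chunks are simply
# concatenated; the function name _reload_for_parallel is an assumption, not
# part of the original code.
def _reload_for_parallel(save_path, save_name, save_ext,
                         serialize_flavor='feather'):
    full_path = save_path + save_name + save_ext
    if serialize_flavor == 'msgpack':
        chunks = pd.read_msgpack(full_path)   # dict of 'chunk_i' -> DataFrame
        return pd.concat(chunks.values())
    elif serialize_flavor == 'feather':
        return pd.read_feather(full_path)
    raise ValueError("{0} is not a supported serialize method.".format(serialize_flavor))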
def doSaveProject(self, filename):
    """Save sheets as dict in msgpack"""

    data = {}
    for i in self.sheets:
        table = self.sheets[i]
        data[i] = {}
        data[i]['table'] = table.model.df
        data[i]['meta'] = self.saveMeta(table)

    #try:
    pd.to_msgpack(filename, data, encoding='utf-8')
    #except:
    #    print('SAVE FAILED!!!')
    return
def segment_centroids(centr_df, segm_n, centr_segm_path):
    first_peak_df = centr_df[centr_df.peak_i == 0].copy()
    segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n)]
    segm_lower_bounds = list(
        np.quantile(first_peak_df.mz, q) for q in segm_bounds_q)
    segment_mapping = np.searchsorted(
        segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
    first_peak_df['segm_i'] = segment_mapping
    centr_segm_df = pd.merge(centr_df,
                             first_peak_df[['formula_i', 'segm_i']],
                             on='formula_i').sort_values('mz')
    for segm_i, df in centr_segm_df.groupby('segm_i'):
        pd.to_msgpack(f'{centr_segm_path}/centr_segm_{segm_i:04}.msgpack', df)
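# Hedged sketch (not in the original source): re-assembling the centroid
# segments written by segment_centroids. The glob pattern mirrors the
# f-string above, and sorting the paths restores segment order thanks to the
# zero-padded segm_i; the function name is assumed.
def read_centroid_segments(centr_segm_path):
    import glob
    paths = sorted(glob.glob(f'{centr_segm_path}/centr_segm_*.msgpack'))
    return pd.concat([pd.read_msgpack(p) for p in paths])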
def copy_periodicity(project_id):
    remote = redis.StrictRedis('172.22.54.5')
    local = redis.StrictRedis('172.22.24.88')
    import pdb; pdb.set_trace()
    key = "{}_periodicity_heatmap".format(project_id)
    dt = pd.read_msgpack(remote.get(key))
    # pass None as the path so to_msgpack returns the packed bytes
    local.set(key, pd.to_msgpack(None, dt))
def fastmsgpack_dumps(data, default=json_dumps):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=default,
        encoding='latin1',
    )
def fastmsgpack_dumps(data):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=json_dumps,
        encoding='latin1',
    )
def loadProject(self, filename=None, asksave=False):
    """Open project file"""

    w = True
    if asksave == True:
        w = self.closeProject()
    if w == None:
        return

    if filename == None:
        filename = filedialog.askopenfilename(
            defaultextension='.dexpl',
            initialdir=self.defaultsavedir,
            filetypes=[("project", "*.dexpl"), ("All files", "*.*")],
            parent=self.main)
    if not filename:
        return
    if not os.path.exists(filename):
        print('no such file')
        self.removeRecent(filename)
        return
    ext = os.path.splitext(filename)[1]
    if ext != '.dexpl':
        print('does not appear to be a project file')
        return
    if os.path.isfile(filename):
        #pb = self.progressDialog()
        #t = threading.Thread()
        #t.__init__(target=pd.read_msgpack, args=(filename))
        #t.start()
        data = pd.read_msgpack(filename)
        #create backup file before we change anything
        backupfile = filename + '.bak'
        pd.to_msgpack(backupfile, data, encoding='utf-8')
    else:
        print('no such file')
        self.quit()
        return
    self.newProject(data)
    self.filename = filename
    self.main.title('%s - DataExplore' % filename)
    self.projopen = True
    self.defaultsavedir = os.path.dirname(os.path.abspath(filename))
    self.addRecent(filename)
    return
def predictSequences(self, data, seqkey='peptide', length=11,
                     alleles=['HLA-DRB1*0101'], save=False):
    results = []
    for i, row in data.iterrows():
        seq = row[seqkey]
        if len(seq) <= length:
            continue
        #print i,seq
        res = []
        for a in alleles:
            df = self.predict(sequence=seq, length=length,
                              allele=a, name=i)
            res.append(df)
        res = pd.concat(res)
        results.append(res)
        if save == True:
            pd.to_msgpack('predictions_%s.mpk' % self.name, res, append=True)
    self.data = pd.concat(results)
    return results
def predictProteins(self, recs, names=None, save=False, label='', path='',
                    **kwargs):
    """Get predictions for a set of proteins - no alleles so we override
       the base method for this too. """

    recs = sequtils.getCDS(recs)
    if names != None:
        recs = recs[recs.locus_tag.isin(names)]
    proteins = list(recs.iterrows())
    for i, row in proteins:
        seq = row['translation']
        name = row['locus_tag']
        #print name
        res = self.predict(sequence=seq, name=name)
        if save == True:
            fname = os.path.join(path, name + '.mpk')
            pd.to_msgpack(fname, res)
    return
def fastmsgpack_dumps(data, default=json_dumps):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=default,
        encoding='utf-8',
        use_bin_type=True,
    )
def fastmsgpack_data_dumps(data):
    return {
        '__!bytes': pd.to_msgpack(
            None,
            data,
            compress=compress,
            default=fastmsgpack_default,
            encoding='latin1',
        ),
    }
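# Hedged counterpart sketch for the fastmsgpack_*_dumps helpers: the bytes
# produced by pd.to_msgpack(None, ...) can be handed straight back to
# pd.read_msgpack. This only illustrates the round trip; the object_hook
# handling of the real loads functions is not reproduced here, and the
# function name is an assumption.
def fastmsgpack_loads_sketch(packed_bytes):
    return pd.read_msgpack(packed_bytes, encoding='latin1')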
def load_all(use_cache=True, override=False):
    """
    Read in compressed files and cache them as pandas-readable files locally.
    Needs to be run on the first ever run.
    :param use_cache:
    :param override: Override directory check and run this function
    :return:
    """
    # Check if files already exist in CACHE_DIR and warn user
    if not os.path.isdir(CACHE_DIR):
        os.mkdir(CACHE_DIR)
    if len(os.listdir(CACHE_DIR)) > 1 and not override:
        print("Some files already exist in your CACHE_DIR. If you still want "
              "to run this function, run with override=True")
        return
    i = 1
    for file in os.listdir(DATA_DIR):
        print(f'Processing {i} of 30 files')
        file_path = os.path.join(DATA_DIR, file)
        tar = tarfile.open(file_path, "r:gz")
        for member in tar.getmembers():
            cache_path = os.path.join(CACHE_DIR, f'{member.name}.msgpack')
            print(member.name)
            if member.name.startswith('gps'):
                col_names = [
                    'driver_id', 'order_id', 'timestamp', 'longitude',
                    'latitude'
                ]
            else:
                col_names = [
                    'order_id', 'ride_start_timestamp', 'ride_stop_timestamp',
                    'pickup_longitude', 'pickup_latitude',
                    'dropoff_longitude', 'dropoff_latitude'
                ]
            f = tar.extractfile(member)
            if f is not None:
                df = pd.read_csv(f, header=None, names=col_names)
                pd.to_msgpack(cache_path, df)
        i += 1
def fastmsgpack_data_dumps(data):
    return {
        '__!bytes': pd.to_msgpack(
            None,
            data,
            compress=compress,
            default=fastmsgpack_default,
            encoding='utf-8',
            use_bin_type=True,
        ),
    }
def groupby_2_sum(orders, use_cache=True):
    """
    Grouping to get sum of ride durations of a driver in each time bin
    :param orders: Orders dataframe with time bin columns
    :param use_cache: Use previous cache or not
    :return: Grouped pandas dataframe
    """
    cache_path = os.path.join(CACHE_DIR, f'groupby2.msgpack')
    if use_cache and os.path.exists(cache_path):
        temp2 = pd.read_msgpack(cache_path)
        print(f'Loading from {cache_path}')
    else:
        grouped_tmp_perc_active = orders[[
            'driver_id', 'ride_start_timestamp_bin', 'ride_duration'
        ]].groupby(['driver_id', 'ride_start_timestamp_bin'
                    ])[['ride_duration']].sum() / orders[[
                        'driver_id', 'ride_start_timestamp_bin',
                        'ride_duration'
                    ]].groupby(['driver_id'])[['ride_duration']].sum()
        temp2 = unstack_func(grouped_tmp_perc_active)
        pd.to_msgpack(cache_path, temp2)
        print(f'Dumping to {cache_path}')
    return temp2
def get_spatial_features(df, grid_x_num=10, grid_y_num=10, use_cache=True):
    cache_path = os.path.join(CACHE_DIR, f'spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        pickup_coord = utm.from_latlon(df['pickup_latitude'].values,
                                       df['pickup_longitude'].values)
        col1, col2 = pickup_coord[0], pickup_coord[1]
        df['xpickup'] = col1
        df['ypickup'] = col2

        dropoff_coord = utm.from_latlon(df['dropoff_latitude'].values,
                                        df['dropoff_longitude'].values)
        col3, col4 = dropoff_coord[0], dropoff_coord[1]
        df['xdropoff'] = col3
        df['ydropoff'] = col4

        tempx = pd.cut(df['xpickup'], bins=grid_x_num).astype(str)
        tempy = pd.cut(df['ypickup'], bins=grid_y_num).astype(str)
        df['pick_up_zone'] = tempx + tempy

        tempx = pd.cut(df['xdropoff'], bins=grid_x_num).astype(str)
        tempy = pd.cut(df['ydropoff'], bins=grid_y_num).astype(str)
        df['drop_off_zone'] = tempx + tempy

        grouped_tmp = df[[
            'driver_id', 'pick_up_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pick_up_zone']).count() / df[[
            'driver_id', 'pick_up_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()
        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')
    return temp
def get_spatial_features_radial(df, grid_x_num=10, grid_y_num=10,
                                use_cache=True):
    cache_path = os.path.join(CACHE_DIR, f'radial_spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        cols = ['r_radial', 'theta_radial']
        create_radial_bins(df, cols)
        grouped_tmp = df[[
            'driver_id', 'pick_up_radial_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pick_up_radial_zone']).count() / df[[
            'driver_id', 'pick_up_radial_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()
        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')
    return temp
def save(self, label, singlefile=True):
    """Save all current predictions dataframe with some metadata"""

    if singlefile == True:
        fname = 'epit_%s_%s_%s.mpk' % (label, self.name, self.length)
        print('saving as %s' % fname)
        meta = {'method': self.name, 'length': self.length}
        pd.to_msgpack(fname, meta)
        for i, g in self.data.groupby('name'):
            pd.to_msgpack(fname, g, append=True)
    else:
        #save one file per protein/name
        path = os.path.join(label, self.name)
        print('saving to %s' % path)
        if not os.path.exists(path):
            os.makedirs(path)
        for name, df in self.data.groupby('name'):
            outfile = os.path.join(path, name + '.mpk')
            pd.to_msgpack(outfile, df)
    return
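# Hedged sketch of the reading side for the append=True layout used by save()
# and save_msgpack() above: a file holding several appended objects should
# come back from pd.read_msgpack as a list, with the metadata dict first and
# one DataFrame per protein name after it. The function name is an assumption.
def load_appended_predictions(fname):
    objects = pd.read_msgpack(fname)      # [meta, df_name1, df_name2, ...]
    meta, frames = objects[0], objects[1:]
    data = pd.concat(frames)
    return meta, data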
def location_and_time_data(file_name='location_time', use_cache=True):
    """
    Returns location and time discounted rating
    """
    cache_path = os.path.join(CACHE_PATH, f'location_and_time_data.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        detail_df = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        # To get review json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "review.json"),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates = [], [], [], []
        with open(os.path.join(EXCEL_PATH, "review.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "date": dates
        })

        line_count = len(
            open(os.path.join(EXCEL_PATH, "business.json"),
                 encoding='utf8').readlines())
        name, business_id, address, city, state, postal_code = [], [], [], [], [], []
        latitude, longitude, stars, review_count = [], [], [], []
        is_open, attributes, GoodForKids, categories, hours = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, "business.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                business_id += [blob["business_id"]]
                address += [blob["address"]]
                city += [blob["city"]]
                state += [blob["state"]]
                postal_code += [blob["postal_code"]]
                latitude += [blob["latitude"]]
                longitude += [blob["longitude"]]
                stars += [blob["stars"]]
                review_count += [blob["review_count"]]
                is_open += [blob["is_open"]]
        business = pd.DataFrame({
            "name": name,
            "business_id": business_id,
            "address": address,
            "city": city,
            "state": state,
            "postal_code": postal_code,
            "latitude": latitude,
            "longitude": longitude,
            'stars': stars,
            'review_count': review_count,
            'is_open': is_open
        })

        detail_df = pd.merge(left=ratings, right=business, on='business_id',
                             how='left')
        mean_lat = detail_df.groupby(
            'user_id')['latitude'].mean().reset_index()
        mean_long = detail_df.groupby(
            'user_id')['longitude'].mean().reset_index()
        mean_df = pd.merge(mean_lat, mean_long, on='user_id')
        mean_df.columns = ['user_id', 'mean_lat', 'mean_long']
        detail_df = pd.merge(detail_df, mean_df, on='user_id', how='left')
        detail_df['distance'] = (
            (detail_df['mean_lat'] - detail_df['latitude'])**2) + (
                (detail_df['mean_long'] - detail_df['longitude'])**2)

        # For date distances
        detail_df['date'] = pd.to_datetime(detail_df['date'])
        last_date = detail_df.groupby('user_id')['date'].max().reset_index()
        last_date.columns = ['user_id', 'last_date']
        detail_df = pd.merge(detail_df, last_date, on='user_id', how='left')
        # Months instead of days
        detail_df['date_diff'] = (detail_df['last_date'] - detail_df['date'])
        detail_df['date_diff'] = detail_df['date_diff'].dt.days / 30

        # e(1/1+dist)/e
        # e(1/ log(date))/e
        detail_df['dist_scale'] = np.exp(
            1 / (1 + detail_df['distance'])) / (np.exp(1))
        detail_df['date_scale'] = np.exp(
            1 / (1 + np.log(detail_df['date_diff'] + 1))) / np.exp(1)

        # Multiplying rating scale with the dist
        detail_df[
            'date_rating'] = detail_df['rating'] * detail_df['date_scale']
        detail_df[
            'dist_rating'] = detail_df['rating'] * detail_df['dist_scale']
        detail_df['date_dist_rating'] = detail_df['date_scale'] * detail_df[
            'dist_scale'] * detail_df['rating']

        pd.to_msgpack(cache_path, (detail_df))
        print(f'Dumping to {cache_path}')
    return detail_df
def all_data(file_name='all_data', use_cache=True):
    """
    Returns business and user meta data with the ratings
    """
    cache_path = os.path.join(CACHE_PATH, f'all_data.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        temp = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        # To get review json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "review.json"),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates = [], [], [], []
        with open(os.path.join(EXCEL_PATH, "review.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "date": dates
        })

        line_count = len(
            open(os.path.join(EXCEL_PATH, "business.json"),
                 encoding='utf8').readlines())
        name, business_id, address, city, state, postal_code = [], [], [], [], [], []
        latitude, longitude, stars, review_count = [], [], [], []
        is_open, attributes, GoodForKids, categories, hours = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, "business.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                business_id += [blob["business_id"]]
                address += [blob["address"]]
                city += [blob["city"]]
                state += [blob["state"]]
                postal_code += [blob["postal_code"]]
                latitude += [blob["latitude"]]
                longitude += [blob["longitude"]]
                stars += [blob["stars"]]
                review_count += [blob["review_count"]]
                is_open += [blob["is_open"]]
        business = pd.DataFrame({
            "name": name,
            "business_id": business_id,
            "address": address,
            "city": city,
            "state": state,
            "postal_code": postal_code,
            "latitude": latitude,
            "longitude": longitude,
            'stars': stars,
            'review_count': review_count,
            'is_open': is_open
        })

        # To get user json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "user.json"),
                 encoding='utf8').readlines())
        name, user_id, review_count, yelping_since, useful = [], [], [], [], []
        funny, cool, elite, fans = [], [], [], []
        average_stars, compliment_hot, compliment_more, compliment_profile = [], [], [], []
        compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool = [], [], [], [], []
        compliment_funny, compliment_writer, compliment_photos = [], [], []
        with open(os.path.join(EXCEL_PATH, "user.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                user_id += [blob["user_id"]]
                review_count += [blob["review_count"]]
                yelping_since += [blob["yelping_since"]]
                useful += [blob["useful"]]
                funny += [blob["funny"]]
                cool += [blob["cool"]]
                elite += [blob["elite"]]
                fans += [blob["fans"]]
                average_stars += [blob["average_stars"]]
                compliment_hot += [blob["compliment_hot"]]
                compliment_more += [blob["compliment_more"]]
                compliment_profile += [blob["compliment_profile"]]
                compliment_cute += [blob["compliment_cute"]]
                compliment_list += [blob["compliment_list"]]
                compliment_note += [blob["compliment_note"]]
                compliment_plain += [blob["compliment_plain"]]
                compliment_cool += [blob["compliment_cool"]]
                compliment_funny += [blob["compliment_funny"]]
                compliment_writer += [blob["compliment_writer"]]
                compliment_photos += [blob["compliment_photos"]]
        user = pd.DataFrame({
            "name": name,
            "user_id": user_id,
            "review_count": review_count,
            "yelping_since": yelping_since,
            "useful": useful,
            "funny": funny,
            "cool": cool,
            "elite": elite,
            "fans": fans,
            "average_stars": average_stars,
            "compliment_hot": compliment_hot,
            "compliment_more": compliment_more,
            "compliment_profile": compliment_profile,
            "compliment_cute": compliment_cute,
            "compliment_list": compliment_list,
            "compliment_note": compliment_note,
            "compliment_plain": compliment_plain,
            "compliment_cool": compliment_cool,
            "compliment_funny": compliment_funny,
            "compliment_writer": compliment_writer,
            "compliment_photos": compliment_photos
        })

        # To get tip json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "tip.json"),
                 encoding='utf8').readlines())
        business_id, user_id, text, date, compliment_count = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, "tip.json"), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                business_id += [blob["business_id"]]
                user_id += [blob["user_id"]]
                text += [blob["text"]]
                date += [blob["date"]]
                compliment_count += [blob["compliment_count"]]
        tip = pd.DataFrame({
            "business_id": business_id,
            "user_id": user_id,
            "text": text,
            "date": date,
            "compliment_count": compliment_count
        })

        temp = pd.merge(ratings, business, on='business_id', how='left')
        temp = pd.merge(temp, user, on='user_id', how='left')
        temp = pd.merge(temp, tip, on=['business_id', 'user_id', 'date'],
                        how='left')
        pd.to_msgpack(cache_path, (temp))
        print(f'Dumping to {cache_path}')
    return temp