Example #1
def msgpack_assertMeta(filename, frames=None, redo=False):
    '''Ensure that the .meta file for a given .msg file exists and return its contents, creating it from the .msg data if necessary.'''
    meta_out_file = filename.replace(".msg", ".meta")
    print(meta_out_file)
    meta_frames = None
    if (os.path.exists(meta_out_file) and not redo):
        #Need to check for latin encodings due to weird pandas default
        try:
            meta_frames = pd.read_msgpack(meta_out_file)
        except UnicodeDecodeError as e:
            meta_frames = pd.read_msgpack(meta_out_file, encoding='latin-1')
    if meta_frames is None:
        if frames is None:
            print(
                "Bulk reading .msg for metaData assertion. Be patient, reading in slices not supported."
            )
            print(filename)
            #Need to check for latin encodings due to weird pandas default
            try:
                frames = pd.read_msgpack(filename)
            except UnicodeDecodeError as e:
                frames = pd.read_msgpack(filename, encoding='latin-1')
        meta_frames = {"NumValues": frames["NumValues"]}

    if (not os.path.exists(meta_out_file) or redo):
        pd.to_msgpack(meta_out_file, meta_frames)

    return meta_frames
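A minimal usage sketch for the helper above; the file name is hypothetical, and pd.read_msgpack/pd.to_msgpack only exist in pandas versions older than 1.0:

# Hypothetical .msg file written earlier with pd.to_msgpack
meta = msgpack_assertMeta("events_000.msg")
# The companion events_000.meta file now exists and caches the NumValues frame
print(meta["NumValues"].head())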
Example #2
def merge_order_df(start='2016-11-01', end='2016-11-30',
                   use_cache=True, remove_pool=False):
    """
    Concatenate order dataframes for given dates
    """
    if remove_pool:
        cache_path = os.path.join(CACHE_DIR, f'merged_orders_no_pool.msgpack')
    else:
        cache_path = os.path.join(CACHE_DIR, f'merged_orders.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        orders = pd.read_msgpack(cache_path)
    else:

        df_new_list = []

        date_str_list = get_date_list(start=start, end=end)

        for date in date_str_list:
            order = read_data('order', date=date, sample=1)
            df_new_list += [order.copy()]

        orders = pd.concat(df_new_list, sort=False)
        ##################################
        # Removing orders where the ride duration is greater than 180 minutes
        orders = orders[orders.ride_duration <= 180]
        orders.sort_values(['driver_id', 'ride_start_timestamp'], inplace=True)
        ##################################
        pd.to_msgpack(cache_path, orders)
        print(f'Dumping to {cache_path}')
    return orders
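A short usage sketch, assuming CACHE_DIR, get_date_list and read_data are available from the surrounding module as in the example above:

# The first call builds and caches the merged frame; later calls read the msgpack back
orders = merge_order_df(start='2016-11-01', end='2016-11-30', use_cache=True)
print(orders[['driver_id', 'ride_duration']].head())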
Example #3
    def predictProteins(self, recs, length=11, names=None,
                         alleles=[], save=False, label='', path=''):
        """Get predictions for a set of proteins and/or over multiple alleles
          recs: a pandas DataFrame with cds data
          returns a dataframe of predictions over multiple proteins"""

        if not isinstance(alleles, list):
            alleles = [alleles]
        self.length = length
        recs = sequtils.getCDS(recs)
        if names is not None:
            recs = recs[recs.locus_tag.isin(names)]
        proteins = list(recs.iterrows())
        results=[]
        for i,row in proteins:
            st=time.time()
            seq = row['translation']
            name = row['locus_tag']
            #print name
            res = []
            for a in alleles:
                #print a
                df = self.predict(sequence=seq,length=length,
                                    allele=a,name=name)
                if df is not None:
                    res.append(df)
            res = pd.concat(res)
            if save:
                fname = os.path.join(path, name+'.mpk')
                pd.to_msgpack(fname, res)
        print('predictions done for %s proteins' % len(proteins))
        return
Example #4
def load_pandas(file_name='review.json', use_cache=True):
    cache_path = os.path.join(CACHE_PATH, f'load_pandas.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        ratings, user_counts, active_users = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        line_count = len(
            open(os.path.join(EXCEL_PATH, file_name),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates, text = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, file_name), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
                text += [blob["text"]]

        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "text": text,
            "date": dates
        })
        user_counts = ratings["user_id"].value_counts()
        active_users = user_counts.loc[user_counts >= 5].index.tolist()

        pd.to_msgpack(cache_path, (ratings, user_counts, active_users))
        print(f'Dumping to {cache_path}')
    return ratings, user_counts, active_users
Example #5
    def loadProject(self, filename=None, asksave=False):
        """Open project file"""

        w = True
        if asksave:
            w = self.closeProject()
        if w is None:
            return
        if filename is None:
            filename = filedialog.askopenfilename(defaultextension='.dexpl',
                                                    initialdir=os.getcwd(),
                                                    filetypes=[("project","*.dexpl"),
                                                               ("All files","*.*")],
                                                    parent=self.main)
        if not filename:
            return
        if os.path.isfile(filename):
            #pb = self.progressDialog()
            #t = threading.Thread()
            #t.__init__(target=pd.read_msgpack, args=(filename))
            #t.start()
            data = pd.read_msgpack(filename)
            #create backup file before we change anything
            backupfile = filename+'.bak'
            pd.to_msgpack(backupfile, data, encoding='utf-8')
        else:
            print ('no such file')
            data=None
        self.newProject(data)
        self.filename = filename
        self.main.title('%s - DataExplore' %filename)
        self.projopen = True
        return
Example #6
def groupby_1_count(orders, use_cache=True, use_radial=False):
    """
    Grouping to get number of rides of a driver in each time bin
    :param orders: Orders dataframe with time bin columns
    :param use_cache: Use previous cache or not
    :return: Grouped pandas dataframe
    """
    if use_radial:
        group_col = 'pick_up_radial_zone'
    else:
        group_col = 'ride_start_timestamp_bin'

    cache_path = os.path.join(CACHE_DIR, f'groupby1.msgpack')
    if use_cache and os.path.exists(cache_path):
        temp1 = pd.read_msgpack(cache_path)
        print(f'Loading from {cache_path}')
    else:
        grouped_tmp = orders[[
            'driver_id', 'ride_start_timestamp_bin', 'order_id'
        ]].groupby(['driver_id', 'ride_start_timestamp_bin'
                    ]).count() / orders[[
                        'driver_id', 'ride_start_timestamp_bin', 'order_id'
                    ]].groupby(['driver_id'])[['order_id']].count()
        temp1 = unstack_func(grouped_tmp)
        pd.to_msgpack(cache_path, temp1)
        print(f'Dumping to {cache_path}')
    return temp1
Example #7
def get_spatial_features_hex(df, resolution=6, use_cache=True):

    print('Now creating spatial features')
    cache_path = os.path.join(CACHE_DIR, f'hex_spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        minlat = min(df.pickup_latitude)
        minlong = min(df.pickup_longitude)
        maxlat = max(df.pickup_latitude)
        maxlong = max(df.pickup_longitude)
        geoJson = {
            'type':
            'Polygon',
            'coordinates': [[[minlat, minlong], [minlat, maxlong],
                             [maxlat, maxlong], [maxlat, minlong]]]
        }

        hexagons = list(h3.polyfill(geoJson, resolution))

        xy_pickup = utm.from_latlon(df.pickup_latitude.values,
                                    df.pickup_longitude.values)
        x_pickup = list(xy_pickup[0])
        y_pickup = list(xy_pickup[1])
        pickup_point = list(zip(x_pickup, y_pickup))

        poly_hex = dict()
        for i, hex in enumerate(hexagons):
            polygons = h3.h3_set_to_multi_polygon([hex], geo_json=False)
            a = np.array(polygons[0][0])
            b = utm.from_latlon(a[:, 0], a[:, 1])
            poly_hex[i] = list(zip(b[0], b[1]))

        pick_zone = np.zeros(len(df)) - 1
        for j, p in enumerate(pickup_point):
            point = Point(p)
            for i in range(len(poly_hex)):
                polygon = Polygon(poly_hex[i])
                if polygon.contains(point):
                    pick_zone[j] = int(i)
                    break

        df['pickup_zone'] = pick_zone

        grouped_tmp = df[[
            'driver_id', 'pickup_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pickup_zone']).count() / df[[
            'driver_id', 'pickup_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()

        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')

    return temp
Example #8
def store(filepath, outputdir, rerun=False, storeType="hdf5"):
    filename = os.path.splitext(ntpath.basename(filepath))[0]
    if (storeType == "hdf5"):
        out_file = outputdir + filename + ".h5"
        print(out_file)
        store = pd.HDFStore(out_file)
        keys = store.keys()
        #print("KEYS:", set(keys))
        #print("KEYS:", set(["/"+key for key in OBJECT_TYPES+["NumValues"]]))
        #print("KEYS:", set(keys)==set(["/"+key for key in OBJECT_TYPES+["NumValues"]]))
        if (set(keys) != set(
            ["/" + key for key in OBJECT_TYPES + ["NumValues"]]) or rerun):
            #print("OUT",out_file)
            try:
                frames = delphes_to_pandas(filepath)
            except Exception as e:
                print(e)
                print("Failed to parse file %r. File may be corrupted." % f)
                return 0
            try:
                for key, frame in frames.items():
                    store.put(key, frame, format='table')
            except Exception as e:
                print(e)
                print("Failed to write to HDFStore %r" % out_file)
                return 0
        num = len(store.get('NumValues').index)
        store.close()
    elif (storeType == "msgpack"):
        out_file = outputdir + filename + ".msg"
        # meta_out_file = outputdir + filename + ".meta"
        print(out_file)
        if (not os.path.exists(out_file) or rerun):
            try:
                frames = delphes_to_pandas(filepath)
            except Exception as e:
                print(e)
                print("Failed to parse file %r. File may be corrupted." % f)
                return 0
            try:
                pd.to_msgpack(out_file, frames)
            except Exception as e:
                print(e)
                print("Failed to write msgpack %r" % out_file)
                return 0
            # pd.to_msgpack(meta_out_file, meta_frames)
            meta_frames = msgpack_assertMeta(out_file, frames)
        else:
            meta_frames = msgpack_assertMeta(out_file)

        num = len(meta_frames["NumValues"].index)
        # elif(not os.path.exists(meta_out_file)):
        #     print(".meta file missing creating %r" % meta_out_file)
        #     frames = pd.read_msgpack(out_file)
        #     meta_frames = {"NumValues" : frames["NumValues"]}
        #     pd.to_msgpack(meta_out_file, meta_frames)
    else:
        raise ValueError("storeType %r not recognized" % storeType)
    return num
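A hedged usage sketch for the msgpack branch; the input path is a made-up example and delphes_to_pandas is assumed to be importable from the same module:

# outputdir is concatenated directly with the file name, so keep the trailing slash
num_rows = store("samples/ttbar_000.root", "cache/", storeType="msgpack")
print("NumValues rows stored:", num_rows)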
Example #9
    def doSaveProject(self, filename):
        """Save sheets as dict in msgpack"""

        data={}
        for i in self.sheets:
            data[i] = self.sheets[i].model.df

        pd.to_msgpack(filename, data, encoding='utf-8')
        return
Example #10
def create_features(start='2016-11-01', end='2016-11-30', use_cache=True, save_file=True):
    """
    Creates all features going into the linear model
    :param start: Start date of rides
    :param end: End date of rides
    :param use_cache: Use existing cache
    :return: Returns df with all features
    """

    cache_path = os.path.join(CACHE_DIR, f'features_orders.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        df_final = pd.read_msgpack(cache_path)
    else:
        orders = merge_order_df(start, end, use_cache=True)
        pool_rides(orders)
        get_start_end_bins(orders,
                           ['ride_start_timestamp', 'ride_stop_timestamp'])

        #        breakpoint()

        print('a')
        import time
        a = time.time()
        temp1 = groupby_1_count(orders, use_cache=True)

        temp2 = groupby_2_sum(orders, use_cache=True)

        print(time.time() - a)

        df_new = orders.groupby(['driver_id']).agg({
            'order_id': 'count',
            'is_pool': 'sum'
        }).reset_index()

        print('c')
        df_new.rename(
            columns={
                'order_id': 'num_total_rides',
                'is_pool': 'num_pool_rides'
            },
            inplace=True)

        df_new['% of pool rides'] = (
            df_new['num_pool_rides'] / df_new['num_total_rides'])
        print('d')

        print(f'Dumping to {cache_path}')

        df_final = pd.merge(df_new, temp1, on=['driver_id'], how='inner')

        # TODO check

        df_final = pd.merge(df_final, temp2, on=['driver_id'], how='inner', suffixes=('_count', '_sum'))
        if save_file:
            pd.to_msgpack(cache_path, df_final)
    return df_final
Example #11
    def save_msgpack(self, filename=None, label=''):
        """Save as msgpack format - experimental"""

        if filename is None:
            filename = 'epit_%s_%s_%s.msg' % (label, self.name, self.length)
        print('saving as %s' % filename)
        meta = {'method':self.name, 'length':self.length}
        pd.to_msgpack(filename, meta)
        for i,g in self.data.groupby('name'):
            pd.to_msgpack(filename, g, append=True)
        return
Example #12
def write_legacy_msgpack(output_dir):

    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: msgpack")

    pth = '{0}.msgpack'.format(platform_name())
    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data())

    print("created msgpack file: %s" % pth)
Example #13
    def doSaveProject(self, filename):
        """Save sheets as dict in msgpack"""

        self._checkTables()
        data = {}
        for i in self.sheets:
            table = self.sheets[i]
            data[i] = {}
            data[i]['table'] = table.model.df
            data[i]['meta'] = self.saveMeta(table)

        pd.to_msgpack(filename, data, encoding='utf-8')
        return
Example #14
def write_legacy_msgpack(output_dir):

    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: msgpack")

    pth = '{0}.msgpack'.format(platform_name())
    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data())

    print("created msgpack file: %s" % pth)
Example #15
def _prepare_for_parallel(XY, serialize_flavor='feather'):
    """
    Utility function to setup the data for parallel processing with low
    memory overhead.

    Parameters
    ----------
    XY : pandas dataframe
        The combined independent variables/features and reponse/target
        variable.

    Returns
    -------
    save_name : str
        The name of the temporary data.
    save_path : str
        The name of the temporary file path.
    save_ext : str
        The name of the temporary file extension.
    num_chunks : int
        The number of chunks inside pandas msgpck format (determined by 
        dataframe size).
    serialize_flavor : str
        Which mode of downsaving data to use, currently supports 'feather' and
        'msgpack'. Unfortunately, as of pandas 0.25.0, msgpack is no longer 
        supported.
        
        Note: using feather requires pyarrow

    """
        
    save_path = 'forward_tmp'
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    save_name = '/forward_run_' + str(int(mktime(datetime.now().timetuple())))
    save_ext = ''
    
    if serialize_flavor == 'msgpack':
        save_ext = '.msg'
        num_chunks = int(((XY.memory_usage(index=True).sum()/(1024**3))/2) + 1)
        pd.to_msgpack(save_path + save_name + save_ext, {
            'chunk_{0}'.format(i):chunk for i, chunk in enumerate(np.array_split(XY, num_chunks))
            })
    elif serialize_flavor == 'feather':
        save_ext = '.fth'
        num_chunks = None
        XY.to_feather(save_path + save_name + save_ext)
    else:
        raise ValueError("{0} is not a supported serialize method.".format(serialize_flavor))
    
    return save_name, save_path, save_ext, num_chunks
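For the msgpack branch, reading the temporary file back might look like the sketch below; _load_chunks is a hypothetical helper, and it assumes a pandas older than 1.0 where read_msgpack still exists and returns the same dict of chunks that was written:

import pandas as pd

def _load_chunks(save_path, save_name, save_ext, num_chunks):
    # The file holds a dict of 'chunk_i' DataFrames; stitch them back together
    chunks = pd.read_msgpack(save_path + save_name + save_ext)
    return pd.concat(chunks['chunk_{0}'.format(i)] for i in range(num_chunks))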
Example #16
    def doSaveProject(self, filename):
        """Save sheets as dict in msgpack"""

        data={}
        for i in self.sheets:
            table = self.sheets[i]
            data[i] = {}
            data[i]['table'] = table.model.df
            data[i]['meta'] = self.saveMeta(table)
        #try:
        pd.to_msgpack(filename, data, encoding='utf-8')
        #except:
        #    print('SAVE FAILED!!!')
        return
Example #17
    def doSaveProject(self, filename):
        """Save sheets as dict in msgpack"""

        data={}
        for i in self.sheets:
            table = self.sheets[i]
            data[i] = {}
            data[i]['table'] = table.model.df
            data[i]['meta'] = self.saveMeta(table)
        #try:
        pd.to_msgpack(filename, data, encoding='utf-8')
        #except:
        #    print('SAVE FAILED!!!')
        return
Example #18
def segment_centroids(centr_df, segm_n, centr_segm_path):
    first_peak_df = centr_df[centr_df.peak_i == 0].copy()
    segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n)]
    segm_lower_bounds = list(
        np.quantile(first_peak_df.mz, q) for q in segm_bounds_q)

    segment_mapping = np.searchsorted(
        segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
    first_peak_df['segm_i'] = segment_mapping

    centr_segm_df = pd.merge(centr_df,
                             first_peak_df[['formula_i', 'segm_i']],
                             on='formula_i').sort_values('mz')
    for segm_i, df in centr_segm_df.groupby('segm_i'):
        pd.to_msgpack(f'{centr_segm_path}/centr_segm_{segm_i:04}.msgpack', df)
Example #19
def copy_periodicity(project_id):
    remote = redis.StrictRedis('172.22.54.5')
    local = redis.StrictRedis('172.22.24.88')
    import pdb; pdb.set_trace()
    key = "{}_periodicity_heatmap".format(project_id)
    dt = pd.read_msgpack(remote.get(key))
    local.set(key, pd.to_msgpack(dt))
Example #20
def fastmsgpack_dumps(data, default=json_dumps):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=default,
        encoding='latin1',
    )
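Passing None as the first argument makes to_msgpack return the packed bytes instead of writing a file, so the result can be sent over the wire and unpacked again. A round-trip sketch, assuming a pandas older than 1.0 (read_msgpack was deprecated in 0.25 and later removed):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
# blob is a bytes object; read_msgpack accepts it directly in older pandas
blob = pd.to_msgpack(None, df, compress='zlib', encoding='latin1')
restored = pd.read_msgpack(blob, encoding='latin1')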
Example #21
def fastmsgpack_dumps(data):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=json_dumps,
        encoding='latin1',
    )
Example #22
    def loadProject(self, filename=None, asksave=False):
        """Open project file"""

        w = True
        if asksave:
            w = self.closeProject()
        if w is None:
            return

        if filename is None:
            filename = filedialog.askopenfilename(
                defaultextension='.dexpl',
                initialdir=self.defaultsavedir,
                filetypes=[("project", "*.dexpl"), ("All files", "*.*")],
                parent=self.main)
        if not filename:
            return
        if not os.path.exists(filename):
            print('no such file')
            self.removeRecent(filename)
            return
        ext = os.path.splitext(filename)[1]
        if ext != '.dexpl':
            print('does not appear to be a project file')
            return
        if os.path.isfile(filename):
            #pb = self.progressDialog()
            #t = threading.Thread()
            #t.__init__(target=pd.read_msgpack, args=(filename))
            #t.start()
            data = pd.read_msgpack(filename)
            #create backup file before we change anything
            backupfile = filename + '.bak'
            pd.to_msgpack(backupfile, data, encoding='utf-8')
        else:
            print('no such file')
            self.quit()
            return
        self.newProject(data)
        self.filename = filename
        self.main.title('%s - DataExplore' % filename)
        self.projopen = True
        self.defaultsavedir = os.path.dirname(os.path.abspath(filename))
        self.addRecent(filename)
        return
Example #23
    def predictSequences(self, data, seqkey='peptide', length=11,
                         alleles=['HLA-DRB1*0101'], save=False):
        results = []
        for i, row in data.iterrows():
            seq = row[seqkey]
            if len(seq) <= length:
                continue
            #print i,seq
            res = []
            for a in alleles:
                df = self.predict(sequence=seq, length=length,
                                  allele=a, name=i)
                res.append(df)
            res = pd.concat(res)
            results.append(res)
            if save:
                pd.to_msgpack('predictions_%s.mpk' % self.name, res,
                              append=True)
        self.data = pd.concat(results)
        return results
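Because append=True packs one object per call into the same file, reading the predictions back on an older pandas yields one DataFrame per saved sequence; with iterator=True they can be streamed. A sketch, with a hypothetical file name following the 'predictions_%s.mpk' pattern above:

# Iterate over the per-sequence result frames appended above
for frame in pd.read_msgpack('predictions_tepitope.mpk', iterator=True):
    print(frame.shape)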
Example #24
    def predictProteins(self, recs, names=None, save=False,
                        label='', path='', **kwargs):
        """Get predictions for a set of proteins - no alleles so we override
        the base method for this too. """

        recs = sequtils.getCDS(recs)
        if names is not None:
            recs = recs[recs.locus_tag.isin(names)]
        proteins = list(recs.iterrows())
        for i,row in proteins:
            seq = row['translation']
            name = row['locus_tag']
            #print name
            res = self.predict(sequence=seq,name=name)
            if save:
                fname = os.path.join(path, name+'.mpk')
                pd.to_msgpack(fname, res)
        return
Example #25
def fastmsgpack_dumps(data, default=json_dumps):
    return pd.to_msgpack(
        None,
        data,
        compress=compress,
        default=default,
        encoding='utf-8',
        use_bin_type=True,
    )
Example #26
def fastmsgpack_data_dumps(data):
    return {
        '__!bytes': pd.to_msgpack(
            None,
            data,
            compress=compress,
            default=fastmsgpack_default,
            encoding='latin1',
        ),
    }
Example #27
def fastmsgpack_data_dumps(data):
    return {
        '__!bytes': pd.to_msgpack(
            None,
            data,
            compress=compress,
            default=fastmsgpack_default,
            encoding='latin1',
        ),
    }
Example #28
def load_all(use_cache=True, override=False):
    """
    Read in the compressed files and cache them locally in pandas-readable files.
    Needs to be run once, on the very first run.
    :param use_cache:
    :param override: Override directory check and run this function
    :return:
    """
    # Check if files already exist in CACHE_DIR and warn user
    if not os.path.isdir(CACHE_DIR):
        os.mkdir(CACHE_DIR)

    if len(os.listdir(CACHE_DIR)) > 1 and not override:
        print(
            "Some files already exist in your CACHE_DIR. If you still want to run this function,\
              run with override=True")
        return

    i = 1
    for file in os.listdir(DATA_DIR):
        print(f'Processing {i} of 30 files')
        file_path = os.path.join(DATA_DIR, file)
        tar = tarfile.open(file_path, "r:gz")
        for member in tar.getmembers():
            cache_path = os.path.join(CACHE_DIR, f'{member.name}.msgpack')
            print(member.name)
            if member.name.startswith('gps'):
                col_names = [
                    'driver_id', 'order_id', 'timestamp', 'longitude',
                    'latitude'
                ]
            else:
                col_names = [
                    'order_id', 'ride_start_timestamp', 'ride_stop_timestamp',
                    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                    'dropoff_latitude'
                ]
            f = tar.extractfile(member)
            if f is not None:
                df = pd.read_csv(f, header=None, names=col_names)
                pd.to_msgpack(cache_path, df)
        i += 1
Example #29
def fastmsgpack_data_dumps(data):
    return {
        '__!bytes': pd.to_msgpack(
            None,
            data,
            compress=compress,
            default=fastmsgpack_default,
            encoding='utf-8',
            use_bin_type=True,
        ),
    }
Example #30
def groupby_2_sum(orders, use_cache=True):
    """
    Grouping to get sum of ride durations of a driver in each time bin
    :param orders: Orders dataframe with time bin columns
    :param use_cache: Use previous cache or not
    :return: Grouped pandas dataframe
    """
    cache_path = os.path.join(CACHE_DIR, f'groupby2.msgpack')
    if use_cache and os.path.exists(cache_path):
        temp2 = pd.read_msgpack(cache_path)
        print(f'Loading from {cache_path}')
    else:
        grouped_tmp_perc_active = orders[[
            'driver_id', 'ride_start_timestamp_bin', 'ride_duration'
        ]].groupby(['driver_id', 'ride_start_timestamp_bin'
                    ])[['ride_duration']].sum() / orders[[
                        'driver_id', 'ride_start_timestamp_bin', 'ride_duration'
                    ]].groupby(['driver_id'])[['ride_duration']].sum()
        temp2 = unstack_func(grouped_tmp_perc_active)
        pd.to_msgpack(cache_path, temp2)
        print(f'Dumping to {cache_path}')
    return temp2
Example #31
def get_spatial_features(df, grid_x_num=10, grid_y_num=10, use_cache=True):
    cache_path = os.path.join(CACHE_DIR, f'spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        pickup_coord = utm.from_latlon(df['pickup_latitude'].values,
                                       df['pickup_longitude'].values)
        col1, col2 = pickup_coord[0], pickup_coord[1]
        df['xpickup'] = col1
        df['ypickup'] = col2

        dropoff_coord = utm.from_latlon(df['dropoff_latitude'].values,
                                        df['dropoff_longitude'].values)
        col3, col4 = dropoff_coord[0], dropoff_coord[1]
        df['xdropoff'] = col3
        df['ydropoff'] = col4

        tempx = pd.cut(df['xpickup'], bins=grid_x_num).astype(str)
        tempy = pd.cut(df['ypickup'], bins=grid_y_num).astype(str)
        df['pick_up_zone'] = tempx + tempy

        tempx = pd.cut(df['xdropoff'], bins=grid_x_num).astype(str)
        tempy = pd.cut(df['ydropoff'], bins=grid_y_num).astype(str)
        df['drop_off_zone'] = tempx + tempy

        grouped_tmp = df[[
            'driver_id', 'pick_up_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pick_up_zone']).count() / df[[
            'driver_id', 'pick_up_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()
        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')
    return temp
Example #32
def get_spatial_features_radial(df,
                                grid_x_num=10,
                                grid_y_num=10,
                                use_cache=True):
    cache_path = os.path.join(CACHE_DIR, f'radial_spatial_df.msgpack')
    if os.path.exists(cache_path) and use_cache:
        print(f'{cache_path} exists')
        temp = pd.read_msgpack(cache_path)
    else:
        cols = ['r_radial', 'theta_radial']
        create_radial_bins(df, cols)

        grouped_tmp = df[[
            'driver_id', 'pick_up_radial_zone', 'pickup_latitude'
        ]].groupby(['driver_id', 'pick_up_radial_zone']).count() / df[[
            'driver_id', 'pick_up_radial_zone', 'pickup_latitude'
        ]].groupby(['driver_id'])[['pickup_latitude']].count()
        temp = grouped_tmp.unstack(level=0).T
        temp.fillna(0, inplace=True)
        temp.reset_index(inplace=True)
        temp.drop(columns=['level_0'], inplace=True)
        pd.to_msgpack(cache_path, temp)
        print(f'Dumping to {cache_path}')
    return temp
Example #33
    def save(self, label, singlefile=True):
        """Save all current predictions dataframe with some metadata"""

        if singlefile:
            fname = 'epit_%s_%s_%s.mpk' % (label, self.name, self.length)
            print('saving as %s' % fname)
            meta = {'method':self.name, 'length':self.length}
            pd.to_msgpack(fname, meta)
            for i,g in self.data.groupby('name'):
                pd.to_msgpack(fname, g, append=True)
        else:
            #save one file per protein/name
            path = os.path.join(label,self.name)
            print('saving to %s' % path)
            if not os.path.exists(path):
                os.makedirs(path)
            for name,df in self.data.groupby('name'):
                outfile = os.path.join(path, name+'.mpk')
                pd.to_msgpack(outfile,df)
        return
Example #34
def location_and_time_data(file_name='location_time', use_cache=True):
    """
        Returns location and time discounted rating
    """
    cache_path = os.path.join(CACHE_PATH, f'location_and_time_data.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        detail_df = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        # To get review json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "review.json"),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates = [], [], [], []
        with open(os.path.join(EXCEL_PATH, "review.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]

        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "date": dates
        })

        line_count = len(
            open(os.path.join(EXCEL_PATH, "business.json"),
                 encoding='utf8').readlines())
        name, business_id, address, city, state, postal_code = [], [], [], [], [], []
        latitude, longitude, stars, review_count = [], [], [], []
        is_open, attributes, GoodForKids, categories, hours = [], [], [], [], []

        with open(os.path.join(EXCEL_PATH, "business.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                business_id += [blob["business_id"]]
                address += [blob["address"]]
                city += [blob["city"]]
                state += [blob["state"]]
                postal_code += [blob["postal_code"]]
                latitude += [blob["latitude"]]
                longitude += [blob["longitude"]]
                stars += [blob["stars"]]
                review_count += [blob["review_count"]]
                is_open += [blob["is_open"]]

        business = pd.DataFrame({
            "name": name,
            "business_id": business_id,
            "address": address,
            "city": city,
            "state": state,
            "postal_code": postal_code,
            "latitude": latitude,
            "longitude": longitude,
            'stars': stars,
            'review_count': review_count,
            'is_open': is_open
        })

        detail_df = pd.merge(left=ratings,
                             right=business,
                             on='business_id',
                             how='left')
        mean_lat = detail_df.groupby(
            'user_id')['latitude'].mean().reset_index()
        mean_long = detail_df.groupby(
            'user_id')['longitude'].mean().reset_index()
        mean_df = pd.merge(mean_lat, mean_long, on='user_id')
        mean_df.columns = ['user_id', 'mean_lat', 'mean_long']
        detail_df = pd.merge(detail_df, mean_df, on='user_id', how='left')

        detail_df['distance'] = (
            (detail_df['mean_lat'] - detail_df['latitude'])**2) + (
                (detail_df['mean_long'] - detail_df['longitude'])**2)

        # For date distances
        detail_df['date'] = pd.to_datetime(detail_df['date'])
        last_date = detail_df.groupby('user_id')['date'].max().reset_index()
        last_date.columns = ['user_id', 'last_date']
        detail_df = pd.merge(detail_df, last_date, on='user_id', how='left')

        # Months instead of days
        detail_df['date_diff'] = (detail_df['last_date'] - detail_df['date'])
        detail_df['date_diff'] = detail_df['date_diff'].dt.days / 30

        # e(1/1+dist)/e
        # e(1/ log(date))/e
        detail_df['dist_scale'] = np.exp(
            1 / (1 + detail_df['distance'])) / (np.exp(1))
        detail_df['date_scale'] = np.exp(
            1 / (1 + np.log(detail_df['date_diff'] + 1))) / np.exp(1)

        # Multiplying rating scale with the dist
        detail_df[
            'date_rating'] = detail_df['rating'] * detail_df['date_scale']
        detail_df[
            'dist_rating'] = detail_df['rating'] * detail_df['dist_scale']
        detail_df['date_dist_rating'] = detail_df['date_scale'] * detail_df[
            'dist_scale'] * detail_df['rating']

        pd.to_msgpack(cache_path, (detail_df))
        print(f'Dumping to {cache_path}')
    return detail_df
Example #35
def all_data(file_name='all_data', use_cache=True):
    """
        Returns business and user meta data with the ratings
    """
    cache_path = os.path.join(CACHE_PATH, f'all_data.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        temp = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        # To get review json
        line_count = len(
            open(os.path.join(EXCEL_PATH, "review.json"),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates = [], [], [], []
        with open(os.path.join(EXCEL_PATH, "review.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]

        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "date": dates
        })

        line_count = len(
            open(os.path.join(EXCEL_PATH, "business.json"),
                 encoding='utf8').readlines())
        name, business_id, address, city, state, postal_code = [], [], [], [], [], []
        latitude, longitude, stars, review_count = [], [], [], []
        is_open, attributes, GoodForKids, categories, hours = [], [], [], [], []

        with open(os.path.join(EXCEL_PATH, "business.json"),
                  encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                business_id += [blob["business_id"]]
                address += [blob["address"]]
                city += [blob["city"]]
                state += [blob["state"]]
                postal_code += [blob["postal_code"]]
                latitude += [blob["latitude"]]
                longitude += [blob["longitude"]]
                stars += [blob["stars"]]
                review_count += [blob["review_count"]]
                is_open += [blob["is_open"]]

        business = pd.DataFrame({
            "name": name,
            "business_id": business_id,
            "address": address,
            "city": city,
            "state": state,
            "postal_code": postal_code,
            "latitude": latitude,
            "longitude": longitude,
            'stars': stars,
            'review_count': review_count,
            'is_open': is_open
        })

        # To get user json

        line_count = len(
            open(os.path.join(EXCEL_PATH, "user.json"),
                 encoding='utf8').readlines())
        name, user_id, review_count, yelping_since, useful = [], [], [], [], []
        funny, cool, elite, fans = [], [], [], []
        average_stars, compliment_hot, compliment_more, compliment_profile = [], [], [], []
        compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool = [], [], [], [], []
        compliment_funny, compliment_writer, compliment_photos = [], [], []

        with open(os.path.join(EXCEL_PATH, "user.json"), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                name += [blob["name"]]
                user_id += [blob["user_id"]]
                review_count += [blob["review_count"]]
                yelping_since += [blob["yelping_since"]]
                useful += [blob["useful"]]
                funny += [blob["funny"]]
                cool += [blob["cool"]]
                elite += [blob["elite"]]
                fans += [blob["fans"]]
                average_stars += [blob["average_stars"]]
                compliment_hot += [blob["compliment_hot"]]
                compliment_more += [blob["compliment_more"]]
                compliment_profile += [blob["compliment_profile"]]
                compliment_cute += [blob["compliment_cute"]]
                compliment_list += [blob["compliment_list"]]
                compliment_note += [blob["compliment_note"]]
                compliment_plain += [blob["compliment_plain"]]
                compliment_cool += [blob["compliment_cool"]]
                compliment_funny += [blob["compliment_funny"]]
                compliment_writer += [blob["compliment_writer"]]
                compliment_photos += [blob["compliment_photos"]]

        user = pd.DataFrame({
            "name": name,
            "user_id": user_id,
            "review_count": review_count,
            "yelping_since": yelping_since,
            "useful": useful,
            "funny": funny,
            "cool": cool,
            "elite": elite,
            "fans": fans,
            "average_stars": average_stars,
            "compliment_hot": compliment_hot,
            "compliment_more": compliment_more,
            "compliment_profile": compliment_profile,
            "compliment_cute": compliment_cute,
            "compliment_list": compliment_list,
            "compliment_note": compliment_note,
            "compliment_plain": compliment_plain,
            "compliment_cool": compliment_cool,
            "compliment_funny": compliment_funny,
            "compliment_writer": compliment_writer,
            "compliment_photos": compliment_photos
        })

        # To get tip json

        line_count = len(
            open(os.path.join(EXCEL_PATH, "tip.json"),
                 encoding='utf8').readlines())
        business_id, user_id, text, date, compliment_count = [], [], [], [], []

        with open(os.path.join(EXCEL_PATH, "tip.json"), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                business_id += [blob["business_id"]]
                user_id += [blob["user_id"]]
                text += [blob["text"]]
                date += [blob["date"]]
                compliment_count += [blob["compliment_count"]]

        tip = pd.DataFrame({
            "business_id": business_id,
            "user_id": user_id,
            "text": text,
            "date": date,
            "compliment_count": compliment_count
        })
        temp = pd.merge(ratings, business, on='business_id', how='left')
        temp = pd.merge(temp, user, on='user_id', how='left')
        temp = pd.merge(temp,
                        tip,
                        on=['business_id', 'user_id', 'date'],
                        how='left')

        pd.to_msgpack(cache_path, (temp))
        print(f'Dumping to {cache_path}')
    return temp