Example #1
 def apply(self,
           func,
           col=None,
           columns=None,
           max_rows=cfg.MAX_ROWS,
           fetch=True,
           dest=None,
           chunksize=10000):
     '''idea is to (in a majorly hacky way) allow arbitrary python "udfs" by pulling each row locally and applying the python function, then writing back to bq'''
     # TODO make work and allow user to provide arguments to function
     if col is None:
         col = self.active_col
     startrow = 0
     while startrow < len(self):
         fields, data = self.con.client.ReadSchemaAndRows(
             bqutil.dictify(self.remote),
             start_row=startrow,
             max_rows=chunksize)
         ndf = bqresult_2_df(fields, data)
         ndf[col + '_mod'] = ndf[col].apply(func)
         if dest is None:
             dest = self.remote + '_mod_%s' % col
         ndf = ndf[[col + '_mod']]
         _, _ = write_df_to_remote(self.con,
                                   ndf,
                                   overwrite_method='append',
                                   **bqutil.dictify(dest))
         startrow += chunksize
     if not self._check_write(dest):
         warnings.warn('remote writing of UDF apply function failed')
     combined_df = BQDF(self.con, dest)
     return combined_df
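
A hypothetical usage sketch for the chunked apply above; the connection object, table path, column name, and transform are placeholders for illustration, not taken from the library:

def double(x):
    # arbitrary local Python function used as the "udf"
    return x * 2

bqdf = BQDF(con, 'myproject:mydataset.mytable')  # assumed remote table string
result = bqdf.apply(double, col='price', chunksize=5000)
# result wraps a new table named '<remote>_mod_price' holding the transformed column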
Example #2
 def slice(self, start=0, end=10):
     # NOTE need to fit slice locally
     # see if there is a bigquery way to do this
     fields, data = self.con.client.ReadSchemaAndRows(
         bqutil.dictify(self.remote), start_row=start, max_rows=end - start)
     ndf = bqresult_2_df(fields, data)
     dest = self.remote + '_slice_%sto%s' % (start, end)
     _ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn('failed to write new slice to bigquery')
     ndf = BQDF(self.con, dest)
     ndf.refresh()
     return ndf
Example #3
 def slice(self, start=0, end=10):
     # NOTE need to fit slice locally
     # see if there is a bigquery way to do this
     fields, data = self.con.client.ReadSchemaAndRows(bqutil.dictify(
         self.remote),
                                                      start_row=start,
                                                      max_rows=end - start)
     ndf = bqresult_2_df(fields, data)
     dest = self.remote + '_slice_%sto%s' % (start, end)
     _ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn('failed to write new slice to bigquery')
     ndf = BQDF(self.con, dest)
     ndf.refresh()
     return ndf
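
A hypothetical call of the slice method shown above, assuming a BQDF handle has already been constructed; all names are placeholders:

bqdf = BQDF(con, 'myproject:mydataset.mytable')  # assumed remote table string
head = bqdf.slice(start=0, end=100)
# head is a new BQDF backed by '<remote>_slice_0to100' containing rows 0 through 99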
Example #4
 def _get_nth_row(self, n):
     fields, data = self.con.client.ReadSchemaAndRows(
         bqutil.dictify(self.remote), start_row=n, max_rows=1)
     result = {f['name']: d for f, d in zip(fields, data[0])}
     return result
Example #5
 def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
     ''' same as apply (python udf hack) but for groups analogous to df.groupby('col').apply(myfunc)
     # TODO make work and allow user to provide arguments
     groups data by grouping column and performs requested operations on other columns
     INPUTS:
         groupingcol (str): column to group on
         func (python function): takes arbitrary python function that acts on all data in a group
         columns (list): list of column names to touch with function
     OUTPUTS:
        ndf: BQDF instance for result
     '''
     dest = None
     if columns is None:
         columns = self.columns
     for group in self.unique(groupingcol):
         group_query = "SELECT %s FROM %s WHERE  %s == %s" (
             ', '.join(columns), self.tablename, groupingcol, group)
         ndf = self.query(group_query, fetch=True, dest=dest)
         applied_ndf = func(ndf.local)
         if dest is None:
             gdf = self.query(group_query, fetch=True, dest=None)
             dest = gdf.remote
         _, _ = write_df_to_remote(
             self.con, applied_ndf, overwrite_method='append', **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn(
             'remote writing of UDF groupby-apply function failed')
     gdf = BQDF(self.con, '%s' % dest)
     return gdf
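
A hypothetical usage sketch for groupby_apply; the grouping column, the selected columns, and the per-group function are assumptions for illustration. Note that the generated WHERE clause interpolates the group value unquoted, so a numeric grouping column is assumed here:

def summarize(local_df):
    # receives one group's rows as a local pandas DataFrame; must return a DataFrame
    return local_df.describe()

bqdf = BQDF(con, 'myproject:mydataset.mytable')  # assumed remote table string
per_group = bqdf.groupby_apply('store_id', summarize, columns=['price', 'quantity'])
# per_group is a BQDF whose table accumulates one appended result per unique store_id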
Example #6
 def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
     '''idea is to (in a majorly hacky way) allow arbitrary python "udfs" by pulling each row locally and applying the python function, then writing back to bq'''
     # TODO make work and allow user to provide arguments to function
     if col is None:
         col = self.active_col
     startrow = 0
     while startrow < len(self):
         fields, data = self.con.client.ReadSchemaAndRows(
             bqutil.dictify(self.remote), start_row=startrow, max_rows=chunksize)
         ndf = bqresult_2_df(fields, data)
         ndf[col + '_mod'] = ndf[col].apply(func)
         if dest is None:
             dest = self.remote + '_mod_%s' % col
         ndf = ndf[[col + '_mod']]
         _, _ = write_df_to_remote(
             self.con, ndf, overwrite_method='append', **bqutil.dictify(dest))
         startrow += chunksize
     if not self._check_write(dest):
         warnings.warn('remote writing of UDF apply function failed')
     combined_df = BQDF(self.con, dest)
     return combined_df
Example #7
def create_column_from_values(con, col, content, remotetable, length=None):
    '''create new dataframe with column content (which can then be joined with existing table)'''
    d = bqutil.dictify(remotetable)
    d['tableId'] = d['tableId'] + '_newcol_' + \
        str(np.random.randint(1000, 10000))
    if not hasattr(content, '__iter__'):
        try:
            np.isnan(content)
        except TypeError:
            content = unicode(content)
        content = [content for i in range(length)]
    df = pd.DataFrame({col: content})
    con, dest = write_df_to_remote(
        con, df, overwrite_method='fail', projectId=d['projectId'], datasetId=d['datasetId'], tableId=d['tableId'])
    return dest
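
A hypothetical call of create_column_from_values; a scalar value is broadcast to length rows, and the table reference string is a placeholder:

dest = create_column_from_values(con, 'label', 'control',
                                 'myproject:mydataset.mytable', length=500)
# dest names a new single-column table '<tableId>_newcol_<random>' that can be joined back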
Example #8
 def _check_write(self, newremote, timeout=10):
     '''poll a newly created table until it reports rows (i.e. the insert has landed) or the timeout expires'''
     loaded = False
     start_time = time()
     elapsed_time = 0
     while not loaded:
         if elapsed_time < timeout:
             resource = bqutil.get_table_resource(self.con,
                                                  bqutil.dictify(newremote))
             # won't contain this attribute while actively streaming
             # insertions
             if 'numRows' in resource:
                 if int(resource['numRows']) > 0:
                     return True
             elapsed_time = time() - start_time
             sleep(.5)
         else:
             return False
Example #9
 def _check_write(self, newremote, timeout=10):
     '''poll a newly created table until it reports rows (i.e. the insert has landed) or the timeout expires'''
     loaded = False
     start_time = time()
     elapsed_time = 0
     while not loaded:
         if elapsed_time < timeout:
             resource = bqutil.get_table_resource(
                 self.con, bqutil.dictify(newremote))
             # won't contain this attribute while actively streaming
             # insertions
             if 'numRows' in resource:
                 if int(resource['numRows']) > 0:
                     return True
             elapsed_time = time() - start_time
             sleep(.5)
         else:
             return False
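
A hypothetical sketch of how _check_write is typically used right after an append, mirroring the pattern in the apply example above; dest and ndf are placeholders:

_, _ = write_df_to_remote(self.con, ndf, overwrite_method='append',
                          **bqutil.dictify(dest))
if not self._check_write(dest, timeout=30):
    warnings.warn('table %s still reports no rows after 30s' % dest)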
Example #10
def create_column_from_values(con, col, content, remotetable, length=None):
    '''create new dataframe with column content (which can then be joined with existing table)'''
    d = bqutil.dictify(remotetable)
    d['tableId'] = d['tableId'] + '_newcol_' + \
        str(np.random.randint(1000, 10000))
    if not hasattr(content, '__iter__'):
        try:
            np.isnan(content)
        except TypeError:
            content = unicode(content)
        content = [content for i in range(length)]
    df = pd.DataFrame({col: content})
    con, dest = write_df_to_remote(con,
                                   df,
                                   overwrite_method='fail',
                                   projectId=d['projectId'],
                                   datasetId=d['datasetId'],
                                   tableId=d['tableId'])
    return dest
Example #11
def batch_df_to_remote(con,
                       df,
                       overwrite_method='fail',
                       delete=True,
                       name=None,
                       bucket=None,
                       projectId=None,
                       datasetId=None,
                       tableId=None):
    '''write pandas dataframe as bigquery table'''
    schema = {"fields": bqutil.bqjson_from_df(df, dumpjson=False)}
    table_ref = {
        'tableId': tableId,
        'datasetId': datasetId,
        'projectId': projectId
    }
    if overwrite_method == 'append':
        write_disposition = 'WRITE_APPEND'
    elif overwrite_method == 'overwrite':
        write_disposition = 'WRITE_TRUNCATE'
    else:
        write_disposition = 'WRITE_EMPTY'
    df.to_csv(tableId + '.csv', index=False)
    filename = os.path.join(os.getcwd(), tableId + '.csv')
    project = projectId
    if name is None:
        name = datasetId + tableId
    bqutil.file_to_bucket(con, project, bucket, filename, name=name)
    jobref = bucket_to_bq(con,
                          table_ref,
                          projectId,
                          bucket,
                          name,
                          schema=schema,
                          write_disposition=write_disposition,
                          wait=True)
    if delete:
        delete_from_bucket(con, project, bucket, name)
    return con, bqutil.stringify(table_ref)
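
A hypothetical usage sketch for batch_df_to_remote, assuming the bucket parameter shown in the signature above; the project, dataset, table, and bucket names are placeholders:

df = pd.DataFrame({'price': [1.0, 2.5], 'quantity': [3, 7]})
con, remote = batch_df_to_remote(con, df,
                                 overwrite_method='overwrite',
                                 bucket='my-staging-bucket',
                                 projectId='myproject',
                                 datasetId='mydataset',
                                 tableId='mytable')
# remote is the stringified reference of the table loaded via the GCS staging file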
Example #12
def batch_df_to_remote(con, df, overwrite_method='fail', delete=True, name=None, bucket=None, projectId=None, datasetId=None, tableId=None):
    '''write pandas dataframe as bigquery table'''
    schema = {"fields": bqutil.bqjson_from_df(df, dumpjson=False)}
    table_ref = {'tableId': tableId,
                 'datasetId': datasetId,
                 'projectId': projectId}
    if overwrite_method == 'append':
        write_disposition = 'WRITE_APPEND'
    elif overwrite_method == 'overwrite':
        write_disposition = 'WRITE_TRUNCATE'
    else:
        write_disposition = 'WRITE_EMPTY'
    df.to_csv(tableId + '.csv', index=False)
    filename = os.path.join(os.getcwd(), tableId + '.csv')
    project = projectId
    if name is None:
        name = datasetId + tableId
    bqutil.file_to_bucket(con, project, bucket, filename, name=name)
    jobref = bucket_to_bq(con, table_ref, projectId, bucket, name,
                          schema=schema, write_disposition=write_disposition, wait=True)
    if delete:
        delete_from_bucket(con, project, bucket, name)
    return con, bqutil.stringify(table_ref)
Example #13
 def groupby_apply(self,
                   groupingcol,
                   func,
                   columns=None,
                   max_rows=cfg.MAX_ROWS,
                   fetch=True,
                   dest=None):
     ''' same as apply (python udf hack) but for groups analogous to df.groupby('col').apply(myfunc)
     # TODO make work and allow user to provide arguments
     groups data by grouping column and performs requested operations on other columns
     INPUTS:
         groupingcol (str): column to group on
         func (python function): takes arbitrary python function that acts on all data in a group
         columns (list): list of column names to touch with function
     OUTPUTS:
        ndf: BQDF instance for result
     '''
     dest = None
     if columns is None:
         columns = self.columns
     for group in self.unique(groupingcol):
         group_query = "SELECT %s FROM %s WHERE  %s == %s" (
             ', '.join(columns), self.tablename, groupingcol, group)
         ndf = self.query(group_query, fetch=True, dest=dest)
         applied_ndf = func(ndf.local)
         if dest is None:
             gdf = self.query(group_query, fetch=True, dest=None)
             dest = gdf.remote
         _, _ = write_df_to_remote(self.con,
                                   applied_ndf,
                                   overwrite_method='append',
                                   **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn(
             'remote writing of UDF groupby-apply function failed')
     gdf = BQDF(self.con, '%s' % dest)
     return gdf
Example #14
 def get_resource(self, remote):
     '''fetch info about remote table'''
     return self.con.client._apiclient.tables().get(
         **bqutil.dictify(remote)).execute()
Example #15
 def _get_nth_row(self, n):
     fields, data = self.con.client.ReadSchemaAndRows(
         bqutil.dictify(self.remote), start_row=n, max_rows=1)
     result = {f['name']: d for f, d in zip(fields, data[0])}
     return result
Example #16
 def get_resource(self, remote):
     '''fetch info about remote table'''
     return self.con.client._apiclient.tables().get(**bqutil.dictify(remote)).execute()
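
All of the examples above route table references through bqutil.dictify and bqutil.stringify. A minimal sketch of what those helpers presumably do, inferred from how their output is splatted into projectId/datasetId/tableId keyword arguments; this is an assumption, not the library's actual implementation:

def dictify(remote):
    # assumed remote format: 'projectId:datasetId.tableId'
    project, rest = remote.split(':')
    dataset, table = rest.split('.')
    return {'projectId': project, 'datasetId': dataset, 'tableId': table}

def stringify(table_ref):
    # inverse of dictify: rebuild the 'project:dataset.table' string
    return '%s:%s.%s' % (table_ref['projectId'],
                         table_ref['datasetId'],
                         table_ref['tableId'])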