def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
    '''idea is to (in a majorly hacky way) allow arbitrary python "udfs" by pulling each chunk of rows
    locally, applying the python function, then writing the results back to bq'''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    startrow = 0
    while startrow < len(self):
        # pull the next chunk of rows from the remote table
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=startrow, max_rows=chunksize)
        ndf = bqresult_2_df(fields, data)
        # apply the python function locally and keep only the new column
        ndf[col + '_mod'] = ndf[col].apply(func)
        if dest is None:
            dest = self.remote + '_mod_%s' % col
        ndf = ndf[[col + '_mod']]
        _, _ = write_df_to_remote(
            self.con, ndf, overwrite_method='append', **bqutil.dictify(dest))
        startrow += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    combined_df = BQDF(self.con, dest)
    return combined_df
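

# Illustrative sketch (not part of the library): one way apply() might be used to run a
# python "udf" over a single column. Assumes `bqdf` is an existing BQDF instance whose
# table has a numeric column named 'price'; the column name is hypothetical.
def _example_apply(bqdf):
    doubled = bqdf.apply(lambda x: x * 2, col='price')
    # result is a BQDF pointing at the appended '<col>_mod' table; fetch it locally
    return doubled.local
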
def slice(self, start=0, end=10):
    # NOTE need to fit slice locally
    # see if there is a bigquery way to do this
    fields, data = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=start, max_rows=end - start)
    ndf = bqresult_2_df(fields, data)
    dest = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('failed to write new slice to bigquery')
    ndf = BQDF(self.con, dest)
    ndf.refresh()
    return ndf
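

# Illustrative sketch (not part of the library): pulling rows 100-110 into a new remote
# '_slice_...' table. Assumes `bqdf` is an existing BQDF instance.
def _example_slice(bqdf):
    window = bqdf.slice(start=100, end=110)
    return window
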
def _get_nth_row(self, n):
    fields, data = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=n, max_rows=1)
    result = {f['name']: d for f, d in zip(fields, data[0])}
    return result
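

# Illustrative sketch (not part of the library): _get_nth_row returns a single row as a
# {column name: value} dict. Assumes `bqdf` is an existing BQDF instance.
def _example_nth_row(bqdf):
    first_row = bqdf._get_nth_row(0)
    return first_row
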
def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
    ''' same as apply (python udf hack) but for groups
    analogous to df.groupby('col').apply(myfunc)
    # TODO make work and allow user to provide arguments
    groups data by grouping column and performs requested operations on other columns
    INPUTS:
        groupingcol (str): column to group on
        func (python function): arbitrary python function that acts on all data in a group
        columns (list): list of column names to touch with function
    OUTPUTS:
        ndf: BQDF instance for result
    '''
    dest = None
    if columns is None:
        columns = self.columns
    for group in self.unique(groupingcol):
        # fetch all rows for this group, then apply the python function locally
        group_query = "SELECT %s FROM %s WHERE %s = %s" % (
            ', '.join(columns), self.tablename, groupingcol, group)
        ndf = self.query(group_query, fetch=True, dest=dest)
        applied_ndf = func(ndf.local)
        if dest is None:
            # reuse the destination table of the first group's query for all appended results
            gdf = self.query(group_query, fetch=True, dest=None)
            dest = gdf.remote
        _, _ = write_df_to_remote(
            self.con, applied_ndf, overwrite_method='append', **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF groupby-apply function failed')
    gdf = BQDF(self.con, '%s' % dest)
    return gdf
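

# Illustrative sketch (not part of the library): a groupby-apply that demeans each group.
# Assumes `bqdf` has columns 'user_id' and 'score'; both names are hypothetical.
def _example_groupby_apply(bqdf):
    def demean(local_df):
        # local_df is the group's data pulled locally as a pandas dataframe
        local_df['score'] = local_df['score'] - local_df['score'].mean()
        return local_df
    return bqdf.groupby_apply('user_id', demean, columns=['user_id', 'score'])
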
def create_column_from_values(con, col, content, remotetable, length=None):
    '''create new dataframe with column content (which can then be joined with existing table)'''
    d = bqutil.dictify(remotetable)
    d['tableId'] = d['tableId'] + '_newcol_' + \
        str(np.random.randint(1000, 10000))
    if not hasattr(content, '__iter__'):
        # scalar content: coerce non-numeric scalars to text, then broadcast to the requested length
        try:
            np.isnan(content)
        except TypeError:
            content = unicode(content)
        content = [content for i in range(length)]
    df = pd.DataFrame({col: content})
    con, dest = write_df_to_remote(
        con, df, overwrite_method='fail', projectId=d['projectId'],
        datasetId=d['datasetId'], tableId=d['tableId'])
    return dest
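

# Illustrative sketch (not part of the library): broadcasting a scalar label into a
# one-column table that can later be joined against the source table. Assumes `con` is an
# established connection; the table path string is hypothetical.
def _example_create_column(con):
    dest = create_column_from_values(
        con, 'label', 'control', 'myproject:mydataset.mytable', length=100)
    return dest
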
def _check_write(self, newremote, timeout=10):
    '''query from a newly created table (waits until table has been fully inserted)'''
    loaded = False
    start_time = time()
    elapsed_time = 0
    while not loaded:
        if elapsed_time < timeout:
            resource = bqutil.get_table_resource(
                self.con, bqutil.dictify(newremote))
            # won't contain this attribute while actively streaming insertions
            if 'numRows' in resource:
                if int(resource['numRows']) > 0:
                    return True
            elapsed_time = time() - start_time
            sleep(.5)
        else:
            return False
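

# Illustrative sketch (not part of the library): _check_write is the polling guard used after
# writes; callers warn rather than raise when it times out. Assumes `bqdf` is an existing
# BQDF instance and `dest` is the remote string of a table just written.
def _example_check_write(bqdf, dest):
    if not bqdf._check_write(dest, timeout=30):
        warnings.warn('table %s never reported rows within the timeout' % dest)
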
def batch_df_to_remote(con, df, overwrite_method='fail', delete=True, name=None, bucket=None,
                       projectId=None, datasetId=None, tableId=None):
    '''write pandas dataframe as bigquery table (batch load job staged through a GCS bucket)'''
    schema = {"fields": bqutil.bqjson_from_df(df, dumpjson=False)}
    table_ref = {'tableId': tableId,
                 'datasetId': datasetId,
                 'projectId': projectId}
    if overwrite_method == 'append':
        write_disposition = 'WRITE_APPEND'
    elif overwrite_method == 'overwrite':
        write_disposition = 'WRITE_TRUNCATE'
    else:
        write_disposition = 'WRITE_EMPTY'
    # dump the dataframe to a local csv, upload it to the bucket, then load into bigquery
    df.to_csv(tableId + '.csv', index=False)
    filename = os.path.join(os.getcwd(), tableId + '.csv')
    if name is None:
        name = datasetId + tableId
    bqutil.file_to_bucket(con, projectId, bucket, filename, name=name)
    jobref = bucket_to_bq(con, table_ref, projectId, bucket, name,
                          schema=schema, write_disposition=write_disposition, wait=True)
    if delete:
        # clean up the staged csv from the bucket once the load job has finished
        delete_from_bucket(con, projectId, bucket, name)
    return con, bqutil.stringify(table_ref)
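

# Illustrative sketch (not part of the library): loading a dataframe through a GCS staging
# bucket rather than streaming, using the bucket parameter in the signature above. The
# project/dataset/table/bucket names are hypothetical.
def _example_batch_load(con):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    con, remote = batch_df_to_remote(
        con, df, overwrite_method='overwrite', bucket='my-staging-bucket',
        projectId='myproject', datasetId='mydataset', tableId='mytable')
    return remote
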
def get_resource(self, remote):
    '''fetch info about remote table'''
    return self.con.client._apiclient.tables().get(
        **bqutil.dictify(remote)).execute()
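

# Illustrative sketch (not part of the library): reading row and size metadata from the
# table resource dict returned by get_resource. Assumes `bqdf` is an existing BQDF instance.
def _example_resource(bqdf):
    resource = bqdf.get_resource(bqdf.remote)
    return resource.get('numRows'), resource.get('numBytes')
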