Example #1
def raw_query(con,
              querystr,
              last_modified,
              dest=None,
              max_rows=cfg.MAX_ROWS,
              fetch=cfg.FETCH_BY_DEFAULT,
              overwrite_method='fail'):
    '''Executes a query and returns the results (or a sample of them) as a
    pandas df, along with the destination table as a dict.

    INPUTS:
        con: connection object used to run the query and manage the cache
        querystr (str): the query to execute
        last_modified: timestamp used to check whether a cached result is stale
        dest (dict): destination table for the query output (if None, BQ
            creates a temporary (24hr) table)
        max_rows (int): max number of rows that the con will return in the results
        fetch (bool): if True, fetch the full resultset locally; otherwise
            return only a sample of the first 5 rows
        overwrite_method (str): one of 'append', 'overwrite', or 'fail';
            controls the write disposition if dest already exists
    OUTPUTS:
        result (pandas dataframe): the full query results (if fetch==True) or
            the first 5 rows of the resultset (if fetch==False)
        destinationtable (dict): remote table that contains the query results
        exceeds_max_rows (bool): whether the remote resultset has more than
            max_rows rows
    '''
    exists = con._check_query(querystr, fetch, last_modified)
    if overwrite_method == 'append':
        write_disposition = 'WRITE_APPEND'
    elif overwrite_method == 'overwrite':
        write_disposition = 'WRITE_TRUNCATE'
    else:
        write_disposition = 'WRITE_EMPTY'
    if not exists:
        query_response = run_query(con,
                                   querystr,
                                   destination_table=dest,
                                   write_disposition=write_disposition)
        if fetch:
            fields, data = fetch_query(con,
                                       query_response,
                                       start_row=0,
                                       max_rows=max_rows)
            df = bqresult_2_df(fields, data)
            source = query_response['configuration']['query']['destinationTable']
            con._cache_query(querystr, df, source, fetch)
            # the tables().get() response reports numRows as a string, so
            # cast it before comparing against max_rows
            num_rows = int(con.client._apiclient.tables().get(
                **source).execute()['numRows'])
            exceeds_max_rows = num_rows > max_rows
            return df, source, exceeds_max_rows

        else:
            fields, data = fetch_query(con,
                                       query_response,
                                       start_row=0,
                                       max_rows=5)
            df = bqresult_2_df(fields, data)
            source = query_response['configuration']['query']['destinationTable']
            exceeds_max_rows = False
            con._cache_query(querystr, df, source, fetch)
            return df, source, exceeds_max_rows
    else:
        return con._fetch_from_cache(querystr)
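A minimal usage sketch for raw_query as defined above. The connection object con, the cfg module, and the query/table names are illustrative assumptions, not fixtures from the example:

querystr = 'SELECT word, word_count FROM [publicdata:samples.shakespeare] LIMIT 100'
my_dest = {'projectId': 'my-project',  # hypothetical destination table
           'datasetId': 'my_dataset',
           'tableId': 'shakespeare_sample'}
df, source, exceeds_max_rows = raw_query(con,
                                         querystr,
                                         last_modified=None,  # assuming no cached copy to compare against
                                         dest=my_dest,
                                         fetch=True,
                                         overwrite_method='overwrite')
if exceeds_max_rows:
    print('df holds only the first %s rows of the resultset' % cfg.MAX_ROWS)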
Example #2
def apply(self,
          func,
          col=None,
          columns=None,
          max_rows=cfg.MAX_ROWS,
          fetch=True,
          dest=None,
          chunksize=10000):
    '''Hacky way to support arbitrary python "udfs": pulls each chunk of rows
    locally, applies the python function, then writes the results back to bq.'''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    startrow = 0
    while startrow < len(self):
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote),
            start_row=startrow,
            max_rows=chunksize)
        ndf = bqresult_2_df(fields, data)
        ndf[col + '_mod'] = ndf[col].apply(func)
        if dest is None:
            dest = self.remote + '_mod_%s' % col
        # keep only the transformed column before appending to the remote table
        ndf = ndf[[col + '_mod']]
        _, _ = write_df_to_remote(self.con,
                                  ndf,
                                  overwrite_method='append',
                                  **bqutil.dictify(dest))
        startrow += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    combined_df = BQDF(self.con, dest)
    return combined_df
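A sketch of calling the apply method above on an existing BQDF instance; the table reference and column name are hypothetical:

bqdf = BQDF(con, 'my-project:my_dataset.shakespeare_sample')  # assumed table
doubled = bqdf.apply(lambda x: x * 2, col='word_count')
# doubled is a new BQDF backed by the appended remote table
# 'my-project:my_dataset.shakespeare_sample_mod_word_count'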
Example #3
def slice(self, start=0, end=10):
    # NOTE: the slice must fit in local memory;
    # see if there is a bigquery way to do this
    fields, data = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=start, max_rows=end - start)
    ndf = bqresult_2_df(fields, data)
    dest = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('failed to write new slice to bigquery')
    ndf = BQDF(self.con, dest)
    ndf.refresh()
    return ndf
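Likewise, a hedged sketch for slice; with the defaults (start=0, end=10) it materializes the first 10 rows as a new remote table suffixed '_slice_0to10'. The table reference is again hypothetical:

bqdf = BQDF(con, 'my-project:my_dataset.shakespeare_sample')  # assumed table
middle = bqdf.slice(start=100, end=150)  # rows 100-149 as a new BQDF
# middle is refreshed and backed by a remote table ending in '_slice_100to150'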