def s3_download_data(s3_filename, prefix=False, output_file=None, sep=',',
                     skip_empty_files=True, first_row_columns=True):
    key_id = config.get_value('aws', 'aws_access_key_id')
    access_key = config.get_value('aws', 'aws_secret_access_key')
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )
    s3 = session.resource('s3')
    # Split an 's3://bucket/key' URI into bucket name and key prefix.
    s3_file_pattern = re.compile(r's3://([^/]+)/?(.*)')
    match = s3_file_pattern.match(s3_filename)
    bucket_name = match.group(1)
    key_prefix = match.group(2)
    if isinstance(sep, str) and len(sep) == 1:
        data = _download_data(key_prefix, s3, bucket_name, prefix, sep,
                              skip_empty_files, first_row_columns)
        if output_file:
            data.to_csv(output_file, sep=sep)
        return data
    else:
        print('Separator must be a 1-character string')
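# A minimal usage sketch of s3_download_data. The bucket and key names below
# are hypothetical; credentials are assumed to be set in the [aws] section of
# config.ini:
#
#   # Download a single file into a DataFrame.
#   df = s3_download_data('s3://example-bucket/reports/2019-01-01.csv')
#
#   # Treat the path as a key prefix, concatenate all matching files,
#   # and also save the combined result locally.
#   df = s3_download_data('s3://example-bucket/reports/', prefix=True,
#                         output_file='combined.csv', sep=',')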
def get_data_from_moat(moat_dict, database_name):
    """
    Downloads data from MOAT through its API into a pandas DataFrame.

    Args:
        moat_dict (dict): dictionary with keys:
            'start' (str): start date of analysis, 'YYYYMMDD' - obligatory,
            'end' (str): end date of analysis, 'YYYYMMDD' - obligatory,
            'columns' (list of str): metrics to download - obligatory,
            'level1' (str): company specific - optional,
            'level2' (str): company specific - optional,
            'level3' (str): company specific - optional,
            'level4' (str): company specific - optional
        database_name (str): name of the database. Values (database names and
            ids provided by MOAT) need to be defined in the config file.

    Returns:
        pandas DataFrame

    Full documentation of the MOAT API is available at http://api.moat.com/docs.
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    if not validate_input_dict(moat_dict):
        return pd.DataFrame([])
    try:
        token = config.get_value('moat', 'token')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])
    try:
        db_id = config.get_value('moat_db', database_name)
    except (KeyError, NoOptionError):
        print('Such database name is not available. Please check config file')
        return pd.DataFrame([])
    moat_dict['columns'] = ','.join(moat_dict['columns'])
    moat_dict['brandId'] = db_id
    http = urllib3.PoolManager()
    auth_header = 'Bearer {}'.format(token)
    resp = http.request('GET', 'https://api.moat.com/1/stats.json',
                        fields=moat_dict,
                        headers={'Authorization': auth_header})
    data = json.loads(resp.data)
    if 'error' in data:
        print('Error: ' + data['error'])
        return pd.DataFrame([])
    if data['results']['details'] == [[]]:
        print('Data returned is empty')
        return pd.DataFrame([])
    df = pd.DataFrame(data['results']['details'])
    return df
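# A hedged usage sketch for get_data_from_moat. The date range and column
# names below are illustrative only (valid metric names depend on your MOAT
# account; see http://api.moat.com/docs), and 'my_db' must be defined in the
# [moat_db] section of config.ini:
#
#   moat_dict = {
#       'start': '20190101',
#       'end': '20190131',
#       'columns': ['impressions_analyzed', 'loads'],  # hypothetical metrics
#       'level1': 'example-advertiser'                 # optional, company specific
#   }
#   df = get_data_from_moat(moat_dict, 'my_db')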
def done_athena(query_id, filename=None):
    if not input_check(query_id, [str]):
        return return_on_exception(filename)
    if not input_check(filename, [str, type(None)]):
        return return_on_exception(filename)
    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError) as e:
        print('All or part of credentials were not provided. Please verify config.ini file. Error message:')
        print(e)
        return return_on_exception(filename)
    if s3_bucket.startswith('s3://'):
        s3_bucket = s3_bucket.replace('s3://', '')
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )
    s3 = session.resource('s3')
    athena = session.client('athena', region_name=region)
    result = poll_status(athena, query_id)
    if result is None:
        return return_on_exception(filename)
    if result['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        s3_key = urlparse(result['QueryExecution']['ResultConfiguration']['OutputLocation']).path[1:]
        return download_file(s3, s3_bucket, s3_key, filename)
    else:
        print('Query did not succeed. Reason:')
        print(result['QueryExecution']['Status']['StateChangeReason'])
        return return_on_exception(filename)
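# Usage sketch for done_athena: fetch the results of a previously started
# Athena query by its execution id. The id and file name below are
# hypothetical:
#
#   df = done_athena('12345678-abcd-efgh-ijkl-1234567890ab')
#   df = done_athena('12345678-abcd-efgh-ijkl-1234567890ab',
#                    filename='results.csv')  # also saves the data locally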
def s3_upload_data(data, bucket, path, sep=','):
    key_id = config.get_value('aws', 'aws_access_key_id')
    access_key = config.get_value('aws', 'aws_secret_access_key')
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )
    if isinstance(sep, str) and len(sep) == 1:
        csv_buffer = StringIO()
        if isinstance(data, (pd.DataFrame, np.ndarray)):
            if isinstance(data, pd.DataFrame):
                data.to_csv(csv_buffer, sep=sep)
            else:
                np.savetxt(csv_buffer, data, delimiter=sep, fmt='%s')
            s3 = session.resource('s3')
            body = csv_buffer.getvalue()
            try:
                s3.Bucket(bucket).put_object(Key=path, Body=body)
                print('Success. File saved at s3://{}/{}'.format(bucket, path))
            except TypeError:
                print('Bucket name must be a string')
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchBucket':
                    print('The specified bucket does not exist')
            except ParamValidationError as e:
                print(e)
        else:
            print('Uploaded file must be pandas DataFrame or numpy array and not {}'.format(type(data)))
    else:
        print('Separator must be a 1-character string')
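# Usage sketch for s3_upload_data (bucket and path below are hypothetical):
#
#   df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#   s3_upload_data(df, bucket='example-bucket', path='uploads/df.csv')
#
#   arr = np.array([[1, 2], [3, 4]])
#   s3_upload_data(arr, bucket='example-bucket', path='uploads/arr.csv', sep=';')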
def init_gam_connection(network_code=None):
    if not network_code:
        try:
            network_code = config.get_value('google_ad_manager', 'network_code')
        except (KeyError, NoOptionError):
            print('No network code was provided')
            return pd.DataFrame([])
    yaml_string = "ad_manager: " + "\n" + \
                  "  application_name: " + APPLICATION_NAME + "\n" + \
                  "  network_code: " + str(network_code) + "\n" + \
                  "  path_to_private_key_file: " + KEY_FILE + "\n"
    # Initialize the GAM client.
    gam_client = ad_manager.AdManagerClient.LoadFromString(yaml_string)
    return gam_client
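# Usage sketch for init_gam_connection. With no argument, the network code is
# read from the [google_ad_manager] section of config.ini; the explicit code
# below is a placeholder:
#
#   gam_client = init_gam_connection()
#   gam_client = init_gam_connection(network_code=12345678)  # hypothetical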
def get_options_from_config():
    # Set the options in a dictionary, in order to pass only the
    # options that were provided in the configuration file to the
    # MySQL connector. Passing empty values would trigger exceptions.
    options = dict()
    for option in ('host', 'port', 'user', 'password', 'unix_socket', 'database'):
        try:
            options[option] = config.get_value('mysql', option)
        except (KeyError, NoOptionError):
            # Do nothing, every one of these values is optional.
            pass
    return options
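# The returned dictionary is meant to be unpacked into the MySQL connector,
# which is why absent options are omitted rather than passed as empty strings.
# A sketch of the presumed call site (mysql.connector is the standard MySQL
# package, but this exact usage is an assumption, not taken from the source):
#
#   import mysql.connector
#   connection = mysql.connector.connect(**get_options_from_config())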
def get_data_from_rubicon(rubicon_dict, currency='USD'):
    """
    Downloads data from the Rubicon database through its API into a pandas DataFrame.

    Args:
        rubicon_dict (dict): dictionary with keys:
            'start' (str): start date in ISO-8601 format, including time zone - obligatory,
            'end' (str): end date in ISO-8601 format, including time zone - obligatory,
            'dimensions' (list of str): dimensions to include as columns - obligatory,
            'metrics' (list of str): metrics to include as columns - obligatory,
            'filters' (list of str): filters to apply - obligatory
        currency (str): currency to be used

    Returns:
        pandas DataFrame

    Full documentation (additional features) of the Rubicon API is available at:
    https://resources.rubiconproject.com/resource/publisher-resources/performance-analytics-api/
    (you need to be logged in with the provided credentials).
    """
    try:
        username = config.get_value('rubicon', 'username')
        password = config.get_value('rubicon', 'password')
        rubicon_id = config.get_value('rubicon', 'id')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])
    if 'start' not in rubicon_dict or 'end' not in rubicon_dict:
        print('Required fields are not set')
        return pd.DataFrame([])
    rubicon_dict['currency'] = currency
    try:
        if not rubicon_dict['dimensions'] or not rubicon_dict['metrics'] or not rubicon_dict['filters']:
            print('Required fields are empty')
            return pd.DataFrame([])
        rubicon_dict['dimensions'] = ','.join(rubicon_dict['dimensions'])
        rubicon_dict['metrics'] = ','.join(rubicon_dict['metrics'])
        rubicon_dict['filters'] = ';'.join(rubicon_dict['filters'])
    except KeyError:
        print('Required fields are not set')
        return pd.DataFrame([])
    url = urllib.parse.unquote(
        'https://api.rubiconproject.com/analytics/v1/report/?account=publisher/{}&'.format(rubicon_id)
        + urllib.parse.urlencode(rubicon_dict))
    p = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    p.add_password(None, url, username, password)
    auth_handler = urllib.request.HTTPBasicAuthHandler(p)
    opener = urllib.request.build_opener(auth_handler)
    opener.addheaders = [('Accept', 'text/csv')]
    urllib.request.install_opener(opener)
    try:
        result = opener.open(url)
    except IOError as e:
        print('Something went wrong, please see error message:')
        print(e)
        return pd.DataFrame([])
    result_data = result.read().decode()
    df = pd.read_csv(StringIO(result_data))
    return df
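# A hedged usage sketch for get_data_from_rubicon. The dimension, metric and
# filter names below are illustrative; valid values are listed in the
# Performance Analytics API documentation linked in the docstring:
#
#   rubicon_dict = {
#       'start': '2019-01-01T00:00:00-05:00',
#       'end': '2019-01-31T23:59:59-05:00',
#       'dimensions': ['date', 'site'],             # hypothetical
#       'metrics': ['paid_impression', 'revenue'],  # hypothetical
#       'filters': ['dimension:site==12345']        # hypothetical
#   }
#   df = get_data_from_rubicon(rubicon_dict, currency='USD')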
def query_athena(query, filename=None):
    if not input_check(query, [str]):
        return return_on_exception(filename)
    if not input_check(filename, [str, type(None)]):
        return return_on_exception(filename)
    if filename == '':
        print('Filename cannot be empty')
        return return_on_exception(filename)
    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError) as e:
        print('No credentials were provided. Error message:')
        print(e)
        return return_on_exception(filename)
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )
    athena = session.client('athena', region_name=region)
    s3 = session.resource('s3')
    # Athena needs a full s3:// URI for its output location, while the later
    # download uses the bare bucket name.
    if not s3_bucket.startswith('s3://'):
        output_s3_bucket = 's3://' + s3_bucket
    else:
        output_s3_bucket = s3_bucket
    s3_bucket = s3_bucket.replace('s3://', '')
    try:
        result = athena.start_query_execution(
            QueryString=query,
            ResultConfiguration={
                'OutputLocation': output_s3_bucket,
            }
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'InvalidRequestException':
            print('Please check your query. Error message:')
        else:
            print('Please check your credentials including s3_bucket in config.ini file. Error message:')
        print(e)
        return return_on_exception(filename)
    except EndpointConnectionError as e:
        print('Please check your credentials including aws_region in config.ini file and Internet connection.',
              'Error message:')
        print(e)
        return return_on_exception(filename)
    query_id = result['QueryExecutionId']
    result = poll_status(athena, query_id)
    if result is None:
        return return_on_exception(filename)
    elif result['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        s3_key = query_id + '.csv'
        return download_file(s3, s3_bucket, s3_key, filename)
    else:
        print('Query did not succeed. Reason:')
        print(result['QueryExecution']['Status']['StateChangeReason'])
        return return_on_exception(filename)
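# Usage sketch for query_athena (table and file names below are hypothetical;
# query results are staged in the S3 bucket configured in config.ini):
#
#   df = query_athena('SELECT * FROM example_db.example_table LIMIT 10')
#   df = query_athena('SELECT * FROM example_db.example_table LIMIT 10',
#                     filename='results.csv')  # also saves the data locally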
def get_data_from_admanager(query, dimensions, columns, start_date, end_date,
                            custom_field_id=None, network_code=None):
    if not custom_field_id:
        custom_field_id = []
    if not network_code:
        try:
            network_code = config.get_value('google_ad_manager', 'network_code')
        except (KeyError, NoOptionError):
            print('No network code was provided')
            return pd.DataFrame([])
    yaml_string = "ad_manager: " + "\n" + \
                  "  application_name: " + APPLICATION_NAME + "\n" + \
                  "  network_code: " + str(network_code) + "\n" + \
                  "  path_to_private_key_file: " + KEY_FILE + "\n"
    # Initialize the GAM client.
    gam_client = ad_manager.AdManagerClient.LoadFromString(yaml_string)
    # Create statement object to filter for an order.
    filter_statement = {'query': query}
    # Create report job.
    report_job = {
        'reportQuery': {
            'dimensions': dimensions,
            'statement': filter_statement,
            'columns': columns,
            'customFieldIds': custom_field_id,
            'dateRangeType': 'CUSTOM_DATE',
            'startDate': start_date,
            'endDate': end_date,
            'adUnitView': "FLAT"
        }
    }
    report_downloader = gam_client.GetDataDownloader()
    try:
        # Run the report and wait for it to finish.
        report_job_id = report_downloader.WaitForReport(report_job)
    except errors.GoogleAdsServerFault as e:
        if 'AuthenticationError.NETWORK_NOT_FOUND' in str(e):
            print('Provided network code was not found.')
        elif 'AuthenticationError.NETWORK_CODE_REQUIRED' in str(e):
            print('Default value of network code is missing from ',
                  config.default_config_filepath)
        else:
            print('Failed to generate report. Error was: {}'.format(e))
        # Return an empty DataFrame for consistency with the other error paths.
        return pd.DataFrame([])
    except errors.AdManagerReportError as e:
        print('Failed to generate report. Error was: {}'.format(e))
        return pd.DataFrame([])
    export_format = 'CSV_DUMP'
    report_file = tempfile.NamedTemporaryFile(suffix='.csv.gz', delete=False)
    # Download report data.
    report_downloader.DownloadReportToFile(
        report_job_id, export_format, report_file, use_gzip_compression=True)
    report_file.close()
    # Display results.
    print('Report job with id {} downloaded to:\n{}'.format(
        report_job_id, report_file.name))
    with gzip.open(report_file.name) as file:
        data = pd.read_csv(file)
    return data
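# A hedged usage sketch for get_data_from_admanager. The PQL filter,
# dimensions and columns below are standard GAM report values but should be
# treated as illustrative; dates use the {'year', 'month', 'day'} dict format
# expected by the GAM reporting API:
#
#   start_date = {'year': 2019, 'month': 1, 'day': 1}
#   end_date = {'year': 2019, 'month': 1, 'day': 31}
#   df = get_data_from_admanager("WHERE ORDER_ID = 12345",  # hypothetical id
#                                dimensions=['DATE', 'AD_UNIT_NAME'],
#                                columns=['AD_SERVER_IMPRESSIONS'],
#                                start_date=start_date, end_date=end_date)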
import gzip
import tempfile
from configparser import NoOptionError

import pandas as pd
from googleads import ad_manager, errors

import sroka.config.config as config

KEY_FILE = config.get_file_path('google_ad_manager')

# GAM API information.
try:
    APPLICATION_NAME = config.get_value('google_ad_manager', 'application_name')
except (KeyError, NoOptionError):
    APPLICATION_NAME = 'Application name'
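# The module expects a [google_ad_manager] section in config.ini along the
# lines of the sketch below; the keys follow the config.get_value calls above,
# but the values are placeholders, not real credentials:
#
#   [google_ad_manager]
#   application_name = my-application   # hypothetical
#   network_code = 12345678             # hypothetical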