Exemplo n.º 1
0
def s3_download_data(s3_filename, prefix=False, output_file=None, sep=',', skip_empty_files=True,
                     first_row_columns=True):
    """Download S3 object(s) into a pandas DataFrame.

    Args:
        s3_filename (str): full S3 path ('s3://bucket/key'); the key part is
            used either as an exact key or as a key prefix (see ``prefix``).
        prefix (bool): when True, treat the key as a prefix and download all
            matching objects.
        output_file (str): optional local path; when given, the data is also
            written there as CSV.
        sep (str): 1-character field separator.
        skip_empty_files (bool): forwarded to ``_download_data``.
        first_row_columns (bool): forwarded to ``_download_data``.

    Returns:
        pandas.DataFrame with the downloaded data, or None when ``sep`` is
        invalid (a message is printed instead).
    """
    # Guard clause: validate the separator before doing any AWS work.
    # (isinstance instead of type() ==, and an early return instead of
    # nesting the whole body under an if.)
    if not isinstance(sep, str) or len(sep) != 1:
        print('Separator must be a 1-character string')
        return

    key_id = config.get_value('aws', 'aws_access_key_id')
    access_key = config.get_value('aws', 'aws_secret_access_key')
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )

    s3 = session.resource('s3')

    # Split 's3://bucket/key...' into its bucket and key (or key prefix).
    s3_file_pattern = re.compile(r's3://([^/]+)/?(.*)')
    match = s3_file_pattern.match(s3_filename)
    bucket_name = match.group(1)
    key_prefix = match.group(2)

    data = _download_data(key_prefix, s3, bucket_name, prefix, sep, skip_empty_files,
                          first_row_columns)

    if output_file:
        data.to_csv(output_file, sep=sep)

    return data
Exemplo n.º 2
0
def done_athena(query_id, filename=None):
    """Fetch the result of an already-submitted Athena query.

    Args:
        query_id (str): Athena QueryExecutionId to poll.
        filename (str): optional local path forwarded to ``download_file``.

    Returns:
        pandas.DataFrame with the query result, or an empty DataFrame on any
        failure (missing credentials, polling failure, unsuccessful query).
    """
    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])
    # download_file expects a bare bucket name, without the scheme.
    if s3_bucket.startswith('s3://'):
        s3_bucket = s3_bucket.replace('s3://', '')

    session = boto3.Session(aws_access_key_id=key_id,
                            aws_secret_access_key=access_key)

    s3 = session.resource('s3')
    athena = session.client('athena', region_name=region)
    result = poll_status(athena, query_id)
    if result is None:
        return pd.DataFrame([])
    if result['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        # Athena writes its result CSV under the configured output location;
        # the urlparse path (minus the leading '/') is the S3 key.
        s3_key = urlparse(result['QueryExecution']['ResultConfiguration']
                          ['OutputLocation']).path[1:]
        return download_file(s3, s3_bucket, s3_key, filename)
    else:
        print('Query did not succeed')
        # Bug fix: previously fell off the end and returned None here,
        # unlike every other exit path which returns an empty DataFrame.
        return pd.DataFrame([])
Exemplo n.º 3
0
def get_data_from_moat(moat_dict, database_name):
    """
    Function that downloads data from MOAT through API to pandas DataFrame.

    Args:
        moat_dict (dict):  dictionary with keys: 'start' (str : str) (start date of analysis 'YYYYMMDD') - obligatory,
                                                 'end' (str : str) (end date of analysis 'YYYYMMDD') - obligatory,
                                                 'columns' (str : list of str) (metrics in list) - obligatory,
                                                 'level1' (str : str) (company specific) - optional,
                                                 'level2' (str : str) (company specific) - optional,
                                                 'level3' (str : str) (company specific) - optional,
                                                 'level4' (str : str) (company specific) - optional
        database_name (str): name of db. Values (names of db and id provided by MOAT) need to be defined in config file

    Returns:
        pandas DataFrame

    Full documentation of MOAT API is available at http://api.moat.com/docs.
    """

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    if not validate_input_dict(moat_dict):
        return pd.DataFrame([])

    try:
        token = config.get_value('moat', 'token')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])

    try:
        db_id = config.get_value('moat_db', database_name)
    except (KeyError, NoOptionError):
        print('Such database name is not available. Please check config file')
        return pd.DataFrame([])

    # Bug fix: work on a copy so the caller's dict is not mutated
    # (previously 'columns' was joined in place and 'brandId' injected).
    request_fields = dict(moat_dict)
    request_fields['columns'] = ','.join(request_fields['columns'])
    request_fields['brandId'] = db_id

    http = urllib3.PoolManager()
    auth_header = 'Bearer {}'.format(token)
    resp = http.request('GET',
                        'https://api.moat.com/1/stats.json',
                        fields=request_fields,
                        headers={'Authorization': auth_header})

    data = json.loads(resp.data)

    if 'error' in data:
        print('Error: ' + data['error'])
        return pd.DataFrame([])

    if data['results']['details'] == [[]]:
        print('Data returned is empty')
        return pd.DataFrame([])

    df = pd.DataFrame(data['results']['details'])
    return df
Exemplo n.º 4
0
def query_athena(query, filename=None):
    """Submit an Athena query, wait for it, and return the result.

    Args:
        query (str): SQL to execute.
        filename (str): optional local path forwarded to ``download_file``.

    Returns:
        pandas.DataFrame with the query result, or an empty DataFrame on any
        failure (bad credentials, polling failure, unsuccessful query).
    """
    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])

    session = boto3.Session(aws_access_key_id=key_id,
                            aws_secret_access_key=access_key)

    athena = session.client('athena', region_name=region)
    s3 = session.resource('s3')
    # Athena needs the 's3://' form for its output location, while
    # download_file needs the bare bucket name — keep both.
    if not s3_bucket.startswith('s3://'):
        output_s3_bucket = 's3://' + s3_bucket
    else:
        output_s3_bucket = s3_bucket
        s3_bucket = s3_bucket.replace('s3://', '')
    try:
        result = athena.start_query_execution(QueryString=query,
                                              ResultConfiguration={
                                                  'OutputLocation':
                                                  output_s3_bucket,
                                              })
    except ClientError:
        print(
            'Please check your credentials including s3_bucket in config.ini file'
        )
        return pd.DataFrame([])
    except EndpointConnectionError:
        print(
            'Please check your credentials including aws_region in config.ini file'
        )
        return pd.DataFrame([])
    query_id = result['QueryExecutionId']
    result = poll_status(athena, query_id)
    # Bug fix: poll_status can return None (see done_athena); previously this
    # crashed with TypeError on the subscript below.
    if result is None:
        return pd.DataFrame([])
    if result['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        # Athena names the result object '<query id>.csv' in the bucket.
        s3_key = query_id + '.csv'
        return download_file(s3, s3_bucket, s3_key, filename)
    else:
        print('Query did not succeed')
        # Bug fix: return an empty DataFrame instead of an implicit None.
        return pd.DataFrame([])
Exemplo n.º 5
0
def s3_download_data(s3_filename, prefix=False, output_file=None, sep=','):
    """Fetch S3 object(s) into a pandas DataFrame, optionally saving a CSV copy.

    Args:
        s3_filename (str): full S3 path ('s3://bucket/key').
        prefix (bool): when True, treat the key part as a key prefix.
        output_file (str): optional local path to also write the data as CSV.
        sep (str): field separator used for the CSV output.

    Returns:
        pandas.DataFrame with the downloaded data.
    """
    # Build an AWS session from credentials stored in the config file.
    credentials = {
        'aws_access_key_id': config.get_value('aws', 'aws_access_key_id'),
        'aws_secret_access_key': config.get_value('aws', 'aws_secret_access_key'),
    }
    s3 = boto3.Session(**credentials).resource('s3')

    # Break 's3://bucket/key...' into its bucket and key components.
    bucket_name, key_prefix = re.match(r's3://([^/]+)/?(.*)', s3_filename).groups()

    data = _download_data(key_prefix, s3, bucket_name, prefix, sep)

    if output_file:
        data.to_csv(output_file, sep=sep)

    return data
Exemplo n.º 6
0
def done_athena(query_id, filename=None):
    """Collect the result of an already-submitted Athena query.

    Args:
        query_id (str): Athena QueryExecutionId to poll.
        filename (str): optional local path forwarded to ``download_file``.

    Returns:
        Whatever ``download_file`` returns on success; otherwise the value of
        ``return_on_exception(filename)``.
    """
    # Validate argument types before touching AWS.
    for value, allowed_types in ((query_id, [str]), (filename, [str, type(None)])):
        if not input_check(value, allowed_types):
            return return_on_exception(filename)

    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError) as e:
        print('All or part of credentials were not provided. Please verify config.ini file. Error message:')
        print(e)
        return return_on_exception(filename)

    # download_file expects the bare bucket name, without the scheme.
    if s3_bucket.startswith('s3://'):
        s3_bucket = s3_bucket.replace('s3://', '')

    session = boto3.Session(aws_access_key_id=key_id,
                            aws_secret_access_key=access_key)
    s3 = session.resource('s3')
    athena = session.client('athena', region_name=region)

    result = poll_status(athena, query_id)
    if result is None:
        return return_on_exception(filename)

    status = result['QueryExecution']['Status']
    if status['State'] != 'SUCCEEDED':
        print('Query did not succeed. Reason:')
        print(status['StateChangeReason'])
        return return_on_exception(filename)

    # The result CSV's S3 key is the output location's path minus the
    # leading '/'.
    output_location = result['QueryExecution']['ResultConfiguration']['OutputLocation']
    s3_key = urlparse(output_location).path[1:]
    return download_file(s3, s3_bucket, s3_key, filename)
Exemplo n.º 7
0
def s3_upload_data(data, bucket, path, sep=','):
    """Upload a pandas DataFrame or numpy array to S3 as CSV text.

    Args:
        data (pandas.DataFrame | numpy.ndarray): data to serialize and upload.
        bucket (str): target S3 bucket name.
        path (str): object key inside the bucket.
        sep (str): 1-character field separator for the CSV serialization.

    Returns:
        None. Success or the reason for failure is printed.
    """
    # Guard clauses (previously a deeply nested if/else pyramid, with
    # type(x) == T checks instead of isinstance).
    if not isinstance(sep, str) or len(sep) != 1:
        print('Separator must be a 1-character string')
        return

    if not isinstance(data, (pd.DataFrame, np.ndarray)):
        print('Uploaded file must be pandas DataFrame or numpy array and not {}'.format(type(data)))
        return

    key_id = config.get_value('aws', 'aws_access_key_id')
    access_key = config.get_value('aws', 'aws_secret_access_key')
    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )

    # Serialize to CSV in memory, then push the text to S3.
    csv_buffer = StringIO()
    if isinstance(data, pd.DataFrame):
        data.to_csv(csv_buffer, sep=sep)
    else:
        np.savetxt(csv_buffer, data, delimiter=sep, fmt='%s')

    s3 = session.resource('s3')
    body = csv_buffer.getvalue()

    try:
        s3.Bucket(bucket).put_object(Key=path, Body=body)
        print('Success. File saved at s3://{}/{}'.format(bucket, path))
    except TypeError:
        print('Bucket name must be a string')
    except ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchBucket':
            print('The specified bucket does not exist')
    except ParamValidationError as e:
        print(e)
Exemplo n.º 8
0
def init_gam_connection(network_code=None):
    """Build an authenticated Google Ad Manager client.

    Args:
        network_code: GAM network code; when falsy, it is read from the
            config file instead.

    Returns:
        An ``ad_manager.AdManagerClient``, or an empty pandas DataFrame when
        no network code could be determined.
    """
    if not network_code:
        try:
            network_code = config.get_value('google_ad_manager',
                                            'network_code')
        except (KeyError, NoOptionError):
            print('No network code was provided')
            return pd.DataFrame([])

    # The GAM client is configured from an in-memory YAML document.
    yaml_lines = [
        "ad_manager: ",
        "  application_name: " + APPLICATION_NAME,
        "  network_code: " + str(network_code),
        "  path_to_private_key_file: " + KEY_FILE,
    ]
    yaml_string = "\n".join(yaml_lines) + "\n"

    # Initialize the GAM client.
    return ad_manager.AdManagerClient.LoadFromString(yaml_string)
Exemplo n.º 9
0
def get_options_from_config():
    """Read optional MySQL connection options from the configuration file.

    Only options actually present in the configuration are included, because
    passing empty values to the MySQL connector would trigger exceptions.

    Returns:
        dict: keyword options for the MySQL connector (possibly empty).
    """
    # Refactor: the original repeated an identical try/except block six
    # times; a single loop over the option names is equivalent.
    option_names = ('host', 'port', 'user', 'password', 'unix_socket', 'database')

    options = {}
    for name in option_names:
        try:
            options[name] = config.get_value('mysql', name)
        except (KeyError, NoOptionError):
            # Each value is optional — simply omit missing ones.
            pass

    return options
Exemplo n.º 10
0
def get_data_from_rubicon(rubicon_dict, currency='USD'):
    """
    Function that download data from Rubicon db through API to pandas DataFrame.

    Args:
        rubicon_dict (dict):  dictionary with keys:
            'start' (str : str) (Start date in ISO-8601 format, including time zone.) - obligatory,
            'end' (str : str) (End date in ISO-8601 format, including time zone.) - obligatory,
            'dimensions' (str : list of str) (dimensions that we want included as columns in list) - obligatory,
            'metrics' (str : list of str) (metrics that we want included as columns in list) - obligatory
            'filters' (str: list of str) (filters that we want to be included in list) - obligatory
        currency (str): currency to be used

    Returns:
        pandas DataFrame

    Full documentation (additional features) of Rubicon API is available at:
        https://resources.rubiconproject.com/resource/publisher-resources/performance-analytics-api/
        (you need to be logged in with provided credentials)
    """

    try:
        username = config.get_value('rubicon', 'username')
        password = config.get_value('rubicon', 'password')
        rubicon_id = config.get_value('rubicon', 'id')
    except (KeyError, NoOptionError):
        print('No credentials were provided')
        return pd.DataFrame([])

    if 'start' not in rubicon_dict or 'end' not in rubicon_dict:
        print('Required fields are not set')
        return pd.DataFrame([])

    # Bug fix: build the query parameters on a copy so the caller's dict is
    # not mutated (previously lists were joined in place and 'currency'
    # injected).
    params = dict(rubicon_dict)
    params['currency'] = currency

    try:
        if not params['dimensions'] or not params[
                'metrics'] or not params['filters']:
            print('Required fields are empty')
            return pd.DataFrame([])
        params['dimensions'] = ','.join(params['dimensions'])
        params['metrics'] = ','.join(params['metrics'])
        params['filters'] = ';'.join(params['filters'])
    except KeyError:
        print('Required fields are not set')
        return pd.DataFrame([])

    url = urllib.parse.unquote(
        'https://api.rubiconproject.com/analytics/v1/report/?account=publisher/{}&'
        .format(rubicon_id) + urllib.parse.urlencode(params))

    # HTTP basic auth with the configured credentials; ask for CSV output.
    p = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    p.add_password(None, url, username, password)

    auth_handler = urllib.request.HTTPBasicAuthHandler(p)

    opener = urllib.request.build_opener(auth_handler)

    opener.addheaders = [('Accept', 'text/csv')]

    urllib.request.install_opener(opener)

    try:
        result = opener.open(url)

    except IOError as e:
        print('Something went wrong, please see error message:')
        print(e)
        return pd.DataFrame([])

    result_data = result.read().decode()
    df = pd.read_csv(StringIO(result_data))
    return df
Exemplo n.º 11
0
def query_athena(query, filename=None):
    """Submit an Athena query, wait for completion, and return its result.

    Args:
        query (str): SQL to execute.
        filename (str): optional local path forwarded to ``download_file``;
            must not be an empty string.

    Returns:
        The value of ``download_file(...)`` on success, otherwise the value
        of ``return_on_exception(filename)``.
    """

    # Validate argument types before doing any AWS work.
    if not input_check(query, [str]):
        return return_on_exception(filename)

    if not input_check(filename, [str, type(None)]):
        return return_on_exception(filename)

    if filename == '':
        print('Filename cannot be empty')
        return return_on_exception(filename)

    try:
        s3_bucket = config.get_value('aws', 's3bucket_name')
        key_id = config.get_value('aws', 'aws_access_key_id')
        access_key = config.get_value('aws', 'aws_secret_access_key')
        region = config.get_value('aws', 'aws_region')
    except (KeyError, NoOptionError) as e:
        print('No credentials were provided. Error message:')
        print(e)
        return return_on_exception(filename)

    session = boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=access_key
    )

    athena = session.client('athena',
                            region_name=region)
    s3 = session.resource('s3')
    # Athena's OutputLocation needs the 's3://' form while download_file
    # needs the bare bucket name — keep both variants.
    if not s3_bucket.startswith('s3://'):
        output_s3_bucket = 's3://' + s3_bucket
    else:
        output_s3_bucket = s3_bucket
        s3_bucket = s3_bucket.replace('s3://', '')
    try:
        result = athena.start_query_execution(
            QueryString=query,
            ResultConfiguration={
                'OutputLocation': output_s3_bucket,
            }
        )
    except ClientError as e:
        # InvalidRequestException indicates a malformed query rather than a
        # credentials problem, so the hint differs.
        if e.response['Error']['Code'] == 'InvalidRequestException':
            print("Please check your query. Error message:")
        else:
            print('Please check your credentials including s3_bucket in config.ini file. Error message:')
        print(e)
        return return_on_exception(filename)

    except EndpointConnectionError as e:
        print('Please check your credentials including aws_region in config.ini file and Internet connection.',
              'Error message:')
        print(e)
        return return_on_exception(filename)

    query_id = result['QueryExecutionId']
    # Block until the query reaches a terminal state (or polling gives up).
    result = poll_status(athena, query_id)
    if result is None:
        return return_on_exception(filename)

    elif result['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        # Athena names the result object '<query id>.csv' in the bucket.
        s3_key = query_id + '.csv'
        return download_file(s3, s3_bucket, s3_key, filename)
    else:
        print('Query did not succeed. Reason:')
        print(result['QueryExecution']['Status']['StateChangeReason'])
        return return_on_exception(filename)
Exemplo n.º 12
0
def get_data_from_admanager(query, dimensions, columns, start_date, end_date, custom_field_id=None, network_code=None):
    """Run a Google Ad Manager report and return it as a pandas DataFrame.

    Args:
        query (str): PQL WHERE-clause filter for the report statement.
        dimensions (list): report dimensions.
        columns (list): report metric columns.
        start_date / end_date: report date range (CUSTOM_DATE).
        custom_field_id (list): optional custom field ids to include.
        network_code: GAM network code; when falsy, read from the config file.

    Returns:
        pandas.DataFrame with the report data, or an empty DataFrame on any
        failure.
    """

    if not custom_field_id:
        custom_field_id = []

    if not network_code:
        try:
            network_code = config.get_value('google_ad_manager', 'network_code')
        except (KeyError, NoOptionError):
            print('No network code was provided')
            return pd.DataFrame([])

    yaml_string = "ad_manager: " + "\n" + \
        "  application_name: " + APPLICATION_NAME + "\n" + \
        "  network_code: " + str(network_code) + "\n" + \
        "  path_to_private_key_file: " + KEY_FILE + "\n"

    # Initialize the GAM client.
    gam_client = ad_manager.AdManagerClient.LoadFromString(yaml_string)

    # Create statement object to filter for an order.

    filter_statement = {'query': query}

    # Create report job.
    report_job = {
      'reportQuery': {
          'dimensions': dimensions,
          'statement': filter_statement,
          'columns': columns,
          'customFieldIds': custom_field_id,
          'dateRangeType': 'CUSTOM_DATE',
          'startDate': start_date,
          'endDate': end_date,
          'adUnitView': "FLAT"
      }
    }

    report_downloader = gam_client.GetDataDownloader()

    try:
        # Run the report and wait for it to finish.
        report_job_id = report_downloader.WaitForReport(report_job)
    except errors.GoogleAdsServerFault as e:
        if 'AuthenticationError.NETWORK_NOT_FOUND' in str(e):
            print('Provided network code was not found.')
        elif 'AuthenticationError.NETWORK_CODE_REQUIRED' in str(e):
            print('Default value of network code is missing from ', config.default_config_filepath)
        else:
            print('Failed to generate report. Error was: {}'.format(e))
        # Bug fix: return an empty DataFrame (not bare None) so callers get a
        # consistent return type on every failure path.
        return pd.DataFrame([])

    except errors.AdManagerReportError as e:
        print('Failed to generate report. Error was: {}'.format(e))
        return pd.DataFrame([])

    export_format = 'CSV_DUMP'

    # NOTE(review): delete=False means this temp file is never removed —
    # consider cleaning it up after reading.
    report_file = tempfile.NamedTemporaryFile(suffix='.csv.gz', delete=False)

    # Download report data.
    report_downloader.DownloadReportToFile(
        report_job_id, export_format, report_file, use_gzip_compression=True)

    report_file.close()

    # Display results.
    print('Report job with id {} downloaded to:\n{}'.format(
        report_job_id, report_file.name))
    with gzip.open(report_file.name) as file:
        data = pd.read_csv(file)
    return data
Exemplo n.º 13
0
import gzip
import tempfile
from configparser import NoOptionError

import pandas as pd
from googleads import ad_manager, errors

import sroka.config.config as config

# Path to the Google Ad Manager private-key file, resolved via the config module.
KEY_FILE = config.get_file_path('google_ad_manager')

# GAM API information.
try:
    APPLICATION_NAME = config.get_value('google_ad_manager', 'application_name')
except (KeyError, NoOptionError):
    # Fall back to a placeholder when no application name is configured.
    APPLICATION_NAME = 'Application name'


def get_data_from_admanager(query, dimensions, columns, start_date, end_date, custom_field_id=None, network_code=None):

    if not custom_field_id:
        custom_field_id = []

    if not network_code:
        try:
            network_code = config.get_value('google_ad_manager', 'network_code')
        except (KeyError, NoOptionError):
            print('No network code was provided')
            return pd.DataFrame([])

    yaml_string = "ad_manager: " + "\n" + \