Пример #1
0
import time

import requests
from datapackage_pipelines.wrapper import ingest, spew

from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

# ingest() (from datapackage_pipelines.wrapper) returns three values, bound
# here as `parameters`, `datapackage` and `res_iter`.
# NOTE(review): presumably the step parameters, the datapackage descriptor
# and an iterator over resources - confirm against the wrapper's docs.
parameters, datapackage, res_iter = ingest()

# 30 authenticated requests per minute, so wait 3 secs (or use
# GITHUB_REQUEST_WAIT_INTERVAL env var) before each request
# (https://developer.github.com/v3/search/#rate-limit)
# The value is read from `settings` (default 3) and coerced to int.
REQUEST_WAIT_INTERVAL = int(settings.get('GITHUB_REQUEST_WAIT_INTERVAL', 3))


def _make_github_request(url):
    """Send an authenticated GET request to ``url`` and decode the body.

    The request carries an ``Authorization: token ...`` header built from
    ``settings['GITHUB_API_TOKEN']``. If the body is not valid JSON, an
    error is logged and the decoding exception is re-raised. If the status
    code is not 200, an error is logged.

    :param url: the URL to request.
    :raises KeyError: if ``GITHUB_API_TOKEN`` is missing from ``settings``.
    :raises ValueError: if the response body cannot be decoded as JSON.
    """
    headers = {
        'Authorization': 'token {}'.format(settings['GITHUB_API_TOKEN'])
    }
    response = requests.get(url, headers=headers)

    # Only the decode step is guarded. Every JSON decode error raised by
    # Response.json() (stdlib json, simplejson, or requests' own wrapper)
    # derives from ValueError, so no reference to the un-imported
    # `simplejson` module is needed here.
    try:
        json_response = response.json()
    except ValueError:
        log.error('Expected JSON in response from: {}'.format(url))
        raise

    if response.status_code != 200:
        log.error('Response from Github not successful')
    # NOTE(review): `json_response` is never returned in the visible code;
    # the function looks truncated here - confirm against the full source.
Пример #2
0
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    """Append the forum-categories pipeline steps to ``steps``.

    Steps are added in this order: ``measure.datastore_get_latest``, one
    ``measure.add_discourse_category_resource`` per configured
    domain/category pair, ``measure.remove_resource``, ``concatenate``,
    ``set_types``, ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: must hold ``discourse-categories``, an iterable of
        mappings with ``domain`` and ``categories`` keys.
    :return: the same ``steps`` list.
    """
    latest_entries = 'latest-project-entries'
    resource_name = 'forum-categories'
    table_name = 'forum_categories'

    # Column order is kept separately for each step on purpose.
    concat_columns = (
        'domain', 'category', 'new_topics', 'new_posts', 'source', 'date',
    )
    typed_columns = (
        ('domain', 'string'),
        ('category', 'string'),
        ('source', 'string'),
        ('new_topics', 'integer'),
        ('new_posts', 'integer'),
        ('date', 'date'),
    )

    steps.append(('measure.datastore_get_latest', {
        'resource-name': latest_entries,
        'table': table_name,
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'domain', 'source', 'category'],
    }))

    # One resource step per (domain, category) pair.
    steps.extend(
        ('measure.add_discourse_category_resource', {
            'category': category_name,
            'domain': domain_entry['domain'],
        })
        for domain_entry in config['discourse-categories']
        for category_name in domain_entry['categories']
    )

    steps.append(('measure.remove_resource', {'name': latest_entries}))

    steps.append(('concatenate', {
        'target': {
            'name': resource_name,
            'path': 'data/forum-categories.json',
        },
        'fields': {column: [] for column in concat_columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in typed_columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['domain', 'category', 'source',
                        'project_id', 'date'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': {table_name: sql_table},
    }))

    return steps
Пример #3
0
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Append the website-analytics pipeline steps to ``steps``.

    Steps are added in this order: ``measure.datastore_get_latest``, one
    ``measure.add_ga_resource`` per domain under ``config['ga']`` (if that
    key exists), ``measure.remove_resource``, ``concatenate``,
    ``set_types``, ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: may hold ``ga`` with a ``domains`` iterable.
    :return: the same ``steps`` list.
    """
    latest_entries = 'latest-project-entries'
    resource_name = 'website-analytics'
    table_name = 'websiteanalytics'

    concat_columns = (
        'domain', 'page_path', 'visitors', 'unique_visitors',
        'avg_time_spent', 'source', 'date',
    )
    typed_columns = (
        ('domain', 'string'),
        ('page_path', 'string'),
        ('visitors', 'integer'),
        ('unique_visitors', 'integer'),
        ('avg_time_spent', 'number'),
        ('date', 'date'),
    )

    steps.append(('measure.datastore_get_latest', {
        'resource-name': latest_entries,
        'table': table_name,
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'domain', 'source'],
    }))

    # Google Analytics resources are optional.
    ga_domains = config['ga']['domains'] if 'ga' in config else ()
    steps.extend(
        ('measure.add_ga_resource', {'domain': ga_domain})
        for ga_domain in ga_domains
    )

    steps.append(('measure.remove_resource', {'name': latest_entries}))

    steps.append(('concatenate', {
        'target': {
            'name': resource_name,
            'path': 'data/website-analytics.json',
        },
        'fields': {column: [] for column in concat_columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in typed_columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['domain', 'page_path', 'source',
                        'project_id', 'date'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': {table_name: sql_table},
    }))

    return steps
Пример #4
0
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Append the email pipeline steps to ``steps``.

    Steps are added in this order: ``measure.datastore_get_latest``, one
    ``measure.add_mailchimp_resource`` per list under
    ``config['mailchimp']`` (if that key exists),
    ``measure.remove_resource``, ``concatenate``, ``set_types``,
    ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: may hold ``mailchimp`` with a ``lists`` iterable.
    :return: the same ``steps`` list.
    """
    latest_entries = 'latest-project-entries'
    resource_name = 'email'

    # Concatenated columns and their types share the same order here.
    columns = (
        ('source', 'string'),
        ('list_id', 'string'),
        ('date', 'date'),
        ('subscribers', 'integer'),
        ('subs', 'integer'),
        ('unsubs', 'integer'),
        ('campaigns_sent', 'integer'),
    )

    steps.append(('measure.datastore_get_latest', {
        'resource-name': latest_entries,
        'table': 'email',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'list_id'],
    }))

    # MailChimp resources are optional.
    mailchimp_lists = (
        config['mailchimp']['lists'] if 'mailchimp' in config else ()
    )
    steps.extend(
        ('measure.add_mailchimp_resource', {'list_id': mailchimp_list})
        for mailchimp_list in mailchimp_lists
    )

    steps.append(('measure.remove_resource', {'name': latest_entries}))

    steps.append(('concatenate', {
        'target': {'name': resource_name, 'path': 'data/email.csv'},
        'fields': {column: [] for column, _ in columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['date', 'source', 'list_id', 'project_id'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {'email': sql_table},
    }))

    return steps
Пример #5
0
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Append the code-packaging pipeline steps to ``steps``.

    Steps are added in this order: ``measure.datastore_get_latest``, one
    resource step per configured npm, PyPI, RubyGems and Packagist package
    (each source is optional), ``measure.remove_resource``,
    ``concatenate``, ``set_types``, ``measure.add_project_name``,
    ``measure.add_timestamp``, ``measure.add_uuid``, optionally
    ``dump.to_path`` (when the ``DEVELOPMENT`` setting is truthy) and
    ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: may hold ``npm``/``pypi``/``packagist`` (each with
        ``packages``) and ``rubygems`` (with ``gems``).
    :return: the same ``steps`` list.
    """
    latest_entries = 'latest-project-entries'
    resource_name = 'code-packaging'
    table_name = 'codepackaging'

    concat_columns = (
        'date', 'downloads', 'total_downloads', 'source', 'package',
    )
    typed_columns = (
        ('downloads', 'integer'),
        ('total_downloads', 'integer'),
        ('source', 'string'),
        ('date', 'date'),
        ('package', 'string'),
    )

    steps.append(('measure.datastore_get_latest', {
        'resource-name': latest_entries,
        'table': table_name,
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'package', 'source'],
    }))

    # npm and PyPI package names are slugified; the others are passed as-is.
    if 'npm' in config:
        steps.extend(
            ('measure.add_npm_resource', {'package': slugify(npm_name)})
            for npm_name in config['npm']['packages']
        )

    if 'pypi' in config:
        steps.extend(
            ('measure.add_pypi_resource', {'package': slugify(pypi_name)})
            for pypi_name in config['pypi']['packages']
        )

    if 'rubygems' in config:
        steps.extend(
            ('measure.add_rubygems_resource', {'gem_id': gem_name})
            for gem_name in config['rubygems']['gems']
        )

    if 'packagist' in config:
        steps.extend(
            ('measure.add_packagist_resource', {'package': packagist_name})
            for packagist_name in config['packagist']['packages']
        )

    steps.append(('measure.remove_resource', {'name': latest_entries}))

    steps.append(('concatenate', {
        'target': {
            'name': resource_name,
            'path': 'data/code-packaging.csv',
        },
        'fields': {column: [] for column in concat_columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in typed_columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['project_id', 'date', 'package', 'source'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {table_name: sql_table},
    }))

    return steps
Пример #6
0
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    """Append the code-hosting pipeline steps to ``steps``.

    Steps are added in this order: one ``measure.add_github_resource`` per
    repository in ``config['github']['repositories']``, ``concatenate``
    (sourced from the slugified repository names), ``set_types``,
    ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: must hold ``github`` with a ``repositories`` iterable.
    :return: the same ``steps`` list.
    :raises KeyError: if ``config`` lacks ``github``/``repositories`` or
        ``settings`` lacks ``DB_ENGINE``.
    """
    resource_name = 'code-hosting'

    concat_columns = ('repository', 'watchers', 'stars', 'source', 'date')
    typed_columns = (
        ('repository', 'string'),
        ('watchers', 'integer'),
        ('stars', 'integer'),
        ('date', 'date'),
    )

    for repo_name in config['github']['repositories']:
        # A fresh parameters dict (including map_fields) per repository.
        github_params = {
            'name': slugify(repo_name),
            'repo': repo_name,
            'map_fields': {
                'repository': 'name',
                'watchers': 'subscribers_count',
                'stars': 'stargazers_count',
            },
        }
        steps.append(('measure.add_github_resource', github_params))

    steps.append(('concatenate', {
        'sources': list(map(slugify, config['github']['repositories'])),
        'target': {
            'name': resource_name,
            'path': 'data/code-hosting.json',
        },
        'fields': {column: [] for column in concat_columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in typed_columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['repository', 'source', 'project_id', 'date'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': {'codehosting': sql_table},
    }))

    return steps
Пример #7
0
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Append the social-media pipeline steps to ``steps``.

    Steps are added in this order: one ``measure.add_twitter_resource`` per
    entity under ``config['twitter']`` and one
    ``measure.add_facebook_resource`` per page under ``config['facebook']``
    (both optional), ``concatenate``, ``set_types``,
    ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to the resource steps and to
        ``measure.add_project_name``.
    :param config: may hold ``twitter`` (with ``entities``) and
        ``facebook`` (with ``pages``).
    :return: the same ``steps`` list.
    """
    resource_name = 'social-media'

    # Concatenated columns and their types share the same order here.
    columns = (
        ('entity', 'string'),
        ('entity_type', 'string'),
        ('source', 'string'),
        ('date', 'date'),
        ('followers', 'integer'),
        ('mentions', 'integer'),
        ('interactions', 'integer'),
        ('impressions', 'integer'),
    )

    if 'twitter' in config:
        steps.extend(
            ('measure.add_twitter_resource', {
                'entity': twitter_entity,
                'project_id': project_id,
            })
            for twitter_entity in config['twitter']['entities']
        )

    if 'facebook' in config:
        steps.extend(
            ('measure.add_facebook_resource', {
                'entity': facebook_page,
                'project_id': project_id,
            })
            for facebook_page in config['facebook']['pages']
        )

    steps.append(('concatenate', {
        'target': {'name': resource_name, 'path': 'data/social-media.csv'},
        'fields': {column: [] for column, _ in columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['entity', 'entity_type', 'source',
                        'project_id', 'date'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {'socialmedia': sql_table},
    }))

    return steps
Пример #8
0
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    """Append the outputs pipeline steps to ``steps``.

    Steps are added in this order: ``measure.datastore_get_latest``, one
    ``measure.add_outputs_resource`` per item yielded by iterating
    ``config``, ``measure.remove_resource``, ``concatenate``,
    ``set_types``, ``measure.add_project_name``, ``measure.add_timestamp``,
    ``measure.add_uuid``, optionally ``dump.to_path`` (when the
    ``DEVELOPMENT`` setting is truthy) and ``dump.to_sql``.

    :param steps: list of pipeline steps; mutated in place.
    :param pipeline_id: used to build the ``dump.to_path`` output path.
    :param project_id: passed to ``measure.add_project_name``.
    :param config: iterated directly; each item must offer ``.get`` for
        the ``sheetid``, ``gid`` and ``type`` keys.
    :return: the same ``steps`` list.
    """
    latest_entries = 'latest-project-entries'
    resource_name = 'outputs'

    concat_columns = (
        'source_id', 'source_type', 'source', 'source_timestamp',
        'source_email', 'output_title', 'output_type',
        'output_organization', 'output_person', 'output_link',
        'output_additional_information', 'output_date',
    )
    typed_columns = (
        ('source_id', 'string'),
        ('source_type', 'string'),
        ('source', 'string'),
        ('source_timestamp', 'datetime'),
        ('source_email', 'string'),
        ('output_title', 'string'),
        ('output_organization', 'string'),
        ('output_person', 'string'),
        ('output_link', 'string'),
        ('output_additional_information', 'string'),
        ('output_date', 'date'),
    )

    steps.append(('measure.datastore_get_latest', {
        'resource-name': latest_entries,
        'table': 'outputs',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'source_id'],
        'sort_date_key': 'source_timestamp',
    }))

    steps.extend(
        ('measure.add_outputs_resource', {
            'sheet_id': sheet.get('sheetid'),
            'gid': sheet.get('gid'),
            'source_type': sheet.get('type'),
        })
        for sheet in config
    )

    steps.append(('measure.remove_resource', {'name': latest_entries}))

    steps.append(('concatenate', {
        'target': {'name': resource_name, 'path': 'data/outputs.csv'},
        'fields': {column: [] for column in concat_columns},
    }))

    steps.append(('set_types', {
        'types': {column: {'type': kind} for column, kind in typed_columns},
    }))

    # The last two entries are plain strings (steps without parameters),
    # not 1-tuples.
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        dump_dir = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': dump_dir}))

    sql_table = {
        'resource-name': resource_name,
        'mode': 'update',
        'update_keys': ['project_id', 'source', 'source_timestamp',
                        'source_id'],
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {'outputs': sql_table},
    }))

    return steps