    def test_upload_new_file(self):
        """Upload data to a new file."""
        ps = PersistenceStore(s3_client=S3NewUpload())

        try:
            ps.update({}, 'bucket_name', 'filename.json')
        except Exception:
            assert False, 'Exception raised'

    def test_upload_existing_file(self):
        """Upload data in S3 with existing data."""
        ps = PersistenceStore(s3_client=S3ExistingUpload())

        try:
            ps.update({'test': 'super cool'}, 'bucket_name', 'filename.json')
        except Exception:
            assert False, 'Exception raised'

    def test_upload_no_connection(self):
        """Test no connection use case."""
        ps = PersistenceStore(s3_client=S3NotConnected())

        with pytest.raises(Exception) as e:
            ps.update({}, 'bucket_name', 'filename.json')

        assert str(e.value) == 'Unable to connect to s3.'
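
The fake clients injected above (S3NewUpload, S3ExistingUpload, S3NotConnected) are defined elsewhere and not shown in this example. A minimal sketch of test doubles in that spirit, assuming PersistenceStore only needs something it can read existing JSON from and write merged JSON to; the get_json/put_json interface below is purely hypothetical, not the real one:

import json


class S3NotConnected:
    """Hypothetical double whose calls always fail, to drive the error path."""

    def get_json(self, bucket_name, filename):
        # The real PersistenceStore presumably converts failures like this
        # into its own 'Unable to connect to s3.' exception.
        raise ConnectionError('fake client: not connected')

    def put_json(self, bucket_name, filename, data):
        raise ConnectionError('fake client: not connected')


class S3NewUpload:
    """Hypothetical double for a bucket that does not contain the file yet."""

    def __init__(self):
        self.stored = {}

    def get_json(self, bucket_name, filename):
        return {}

    def put_json(self, bucket_name, filename, data):
        self.stored[filename] = json.dumps(data)


class S3ExistingUpload(S3NewUpload):
    """Hypothetical double that already holds JSON content for the file."""

    def get_json(self, bucket_name, filename):
        return {'existing': 'data'}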
Example #4
    def __init__(self):
        """Initialize the BigQueryDataProcessing object."""
        self.big_query = Bigquery()

        self.collectors = {}
        for ecosystem in ECOSYSTEM_MANIFEST_MAP.keys():
            self.collectors[ecosystem] = self._get_collector(ecosystem)

        self.data_store = PersistenceStore()
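
ECOSYSTEM_MANIFEST_MAP is imported from elsewhere and not shown. Judging from the BigQuery filter later on this page (a Java clause paired with the maven manifest, a Python clause paired with the pypi manifest, plus an npm path), it plausibly maps each ecosystem to its manifest filename; the values below are assumptions:

# Hypothetical stand-in for the real constant, which lives in another module.
ECOSYSTEM_MANIFEST_MAP = {
    'maven': 'pom.xml',          # assumed
    'npm': 'package.json',       # assumed
    'pypi': 'requirements.txt',  # assumed
}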
Example #5
    def test_upload_existing_empty_file(self):
        """Upload new data with empty data in existing file."""
        ps = PersistenceStore(s3_client=S3ExistingEmptyUpload())

        with pytest.raises(Exception) as e:
            ps.update({}, 'filename.json')

        assert str(e.value) == 'Unable to get the json data path: ' \
                               'developer-analytics-audit-report/filename.json'

    def test_upload_existing_empty_file(self):
        """Upload new data with empty data in existing file."""
        ps = PersistenceStore(s3_client=S3ExistingEmptyUpload())

        with pytest.raises(Exception) as e:
            ps.update({}, 'bucket_name', 'filename.json')

        assert str(
            e.value
        ) == 'Unable to get the json data path:bucket_name/filename.json'

    def __init__(self):
        """Initialize the BigQueryDataProcessing object."""
        self.ecosystemBatchData = {}
        self.ecosystemContentData = {}
        self.collectors = {}
        for ecosystem in ECOSYSTEM_MANIFEST_MAP.keys():
            self.collectors[ecosystem] = self._get_collector(ecosystem)
            self.ecosystemContentData[ecosystem] = {'size': 0, 'count': 0}
            self.ecosystemBatchData[ecosystem] = {'batch_index': 1, 'size': 0}

        self.data_store = PersistenceStore()
Example #8
    def test_upload_existing_file(self):
        """Upload data in S3 with existing data."""
        ps = PersistenceStore(s3_client=S3ExistingUpload())

        try:
            new_data = {
                'maven': {
                    'pck1, pck2, pck3': 7,
                    'pck30, pck6': 20,
                    'pck2, pck4, pck7': 12
                },
                'npm': {
                    'pck1, pck2, pck3': 45,
                    'pck77': 23,
                    'pck2, pck4, pck7': 99
                },
                'pypi': {
                    'pck3, pck56': 65,
                    'pck2, pck4, pck7': 110
                }
            }
            ps.update(new_data, 'filename.json')
        except Exception:
            assert False, 'Exception raised'
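
PersistenceStore itself is not part of these examples. Going by the calls above (fetch whatever JSON already exists for the key, merge the new counts in, write the result back), a rough sketch built around the hypothetical get_json/put_json client interface shown earlier might look as follows; the real class's error handling (the 'Unable to connect to s3.' and 'Unable to get the json data path' messages asserted in the tests) is omitted, and the default arguments are placeholders:

class PersistenceStore:
    """Rough sketch only; the real implementation lives in the project."""

    def __init__(self, s3_client=None):
        # The real class presumably builds a default boto3 client when none
        # is injected; here we just keep whatever double the test passes in.
        self.s3_client = s3_client

    def update(self, data, bucket_name='example-bucket', filename='data.json'):
        """Merge `data` into the JSON document at bucket_name/filename."""
        existing = self.s3_client.get_json(bucket_name, filename) or {}
        existing.update(data)
        self.s3_client.put_json(bucket_name, filename, existing)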
class DataJob():
    """Big query data fetching and processing class."""

    def __init__(self):
        """Initialize the DataJob object."""
        self.ecosystemBatchData = {}
        self.ecosystemContentData = {}
        self.collectors = {}
        for ecosystem in ECOSYSTEM_MANIFEST_MAP.keys():
            self.collectors[ecosystem] = self._get_collector(ecosystem)
            self.ecosystemContentData[ecosystem] = {'size': 0, 'count': 0}
            self.ecosystemBatchData[ecosystem] = {'batch_index': 1, 'size': 0}

        self.data_store = PersistenceStore()

    def run(self):
        """Get big query data and update manifest data."""
        # Cleanup s3 before start, in case last run was not completed due to error.
        self._cleanup_s3()

        bq_start = time.monotonic()
        self._get_big_query_data()
        bq_end = time.monotonic()

        parse_start = time.monotonic()
        self._parse()
        parse_end = time.monotonic()

        self._cleanup_s3()

        logger.info('Ecosystem wise content data: %s',
                    self.ecosystemContentData)
        logger.info('Ecosystem wise batch information: %s',
                    self.ecosystemBatchData)
        logger.info('Big query data download took %0.2f seconds',
                    bq_end - bq_start)
        logger.info('Data parsing took %0.2f seconds', parse_end - parse_start)

    def _parse(self):
        """Parse all ecosystem data."""
        s3_objects = self.data_store.list_bucket_objects(prefix=S3_TEMP_FOLDER)
        index = 0
        for s3_object in s3_objects:
            object_key = s3_object.key
            logger.info('Parsing S3 object %s', object_key)

            # Skip folder objects and other files that are not .zip
            if not object_key.endswith('.zip'):
                continue

            # Extract ecosystem
            ecosystem = None
            if object_key.startswith('{}/npm'.format(S3_TEMP_FOLDER)):
                ecosystem = 'npm'
            elif object_key.startswith('{}/maven'.format(S3_TEMP_FOLDER)):
                ecosystem = 'maven'
            elif object_key.startswith('{}/pypi'.format(S3_TEMP_FOLDER)):
                ecosystem = 'pypi'

            if not ecosystem:
                logger.warning(
                    'Could not find ecosystem for given object_key %s',
                    object_key)
                continue

            index += 1

            # Create unzip directory
            unzip_dir = '{}/{}_unzip_dir/'.format(
                SETTINGS.local_working_directory, index)
            if not os.path.exists(unzip_dir):
                os.makedirs(unzip_dir)

            # Download zip content
            download_zip_path = '{}{}_downloaded.zip'.format(unzip_dir, index)
            self.data_store.download_file(object_key, download_zip_path)

            # Extract zip content
            unpack_archive(download_zip_path, unzip_dir, 'zip')

            # Loop through extract manifest files
            manifest_dir_path = '{}{}/'.format(unzip_dir, ecosystem)
            manifest_files = [
                manifest_dir_path + f for f in os.listdir(manifest_dir_path)
            ]

            for manifest_file in manifest_files:
                if manifest_file.endswith(ECOSYSTEM_MANIFEST_MAP[ecosystem]):
                    with open(manifest_file, 'r') as fp:
                        content = fp.read()
                        logger.info('%d. Parsing file: %s', index,
                                    manifest_file)
                        self.collectors[ecosystem].parse_and_collect(
                            content, True)
                else:
                    logger.warning('Skipping non-manifest file %s',
                                   manifest_file)

            rmtree(unzip_dir)
            logger.debug('Removed local unzip dir %s', unzip_dir)

        self._update_s3()

    def _get_big_query_data(self):
        """Process Bigquery response data."""
        big_query = Bigquery()

        start = time.monotonic()
        index = 0

        # Create local structure to store content
        for _ecosystem, _ in ECOSYSTEM_MANIFEST_MAP.items():
            dir_path = '{}/{}/'.format(SETTINGS.local_working_directory, _ecosystem)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
                logger.debug('Created dir %s', dir_path)

        big_query.run(self._get_big_query())
        for row in big_query.get_result():
            index += 1

            path = row.get('path', None)
            content = row.get('content', None)

            if not path or not content:
                logger.warning('Either path %s or content %s is null', path,
                               content)
                continue

            ecosystem = None
            for _ecosystem, manifest in ECOSYSTEM_MANIFEST_MAP.items():
                if path.endswith(manifest):
                    ecosystem = _ecosystem

            if not ecosystem:
                logger.warning('Could not find ecosystem for given path %s',
                               path)
                continue

            if index % 1000 == 0:
                logger.info('[%d] Time lapsed: %f Processing path: %s', index,
                            time.monotonic() - start, path)

            contentSize = len(content)
            self.ecosystemContentData[ecosystem]['size'] += contentSize
            self.ecosystemContentData[ecosystem]['count'] += 1

            filename = '{}/{}/{}_{}'.format(
                SETTINGS.local_working_directory, ecosystem,
                self.ecosystemContentData[ecosystem]['count'],
                path.split('/')[-1])
            with open(filename, 'w') as fp:
                fp.write(content)
            self.ecosystemBatchData[ecosystem]['size'] += contentSize

            if self.ecosystemBatchData[ecosystem]['size'] > CONTENT_BATCH_SIZE:
                self._upload_batch_data(ecosystem)

        # Finally upload incomplete batches
        for ecosystem, _ in ECOSYSTEM_MANIFEST_MAP.items():
            if self.ecosystemBatchData[ecosystem]['size'] > 0:
                self._upload_batch_data(ecosystem)

        logger.info('Processed %d manifests, ecosystem data: %s', index,
                    self.ecosystemContentData)

    def _upload_batch_data(self, ecosystem):
        """Compress and upload the current batch, then reset it locally."""
        compressFileName = '{}/{}/{}_{}'.format(
            SETTINGS.local_working_directory, ecosystem,
            self.ecosystemBatchData[ecosystem]['batch_index'], ecosystem)
        make_archive(compressFileName,
                     'zip',
                     root_dir=SETTINGS.local_working_directory,
                     base_dir=ecosystem)

        compressFileName = compressFileName + '.zip'
        filename = '{}/{}/{}_{}.zip'.format(
            S3_TEMP_FOLDER, ecosystem,
            self.ecosystemBatchData[ecosystem]['batch_index'], ecosystem)
        self.data_store.upload_file(compressFileName, filename)

        dir_path = '{}/{}/'.format(SETTINGS.local_working_directory, ecosystem)
        rmtree(dir_path)
        os.makedirs(dir_path)

        self.ecosystemBatchData[ecosystem]['batch_index'] += 1
        self.ecosystemBatchData[ecosystem]['size'] = 0

        logger.debug('Processed batch %d, starting new batch %d',
                     self.ecosystemBatchData[ecosystem]["batch_index"] - 1,
                     self.ecosystemBatchData[ecosystem]["batch_index"])

    def _cleanup_s3(self):
        """Delete the temporary S3 folder, logging any failure."""
        try:
            self.data_store.s3_delete_folder(S3_TEMP_FOLDER)
        except Exception as e:
            logger.warning('Exception :: Cleaning s3 %s throws %s',
                           S3_TEMP_FOLDER, str(e))

    def _get_big_query(self) -> str:
        """Return the BigQuery SQL for fetching manifest files."""
        return """
            SELECT con.content AS content, L.path AS path
            FROM `bigquery-public-data.github_repos.contents` AS con
            INNER JOIN (
                SELECT files.id AS id, files.path as path
                FROM `bigquery-public-data.github_repos.languages` AS langs
                INNER JOIN `bigquery-public-data.github_repos.files` AS files
                ON files.repo_name = langs.repo_name
                    WHERE (
                        (
                            REGEXP_CONTAINS(TO_JSON_STRING(language), r'(?i)java') AND
                            files.path LIKE '%/{m}'
                        ) OR
                        (
                            REGEXP_CONTAINS(TO_JSON_STRING(language), r'(?i)python') AND
                            files.path LIKE '%/{p}'
                        ) OR
                        (
                            files.path LIKE '%/{n}'
                        )
                    )
            ) AS L
            ON con.id = L.id;
        """.format(m=ECOSYSTEM_MANIFEST_MAP['maven'],
                   p=ECOSYSTEM_MANIFEST_MAP['pypi'],
                   n=ECOSYSTEM_MANIFEST_MAP['npm'])

    def _get_collector(self, ecosystem) -> BaseCollector:
        """Return the manifest collector for the given ecosystem."""
        if ecosystem == 'maven':
            return MavenCollector()

        if ecosystem == 'npm':
            return NpmCollector()

        if ecosystem == 'pypi':
            return PypiCollector()

    def _update_s3(self):
        """Collate collector counts and persist them to S3."""
        logger.info('Updating file content to S3')
        data = {}
        for ecosystem, collector in self.collectors.items():
            data[ecosystem] = dict(collector.counter.most_common())

        filename = 'big-query-data/{}'.format(
            AWS_SETTINGS.s3_collated_filename)

        self.data_store.update(data=data, filename=filename)

        logger.info('Successfully saved BigQuery data to persistence store')
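
A job like this would typically be kicked off from a small entry point; a minimal sketch, assuming the class above is importable and its settings are configured:

if __name__ == '__main__':
    # Download manifests from BigQuery in zip batches, parse them locally,
    # and persist the collated counts back to S3.
    DataJob().run()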

    def test_init_with_client(self):
        """Test init with client."""
        ps = PersistenceStore(s3_client=S3NotConnected())
        assert ps.s3_client is not None

    def test_init(self, _s3):
        """Test init without client."""
        ps = PersistenceStore(s3_client=None)
        assert ps.s3_client is not None
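
test_init above takes an `_s3` argument, which looks like a pytest fixture (or patch) that keeps PersistenceStore from opening a real S3 connection; its definition is not shown. One way such a fixture could be written, assuming the default client is built via boto3 (an assumption):

from unittest import mock

import pytest


@pytest.fixture
def _s3():
    """Hypothetical fixture: stub out boto3 so no real AWS call is made."""
    # The patch target is a guess; the real fixture may use moto or another
    # mocking strategy entirely.
    with mock.patch('boto3.resource') as fake_resource:
        yield fake_resource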
Example #12
class DataJob():
    """Big query data fetching and processing class."""

    def __init__(self):
        """Initialize the BigQueryDataProcessing object."""
        self.big_query = Bigquery()

        self.collectors = {}
        for ecosystem in ECOSYSTEM_MANIFEST_MAP.keys():
            self.collectors[ecosystem] = self._get_collector(ecosystem)

        self.data_store = PersistenceStore()

    def run(self):
        """Process Bigquery response data."""
        start = time.monotonic()
        index = 0
        logger.info('Running Bigquery synchronously')
        self.big_query.run(self._get_big_query())
        for row in self.big_query.get_result():
            index += 1

            path = row.get('path', None)
            content = row.get('content', None)

            if not path or not content:
                logger.warning('Either path %s or content %s is null', path, content)
                continue

            ecosystem = None
            for _ecosystem, manifest in ECOSYSTEM_MANIFEST_MAP.items():
                if path.endswith(manifest):
                    ecosystem = _ecosystem

            if not ecosystem:
                logger.warning('Could not find ecosystem for given path %s', path)
                continue

            self.collectors[ecosystem].parse_and_collect(content, True)

        logger.info('Processed %d manifests in time: %f', index, time.monotonic() - start)
        self._update_s3()

    def _get_big_query(self) -> str:
        """Return the BigQuery SQL for fetching manifest files."""
        return """
            SELECT con.content AS content, L.path AS path
            FROM `bigquery-public-data.github_repos.contents` AS con
            INNER JOIN (
                SELECT files.id AS id, files.path as path
                FROM `bigquery-public-data.github_repos.languages` AS langs
                INNER JOIN `bigquery-public-data.github_repos.files` AS files
                ON files.repo_name = langs.repo_name
                    WHERE (
                        (
                            REGEXP_CONTAINS(TO_JSON_STRING(language), r'(?i)java') AND
                            files.path LIKE '%{m}'
                        ) OR
                        (
                            REGEXP_CONTAINS(TO_JSON_STRING(language), r'(?i)python') AND
                            files.path LIKE '%{p}'
                        ) OR
                        (
                            files.path LIKE '%{n}'
                        )
                    )
            ) AS L
            ON con.id = L.id;
        """.format(m=ECOSYSTEM_MANIFEST_MAP['maven'],
                   p=ECOSYSTEM_MANIFEST_MAP['pypi'],
                   n=ECOSYSTEM_MANIFEST_MAP['npm'])

    def _get_collector(self, ecosystem) -> BaseCollector:
        """Return the manifest collector for the given ecosystem."""
        if ecosystem == 'maven':
            return MavenCollector()

        if ecosystem == 'npm':
            return NpmCollector()

        if ecosystem == 'pypi':
            return PypiCollector()

    def _update_s3(self):
        """Collate collector counts and persist them to S3."""
        logger.info('Updating file content to S3')
        data = {}
        for ecosystem, collector in self.collectors.items():
            data[ecosystem] = dict(collector.counter.most_common())

        filename = 'big-query-data/{}'.format(AWS_SETTINGS.s3_collated_filename)

        self.data_store.update(data=data,
                               bucket_name=AWS_SETTINGS.s3_bucket_name,
                               filename=filename)

        logger.info('Successfully saved BigQuery data to persistence store')
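
Both DataJob variants also rely on module-level settings and constants (SETTINGS, AWS_SETTINGS, S3_TEMP_FOLDER, CONTENT_BATCH_SIZE) imported from elsewhere. A hypothetical stand-in covering only the attributes these examples touch; every value below is a placeholder, not the project's real configuration:

from dataclasses import dataclass


@dataclass
class Settings:
    """Hypothetical stand-in for the real SETTINGS object."""
    local_working_directory: str = '/tmp/bigquery-data'


@dataclass
class AwsSettings:
    """Hypothetical stand-in for the real AWS_SETTINGS object."""
    s3_bucket_name: str = 'example-bucket'
    s3_collated_filename: str = 'collated.json'


SETTINGS = Settings()
AWS_SETTINGS = AwsSettings()

# Assumed constants; the names appear in the snippets above but their real
# values are not shown on this page.
S3_TEMP_FOLDER = 'big-query-temp'
CONTENT_BATCH_SIZE = 10 * 1024 * 1024  # placeholder: 10 MB per zip batch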