def import_bulk(data_source, book_keeper):
    """Import bulk data from the given data source.

    Performs a 'full import' when the graph has no metadata yet, otherwise an
    'incremental update' based on the last recorded update timestamp.

    :param data_source: Data source to read input from
    :param book_keeper: Book keeper to get info about recently ingested data;
        required for an incremental update, may be None for a full import
    :return: report dict produced by the import (status, counts, timestamps)
    :raises RuntimeError: when an incremental update is attempted without a
        book keeper, or when any step of the import fails
    """
    try:
        # The last incremental-update timestamp stored in the graph decides
        # whether we must perform a full import or an incremental update.
        graph_meta = GraphPopulator.get_metadata()

        list_keys = []
        if graph_meta is None:
            # Graph is not populated yet: collect all the files from the
            # data source.
            logger.debug("Performing full import. Fetching all objects from : %s",
                         data_source.get_source_name())
            list_keys = data_source.list_files()
        else:
            # Timestamp is available: perform an incremental update.
            if book_keeper is None:
                raise RuntimeError("Cannot perform incremental update without book keeper!")

            # Collect all the package-versions from the RDS table that were
            # updated since the last recorded incremental update.
            # Note: If RDS table is unreachable then we should still live with S3 data.
            min_finished_at = graph_meta.last_incr_update_ts
            list_epv = book_keeper.get_recent_epv(min_finished_at)

            # Collect only the relevant files from the data source.
            logger.debug("Performing incremental update. Fetching some objects from : %s",
                         data_source.get_source_name())
            for epv in list_epv:
                # assumes each epv dict carries 'ecosystem'/'name'/'version'
                # keys — TODO confirm against book_keeper.get_recent_epv()
                key_prefix = "{}/{}/{}".format(
                    epv.get('ecosystem'), epv.get('name'), epv.get('version'))
                list_keys.extend(data_source.list_files(prefix=key_prefix))

        # Import the S3 data, grouped by package-version.
        dict_grouped_keys = _group_keys_by_epv(list_keys, data_source)
        report = _import_grouped_keys(data_source, dict_grouped_keys)

        # Record progress in the graph so the next run can be incremental.
        if report.get('max_finished_at') is not None:
            dict_graph_meta = {
                'last_incremental_update_timestamp': report.get('max_finished_at'),
                'last_imported_epv': report.get('last_imported_EPV')
            }
            GraphPopulator.update_metadata(dict_graph_meta)
        _log_report_msg("import_bulk()", report)
    except Exception as e:
        msg = _get_exception_msg("import_bulk() failed with error", e)
        # Chain the original exception so its traceback is not lost.
        raise RuntimeError(msg) from e

    return report
def test_full_import_and_incr_update():
    """Exercise a full import followed by two incremental updates."""
    data_dir = 'test/data'

    def assert_outcome(report, count, last_epv, finished_at):
        # Each stage must succeed and record identical progress in the graph.
        assert (report.get('status') == 'Success')
        assert (report.get('count_imported_EPVs') == count)
        assert (report.get('last_imported_EPV') == last_epv)
        assert (report.get('max_finished_at') == finished_at)
        meta = GraphPopulator.get_metadata()
        assert (meta is not None)
        assert (meta.last_incr_update_ts == finished_at)

    # The target graph must start out with no metadata.
    assert (GraphPopulator.get_metadata() is None)

    # Full import: insert all the EPVs from the given data source.
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=os.path.join(data_dir, 'full_import')),
        book_keeper=None)
    assert_outcome(report, 1, 'npm/serve-static/1.7.1.json',
                   '2017-02-08T12:26:51.962609')

    # Incremental update 1:
    # Mimic a scenario where a new EPV was inserted recently: npm/send/0.10.1
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=os.path.join(data_dir, 'incr_update1')),
        book_keeper=JsonBookKeeper(
            json_file_name=os.path.join(data_dir, 'book_keeping1.json')))
    assert_outcome(report, 1, 'npm/send/0.10.1.json',
                   '2017-02-22T15:34:59.469864')

    # Incremental update 2:
    # Mimic a new EPV inserted recently (npm/parseurl/1.3.1) plus an update
    # to an already existing EPV (npm/serve-static/1.7.1).
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=os.path.join(data_dir, 'incr_update2')),
        book_keeper=JsonBookKeeper(
            json_file_name=os.path.join(data_dir, 'book_keeping2.json')))
    assert_outcome(report, 2, 'npm/serve-static/1.7.1.json',
                   '2017-02-22T15:35:51.962609')

    # Cleanup: wipe every model and verify nothing is left behind.
    for model in (GraphMetaData, LicenseDetails, Author, CodeMetricsResult,
                  CodeMetricsLanguage, GithubResult, Contributor, Package,
                  Version):
        model.delete_all()
        assert (model.count() == 0)