def test_sleep_for_rate_limit(self):
    """Test whether no sleep occurs when the reset time is negative"""

    client = MockedClient(CLIENT_API_URL, sleep_time=0.1, max_retries=1,
                          min_rate_to_sleep=100, sleep_for_rate=True)
    client.rate_limit = 50
    client.rate_limit_reset_ts = -1

    before = datetime_utcnow().replace(microsecond=0).timestamp()
    client.sleep_for_rate_limit()
    after = datetime_utcnow().replace(microsecond=0).timestamp()

    self.assertEqual(before, after)
def test_archived_after(self):
    """Test if only those items archived after a date are returned"""

    manager = ArchiveManager(self.test_path)

    category = 'mock_item'
    args = {
        'origin': 'http://example.com/',
        'tag': 'test',
        'subtype': 'mocksubtype',
        'from-date': str_to_datetime('2015-01-01')
    }

    items = fetch(CommandBackend, args, category, manager=manager)
    items = [item for item in items]
    self.assertEqual(len(items), 5)

    archived_dt = datetime_utcnow()

    items = fetch(CommandBackend, args, category, manager=manager)
    items = [item for item in items]
    self.assertEqual(len(items), 5)

    # Fetch items from the archive
    items = fetch_from_archive(CommandBackend, args, manager,
                               category, str_to_datetime('1970-01-01'))
    items = [item for item in items]
    self.assertEqual(len(items), 10)

    # Fetch items archived after the given date
    items = fetch_from_archive(CommandBackend, args, manager,
                               category, archived_dt)
    items = [item for item in items]
    self.assertEqual(len(items), 5)
def __init__(self, type, job_id, task_id, payload):
    self.uuid = str(uuid.uuid4())
    self.timestamp = datetime_utcnow()
    self.type = type
    self.job_id = job_id
    self.task_id = task_id
    self.payload = payload
def metadata(self, item, filter_classified=False):
    """Add metadata to an item.

    It adds metadata to a given item such as how and when it was
    fetched. The contents from the original item will be stored
    under the 'data' keyword.

    :param item: an item fetched by a backend
    :param filter_classified: sets if classified fields were filtered
    """
    item = {
        'backend_name': self.__class__.__name__,
        'backend_version': self.version,
        'perceval_version': __version__,
        'timestamp': datetime_utcnow().timestamp(),
        'origin': self.origin,
        'uuid': uuid(self.origin, self.metadata_id(item)),
        'updated_on': self.metadata_updated_on(item),
        'classified_fields_filtered': self.classified_fields if filter_classified else None,
        'category': self.metadata_category(item),
        'search_fields': self.search_fields(item),
        'tag': self.tag,
        'data': item,
    }

    return item
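# A minimal, self-contained sketch of the envelope built by metadata() above.
# fake_uuid() is a hypothetical stand-in for perceval's uuid() helper and the
# raw item is invented; only the envelope shape mirrors the code above.
import hashlib
from datetime import datetime, timezone

def fake_uuid(*args):
    # Stand-in: hash the joined arguments to get a stable item identifier
    return hashlib.sha1(':'.join(args).encode('utf-8')).hexdigest()

raw_item = {'id': '42', 'message': 'first commit'}

enveloped = {
    'backend_name': 'MockedBackend',
    'timestamp': datetime.now(timezone.utc).timestamp(),
    'origin': 'http://example.com/',
    'uuid': fake_uuid('http://example.com/', raw_item['id']),
    'category': 'mock_item',
    'tag': 'test',
    'data': raw_item,  # the original payload is kept under 'data'
}
print(enveloped['uuid'])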
def __init__(self, task_id, backend, category, backend_args,
             archiving_cfg=None, scheduling_cfg=None):
    try:
        bklass = perceval.backend.find_backends(perceval.backends)[0][backend]
    except KeyError:
        raise NotFoundError(element=backend)

    self._task_id = task_id
    self._has_resuming = bklass.has_resuming()

    self.status = TaskStatus.NEW
    self.age = 0
    self.num_failures = 0
    self.jobs = []
    self.created_on = datetime_utcnow().timestamp()
    self.backend = backend
    self.category = category
    self.backend_args = backend_args
    self.archiving_cfg = archiving_cfg if archiving_cfg else None
    self.scheduling_cfg = scheduling_cfg if scheduling_cfg else None
def fetch_items(self, category, **kwargs):
    """Fetch the entries.

    :param category: the category of items to fetch
    :param kwargs: backend arguments

    :returns: a generator of items
    """
    logger.info("Looking for a mkt form at '%s'", self.origin)

    nentries = 0

    entries = self.client.get_entries()
    for item in _parse_entries(entries):
        # Need to pass which columns are IDs to the metadata_id static function
        ret = {'_id_columns': ID_COLUMNS}
        for i, column in enumerate(CSV_HEADER.split(',')):
            value = item[i]
            if isinstance(item[i], str):
                value = item[i].strip()
            ret[column.strip()] = value
        ret['timestamp'] = datetime_utcnow().isoformat()
        yield ret
        nentries += 1

    logger.info("Done. %s form entries fetched", nentries)
def __parse_hits(self, hit_raw):
    """Parse the hits returned by the Google Search API"""

    # Create the soup and get the desired div
    bs_result = bs4.BeautifulSoup(hit_raw, 'html.parser')
    hit_string = bs_result.find("div", id="resultStats").text

    # Remove commas or dots
    hit_string = hit_string.replace(',', u'')
    hit_string = hit_string.replace('.', u'')

    fetched_on = datetime_utcnow().timestamp()
    id_args = self.keywords[:]
    id_args.append(str(fetched_on))

    hits_json = {
        'fetched_on': fetched_on,
        'id': uuid(*id_args),
        'keywords': self.keywords,
        'type': 'googleSearchHits'
    }

    if not hit_string:
        logger.warning("No hits for %s", self.keywords)
        hits_json['hits'] = 0
        return hits_json

    str_hits = re.search(r'\d+', hit_string).group(0)
    hits = int(str_hits)
    hits_json['hits'] = hits

    return hits_json
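# Hedged sketch of the parsing done by __parse_hits() above, applied to a
# canned HTML fragment instead of a live results page; the markup is invented.
import re
import bs4

hit_raw = '<div id="resultStats">About 1,340,000 results</div>'
bs_result = bs4.BeautifulSoup(hit_raw, 'html.parser')
hit_string = bs_result.find("div", id="resultStats").text

# Strip thousand separators, then grab the first run of digits
hit_string = hit_string.replace(',', '').replace('.', '')
hits = int(re.search(r'\d+', hit_string).group(0))
print(hits)  # 1340000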
def __get_rich_bugs(self, data):
    """Create enriched data for bugs"""

    rich_bugtask = {}

    # Time-related fields
    if not data["is_complete"]:
        rich_bugtask["time_open_days"] = get_time_diff_days(
            data['date_created'], datetime_utcnow().replace(tzinfo=None))
    else:
        rich_bugtask["time_open_days"] = get_time_diff_days(
            data['date_created'], data['date_closed'])

    rich_bugtask["time_created_to_assigned"] = get_time_diff_days(
        data['date_created'], data['date_assigned'])
    rich_bugtask['time_assigned_to_closed'] = get_time_diff_days(
        data['date_assigned'], data['date_closed'])
    rich_bugtask["time_to_close_days"] = get_time_diff_days(
        data['date_created'], data['date_closed'])

    if data['activity_data']:
        rich_bugtask['time_to_last_update_days'] = \
            get_time_diff_days(data['date_created'],
                               data['activity_data'][-1]['datechanged'])

    rich_bugtask['reopened'] = 1 if data['date_left_closed'] else 0
    rich_bugtask['time_to_fix_commit'] = get_time_diff_days(
        data['date_created'], data['date_fix_committed'])
    rich_bugtask['time_worked_on'] = get_time_diff_days(
        data['date_in_progress'], data['date_fix_committed'])
    rich_bugtask['time_to_confirm'] = get_time_diff_days(
        data['date_created'], data['date_confirmed'])

    # Author and assignee data
    owner = data.get('owner_data', None)
    if owner:
        rich_bugtask['user_login'] = owner.get('name', None)
        rich_bugtask['user_name'] = owner.get('display_name', None)
        rich_bugtask['user_joined'] = owner.get('date_created', None)
        rich_bugtask['user_karma'] = owner.get('karma', None)
        rich_bugtask['user_time_zone'] = owner.get('time_zone', None)

    assignee = data.get('assignee_data', None)
    if assignee:
        rich_bugtask['assignee_login'] = assignee.get('name', None)
        rich_bugtask['assignee_name'] = assignee.get('display_name', None)
        rich_bugtask['assignee_joined'] = assignee.get('date_created', None)
        rich_bugtask['assignee_karma'] = assignee.get('karma', None)
        rich_bugtask['assignee_time_zone'] = assignee.get('time_zone', None)

    # Extract info related to the bug
    rich_bugtask.update(self.__extract_bug_info(data['bug_data']))

    rich_bugtask['time_to_first_attention'] = \
        get_time_diff_days(data['date_created'],
                           self.get_time_to_first_attention(data))
    rich_bugtask['activity_count'] = len(data['activity_data'])

    return rich_bugtask
def fetch_items(self, category, **kwargs):
    """Fetch the metrics.

    :param category: the category of items to fetch
    :param kwargs: backend arguments

    :returns: a generator of items
    """
    from_date = kwargs['from_date']

    nmetrics = 0

    component_metrics_raw = self.client.component_metrics(from_date=from_date)
    component = json.loads(component_metrics_raw)['component']

    for metric in component['measures']:
        fetched_on = datetime_utcnow().timestamp()
        id_args = [component['key'], metric['metric'], str(fetched_on)]
        metric['id'] = uuid(*id_args)
        metric['fetched_on'] = fetched_on
        yield metric
        nmetrics += 1

    logger.info("Fetch process completed: %s metrics fetched", nmetrics)
def calculate_time_to_reset(self):
    """Number of seconds to wait. They are contained in the rate limit reset header."""

    time_to_reset = self.rate_limit_reset_ts - \
        (datetime_utcnow().replace(microsecond=0).timestamp() + 1)
    time_to_reset = 0 if time_to_reset < 0 else time_to_reset

    return time_to_reset
def test_datetime_utcnow(self):
    """Check whether timezone information is added"""

    now = datetime_utcnow()
    timezone = str(now.tzinfo)
    expected = "UTC+00:00"

    self.assertEqual(timezone, expected)
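# For context, a stdlib-only sketch of what datetime_utcnow() is expected to
# return: a timezone-aware datetime pinned to UTC. Note that str() of the
# stdlib UTC tzinfo is 'UTC' on recent Pythons; the 'UTC+00:00' value expected
# above depends on the tzinfo object the toolkit uses.
from datetime import datetime, timezone

now = datetime.now(timezone.utc)
assert now.tzinfo is not None  # timezone information is attached
print(str(now.tzinfo))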
def __fetch_summary(self):
    """Fetch summary"""

    raw_summary = self.client.summary()
    summary = json.loads(raw_summary)
    summary['fetched_on'] = str(datetime_utcnow())

    yield summary
def enrich_git_branches(self, ocean_backend, enrich_backend, run_month_days=[7, 14, 21, 28]):
    """Update the information about branches within the documents representing
    commits in the enriched index.

    The example below shows how to activate the study by modifying the setup.cfg.
    The study `enrich_git_branches` runs on the days given by the parameter
    `run_month_days`; by default these are days 7, 14, 21, and 28 of each month.

    ```
    [git]
    raw_index = git_raw
    enriched_index = git_enriched
    ...
    studies = [enrich_git_branches]

    [enrich_git_branches]
    run_month_days = [5, 22]
    ```

    :param ocean_backend: the ocean backend
    :param enrich_backend: the enrich backend
    :param run_month_days: days of the month to run this study
    """
    logger.debug("[git] study git-branches start")

    day = datetime_utcnow().day
    run_month_days = list(map(int, run_month_days))
    if day not in run_month_days:
        logger.debug("[git] study git-branches will execute only the days {} "
                     "of each month".format(run_month_days))
        logger.debug("[git] study git-branches end")
        return

    for ds in self.prjs_map:
        if ds != "git":
            continue

        urls = self.prjs_map[ds]
        for url in urls:
            cmd = GitCommand(*[url])
            git_repo = GitRepository(cmd.parsed_args.uri, cmd.parsed_args.gitpath)

            logger.debug("[git] study git-branches delete branch info for repo {} in index {}".format(
                git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
            self.delete_commit_branches(git_repo, enrich_backend)

            logger.debug("[git] study git-branches add branch info for repo {} in index {}".format(
                git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
            try:
                self.add_commit_branches(git_repo, enrich_backend)
            except Exception as e:
                logger.error("[git] study git-branches failed on repo {}, due to {}".format(git_repo.uri, e))
                continue
            logger.debug("[git] study git-branches repo {} in index {} processed".format(
                git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))

    logger.debug("[git] study git-branches end")
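# A tiny, self-contained sketch of the day-of-month gate used by the study
# above, with the default run days; purely illustrative.
import datetime

run_month_days = [7, 14, 21, 28]
day = datetime.datetime.now(datetime.timezone.utc).day
if day not in run_month_days:
    print("study skipped today; it only runs on days", run_month_days)
else:
    print("study would run today")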
def calculate_time_to_reset(self):
    """Calculate the seconds to reset the token requests, by obtaining the
    difference between the current date and the next date when the token is
    fully regenerated.
    """
    time_to_reset = self.rate_limit_reset_ts - \
        (datetime_utcnow().replace(microsecond=0).timestamp() + 1)
    time_to_reset = 0 if time_to_reset < 0 else time_to_reset

    return time_to_reset
def decorator(self, *args, **kwargs):
    eitem = func(self, *args, **kwargs)
    metadata = {
        'metadata__gelk_version': self.gelk_version,
        'metadata__gelk_backend_name': self.__class__.__name__,
        'metadata__enriched_on': datetime_utcnow().isoformat()
    }
    eitem.update(metadata)
    return eitem
def test_search_archived_after(self):
    """Check if a set of archives created after a given date are searched"""

    archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
    manager = ArchiveManager(archive_mng_path)

    # First set of archives to create
    metadata = [
        {
            'origin': 'https://example.com',
            'backend_name': 'git',
            'backend_version': '0.8',
            'category': 'commit',
            'backend_params': {},
        },
        {
            'origin': 'https://example.com',
            'backend_name': 'gerrit',
            'backend_version': '0.1',
            'category': 'changes',
            'backend_params': {}
        },
    ]

    for meta in metadata:
        archive = manager.create_archive()
        archive.init_metadata(**meta)

    # Second set, archived after the date we'll use to search
    after_dt = datetime_utcnow()

    metadata = [
        {
            'origin': 'https://example.org',
            'backend_name': 'git',
            'backend_version': '0.1',
            'category': 'commit',
            'backend_params': {}
        },
        {
            'origin': 'https://example.com',
            'backend_name': 'git',
            'backend_version': '0.1',
            'category': 'commit',
            'backend_params': {}
        }
    ]

    for meta in metadata:
        archive = manager.create_archive()
        archive.init_metadata(**meta)
        meta['filepath'] = archive.archive_path

    archives = manager.search('https://example.com', 'git', 'commit', after_dt)

    expected = [metadata[1]['filepath']]

    self.assertListEqual(archives, expected)
def fetch(self, from_date=DEFAULT_DATETIME):
    """Fetch the mbox files from the remote archiver.

    This method stores the archives in the path given during the
    initialization of this object. HyperKitty archives are accessed
    month by month and stored following the schema year-month.

    Archives are fetched from the given month till the current month.

    :param from_date: fetch archives that store messages equal or after
        the given date; only year and month values are compared

    :returns: a list of tuples, storing the links and paths of the
        fetched archives
    """
    logger.info("Downloading mboxes from '%s' since %s",
                self.client.base_url, str(from_date))
    logger.debug("Storing mboxes in '%s'", self.dirpath)

    self.client.fetch(self.client.base_url)

    from_date = datetime_to_utc(from_date)
    to_end = datetime_utcnow()
    to_end += dateutil.relativedelta.relativedelta(months=1)

    months = months_range(from_date, to_end)

    fetched = []

    if not os.path.exists(self.dirpath):
        os.makedirs(self.dirpath)

    tmbox = 0

    for dts in months:
        tmbox += 1
        start, end = dts[0], dts[1]

        filename = start.strftime("%Y-%m.mbox.gz")
        filepath = os.path.join(self.dirpath, filename)

        url = urijoin(self.client.base_url, 'export', filename)
        params = {
            'start': start.strftime("%Y-%m-%d"),
            'end': end.strftime("%Y-%m-%d")
        }

        success = self._download_archive(url, params, filepath)

        if success:
            fetched.append((url, filepath))

    logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

    return fetched
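# months_range() comes from grimoirelab-toolkit; the stand-in below is an
# assumption that only approximates its output shape, yielding one
# (month_start, next_month_start) pair per month, as consumed by fetch() above.
import datetime
import dateutil.relativedelta

def months_range_sketch(from_date, to_date):
    # Yield (start, end) pairs, one per month between the two dates
    start = from_date.replace(day=1)
    while start < to_date:
        end = start + dateutil.relativedelta.relativedelta(months=1)
        yield start, end
        start = end

for start, end in months_range_sketch(datetime.datetime(2019, 11, 1),
                                      datetime.datetime(2020, 2, 1)):
    print(start.strftime("%Y-%m"), "->", end.strftime("%Y-%m"))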
def __init__(self, uri, archive=None, from_archive=False):
    if uri.startswith('file://'):
        self.file_path = uri.split('file://', 1)[1]
    else:
        self.file_path = tempfile.mkdtemp() + "/perceval-ow2-mkt-backend-" + \
            str(datetime_utcnow()) + ".csv"
        super().__init__(uri, archive=archive, from_archive=from_archive)
        response = self.session.get(uri)
        # Write the downloaded CSV to the temporary path, closing the handle
        with open(self.file_path, 'wb') as fd:
            fd.write(response.content)
def __fetch_repo_info(self):
    """Get repo info about stars, watchers and forks"""

    raw_repo = self.client.repo()
    repo = json.loads(raw_repo)

    fetched_on = datetime_utcnow()
    repo['fetched_on'] = fetched_on.timestamp()

    yield repo
def calculate_time_to_reset(self):
    """Number of seconds to wait. They are contained in the rate limit reset header."""

    current_epoch = (datetime_utcnow().replace(microsecond=0).timestamp() + 1) * 1000
    time_to_reset = (self.rate_limit_reset_ts - current_epoch) / 1000

    if time_to_reset < 0:
        time_to_reset = 0

    return time_to_reset
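# Worked example of the millisecond arithmetic above, with invented numbers:
# a reset timestamp 90 seconds in the future, expressed in epoch milliseconds.
import datetime

now_s = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0).timestamp()
rate_limit_reset_ts = (now_s + 90) * 1000        # header value, epoch ms (made up)
current_epoch = (now_s + 1) * 1000               # "now" shifted by 1s, in ms
time_to_reset = (rate_limit_reset_ts - current_epoch) / 1000
print(max(time_to_reset, 0))                     # ~89.0; negative values clamp to 0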
def enrich_onion(self, enrich_backend, in_index, out_index, data_source,
                 contribs_field, timeframe_field, sort_on_field,
                 seconds=ONION_INTERVAL, no_incremental=False):

    log_prefix = "[" + data_source + "] study onion"

    logger.info(log_prefix + " Starting study - Input: " + in_index +
                " Output: " + out_index)

    # Creating connections
    es = Elasticsearch([enrich_backend.elastic.url], retry_on_timeout=True,
                       timeout=100, verify_certs=self.elastic.requests.verify)

    in_conn = ESOnionConnector(es_conn=es, es_index=in_index,
                               contribs_field=contribs_field,
                               timeframe_field=timeframe_field,
                               sort_on_field=sort_on_field)
    out_conn = ESOnionConnector(es_conn=es, es_index=out_index,
                                contribs_field=contribs_field,
                                timeframe_field=timeframe_field,
                                sort_on_field=sort_on_field,
                                read_only=False)

    if not in_conn.exists():
        logger.info(log_prefix + " Missing index %s", in_index)
        return

    # Check last execution date
    latest_date = None
    if out_conn.exists():
        latest_date = out_conn.latest_enrichment_date()

    if latest_date:
        logger.info(log_prefix + " Latest enrichment date: " + latest_date.isoformat())
        update_after = latest_date + timedelta(seconds=seconds)
        logger.info(log_prefix + " Update after date: " + update_after.isoformat())
        if update_after >= datetime_utcnow():
            logger.info(log_prefix + " Too soon to update. Next update will be at " +
                        update_after.isoformat())
            return

    # Onion currently does not support the incremental option
    logger.info(log_prefix + " Creating out ES index")
    # Initialize out index
    filename = pkg_resources.resource_filename('grimoire_elk',
                                               'enriched/mappings/onion.json')
    out_conn.create_index(filename, delete=out_conn.exists())

    onion_study(in_conn=in_conn, out_conn=out_conn, data_source=data_source)

    # Create the alias if the output index exists (the index is always created
    # from scratch, so the alias needs to be created each time)
    if out_conn.exists() and not out_conn.exists_alias(out_index, ONION_ALIAS):
        logger.info(log_prefix + " Creating alias: %s", ONION_ALIAS)
        out_conn.create_alias(ONION_ALIAS)

    logger.info(log_prefix + " This is the end.")
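# Sketch of the throttling check above: a new onion run is skipped when less
# than `seconds` have passed since the latest enrichment. Dates are invented.
import datetime

seconds = 3600  # stand-in for ONION_INTERVAL
latest_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=10)
update_after = latest_date + datetime.timedelta(seconds=seconds)
if update_after >= datetime.datetime.now(datetime.timezone.utc):
    print("Too soon to update. Next update will be at", update_after.isoformat())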
def main():
    """This script downloads and processes the archives from githubarchive.com
    between two dates (--from-date and --to-date) to a folder (--folder). It
    returns a CSV file (--output), which contains the pull requests and issues
    opened by a set of GitHub users (included in the file --usernames).
    """
    logging.getLogger().setLevel(logging.DEBUG)

    args = parser(sys.argv[1:])

    folder = args.folder
    download = args.download
    from_date = args.from_date
    to_date = args.to_date
    usernames = args.usernames
    output = args.output

    start_time = datetime_utcnow().isoformat()
    logging.debug("script started at: %s", start_time)

    if not os.path.exists(folder):
        os.makedirs(folder)

    if download:
        download_archives(folder, from_date, to_date)

    if not os.path.exists(output):
        with open(output, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([g for g in SCHEMA])

    with open(usernames, 'r') as content:
        for line in content:
            activities = process_archives(folder, from_date, to_date, line.strip())

            with open(output, 'a') as csvfile:
                writer = csv.writer(csvfile, delimiter=',')
                writer.writerows(activities)

    end_time = datetime_utcnow().isoformat()
    logging.debug("script ended at: %s", end_time)
def init_metadata(self, origin, backend_name, backend_version,
                  category, backend_params):
    """Init metadata information.

    Metadata is composed of the basic information needed to identify
    where the archived data came from and how it can be retrieved
    and built into Perceval items.

    :param origin: identifier of the repository
    :param backend_name: name of the backend
    :param backend_version: version of the backend
    :param category: category of the items fetched
    :param backend_params: dict representation of the fetch parameters

    :raises ArchiveError: when an error occurs initializing the metadata
    """
    created_on = datetime_to_utc(datetime_utcnow())
    created_on_dumped = created_on.isoformat()
    backend_params_dumped = pickle.dumps(backend_params, 0)

    metadata = (origin, backend_name, backend_version, category,
                backend_params_dumped, created_on_dumped,)

    try:
        cursor = self._db.cursor()
        insert_stmt = "INSERT INTO " + self.METADATA_TABLE + " " \
                      "(origin, backend_name, backend_version, " \
                      "category, backend_params, created_on) " \
                      "VALUES (?, ?, ?, ?, ?, ?)"
        cursor.execute(insert_stmt, metadata)

        self._db.commit()
        cursor.close()
    except sqlite3.DatabaseError as e:
        msg = "metadata initialization error; cause: %s" % str(e)
        raise ArchiveError(cause=msg)

    self.origin = origin
    self.backend_name = backend_name
    self.backend_version = backend_version
    self.category = category
    self.backend_params = backend_params
    self.created_on = created_on

    logger.debug("Metadata of archive %s initialized to %s",
                 self.archive_path, metadata)
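# Hedged, self-contained sketch of the insert performed by init_metadata()
# above; the in-memory database and simplified schema are stand-ins for the
# archive's real metadata table.
import pickle
import sqlite3
from datetime import datetime, timezone

db = sqlite3.connect(':memory:')
db.execute("CREATE TABLE metadata (origin TEXT, backend_name TEXT, "
           "backend_version TEXT, category TEXT, backend_params BLOB, "
           "created_on TEXT)")

created_on = datetime.now(timezone.utc).isoformat()
params = pickle.dumps({'from_date': None}, 0)  # protocol 0, as above
db.execute("INSERT INTO metadata VALUES (?, ?, ?, ?, ?, ?)",
           ('http://example.com/', 'git', '0.8', 'commit', params, created_on))
db.commit()
print(db.execute("SELECT origin, category FROM metadata").fetchone())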
def calculate_time_to_reset(self):
    """Number of seconds to wait. The time is obtained as the difference
    between the current date and the next date when the token is fully
    regenerated.
    """
    current_epoch = datetime_utcnow().replace(microsecond=0).timestamp() + 1
    time_to_reset = self.rate_limit_reset_ts - current_epoch

    if time_to_reset < 0:
        time_to_reset = 0

    return time_to_reset
def test_init_metadata(self):
    """Test whether metadata information is properly initialized"""

    archive_path = os.path.join(self.test_path, 'myarchive')
    archive = Archive.create(archive_path)

    before_dt = datetime_to_utc(datetime_utcnow())
    archive.init_metadata('marvel.com', 'marvel-comics-backend', '0.1.0',
                          'issue', {'from_date': before_dt})
    after_dt = datetime_to_utc(datetime_utcnow())

    archive_copy = Archive(archive_path)

    # Both copies should have the same parameters
    for arch in [archive, archive_copy]:
        self.assertEqual(arch.origin, 'marvel.com')
        self.assertEqual(arch.backend_name, 'marvel-comics-backend')
        self.assertEqual(arch.backend_version, '0.1.0')
        self.assertEqual(arch.category, 'issue')
        self.assertGreaterEqual(arch.created_on, before_dt)
        self.assertLessEqual(arch.created_on, after_dt)
        self.assertDictEqual(arch.backend_params, {'from_date': before_dt})
def test_initialization(self):
    """Test if the instance is correctly initialized"""

    dt_before = datetime_utcnow()
    event = JobEvent(JobEventType.COMPLETED, '1', None)
    dt_after = datetime_utcnow()

    self.assertEqual(event.type, JobEventType.COMPLETED)
    self.assertEqual(event.job_id, '1')
    self.assertEqual(event.payload, None)
    self.assertGreater(event.timestamp, dt_before)
    self.assertLess(event.timestamp, dt_after)

    dt_before = datetime_utcnow()
    event = JobEvent(JobEventType.FAILURE, '2', "Error")
    dt_after = datetime_utcnow()

    self.assertEqual(event.type, JobEventType.FAILURE)
    self.assertEqual(event.job_id, '2')
    self.assertEqual(event.payload, "Error")
    self.assertGreater(event.timestamp, dt_before)
    self.assertLess(event.timestamp, dt_after)
def test_decorator(self):
    backend = MockedBackend('test', 'mytag')

    before = datetime_utcnow().timestamp()
    items = [item for item in backend.fetch()]
    after = datetime_utcnow().timestamp()

    for x in range(5):
        item = items[x]

        expected_uuid = uuid('test', str(x))

        self.assertEqual(item['data']['item'], x)
        self.assertEqual(item['backend_name'], 'MockedBackend')
        self.assertEqual(item['backend_version'], '0.2.0')
        self.assertEqual(item['perceval_version'], __version__)
        self.assertEqual(item['origin'], 'test')
        self.assertEqual(item['uuid'], expected_uuid)
        self.assertEqual(item['updated_on'], '2016-01-01')
        self.assertEqual(item['category'], 'mock_item')
        self.assertEqual(item['tag'], 'mytag')
        self.assertGreater(item['timestamp'], before)
        self.assertLess(item['timestamp'], after)

        before = item['timestamp']
def test_calculate_time_to_reset(self):
    """Test whether the time to reset is zero if the sleep time is negative"""

    httpretty.register_uri(httpretty.GET,
                           GITLAB_URL_PROJECT,
                           body='',
                           status=200,
                           forcing_headers={
                               'RateLimit-Remaining': '20',
                               'RateLimit-Reset': int(datetime_utcnow().replace(microsecond=0).timestamp())
                           })

    client = GitLabClient("fdroid", "fdroiddata", "your-token")
    time_to_reset = client.calculate_time_to_reset()

    self.assertEqual(time_to_reset, 0)
def delete_items(self, hours_to_retain, time_field="metadata__updated_on"):
    """Delete documents updated before a given date.

    :param hours_to_retain: maximum number of hours, with respect to the
        current date, to retain the data
    :param time_field: time field used to select the data to delete
    """
    if hours_to_retain is None:
        logger.debug("Data retention policy disabled, no items will be deleted.")
        return

    if hours_to_retain <= 0:
        logger.debug("Hours to retain must be greater than 0.")
        return

    before_date = datetime_utcnow() - datetime.timedelta(hours=hours_to_retain)
    before_date = before_date.replace(minute=0, second=0, microsecond=0)
    before_date_str = before_date.isoformat()

    es_query = '''
        {
          "query": {
            "range": {
              "%s": {
                "lte": "%s"
              }
            }
          }
        }
    ''' % (time_field, before_date_str)

    r = self.requests.post(self.index_url + "/_delete_by_query?refresh",
                           data=es_query, headers=HEADER_JSON, verify=False)
    try:
        r.raise_for_status()
        r_json = r.json()
        logger.debug("%s items deleted from %s before %s.",
                     r_json['deleted'], self.anonymize_url(self.index_url),
                     before_date)
    except requests.exceptions.HTTPError as ex:
        logger.error("Error deleting items from %s.",
                     self.anonymize_url(self.index_url))
        logger.error(ex)
        return
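# Sketch of the retention cutoff computed by delete_items() above, using an
# invented window of 24 hours; documents updated at or before the printed
# timestamp would match the range query.
import datetime

hours_to_retain = 24
before_date = (datetime.datetime.now(datetime.timezone.utc)
               - datetime.timedelta(hours=hours_to_retain))
before_date = before_date.replace(minute=0, second=0, microsecond=0)
print(before_date.isoformat())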
def test_search(self):
    """Test if a set of archives is found based on the given criteria"""

    archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
    manager = ArchiveManager(archive_mng_path)

    dt = datetime_utcnow()

    metadata = [
        {
            'origin': 'https://example.com',
            'backend_name': 'git',
            'backend_version': '0.8',
            'category': 'commit',
            'backend_params': {},
        },
        {
            'origin': 'https://example.com',
            'backend_name': 'gerrit',
            'backend_version': '0.1',
            'category': 'changes',
            'backend_params': {}
        },
        {
            'origin': 'https://example.org',
            'backend_name': 'git',
            'backend_version': '0.1',
            'category': 'commit',
            'backend_params': {}
        },
        {
            'origin': 'https://example.com',
            'backend_name': 'git',
            'backend_version': '0.1',
            'category': 'commit',
            'backend_params': {}
        }
    ]

    for meta in metadata:
        archive = manager.create_archive()
        archive.init_metadata(**meta)
        meta['filepath'] = archive.archive_path

    archives = manager.search('https://example.com', 'git', 'commit', dt)

    expected = [metadata[0]['filepath'], metadata[3]['filepath']]

    self.assertListEqual(archives, expected)