def get_odes_extracts(db, api_key):
    '''
    '''
    odeses, extracts = list(), list()

    vars = dict(api_key=api_key)
    extracts_url = uritemplate.expand(odes_extracts_url, vars)
    resp = requests.get(extracts_url)

    if resp.status_code in range(200, 300):
        odeses.extend([
            data.ODES(str(oj['id']), status=oj['status'], bbox=oj['bbox'],
                      links=oj.get('download_links', {}),
                      processed_at=(parse_datetime(oj['processed_at']) if oj['processed_at'] else None),
                      created_at=(parse_datetime(oj['created_at']) if oj['created_at'] else None))
            for oj in resp.json()
        ])

    for odes in sorted(odeses, key=attrgetter('created_at'), reverse=True):
        extract = data.get_extract(db, odes=odes)

        if extract is None:
            extract = data.Extract(None, None, None, odes, None, None, None)

        extracts.append(extract)

    return extracts
def from_json(json_str):
    verbatim_tweet = VerbatimTweet()
    verbatim_tweet.json = json_str
    verbatim_tweet.save()

    obj = json.loads(json_str)

    tweet = Tweet()
    tweet.verbatim_tweet = verbatim_tweet
    tweet.id = obj['id']
    tweet.created_at = parse_datetime(obj['created_at'])
    for field in ['lang', 'retweeted', 'retweet_count', 'text', 'truncated']:
        setattr(tweet, field, obj.get(field))

    user_obj = obj['user']
    user_created_at = parse_datetime(user_obj['created_at'])
    (user, created) = TwitterUser.objects.get_or_create(
        id=user_obj['id'], defaults={'created_at': user_created_at})
    if created:
        user.name = user_obj.get('name')
        user.screen_name = user_obj.get('screen_name')
        user.verified = user_obj.get('verified')
        user.save()

    tweet.user = user
    tweet.save()
    return tweet
def parse_practices(self, response):
    # body > div.container > div.page-body > table > tbody
    practices = response.xpath('/html/body/div[2]/div[3]/table/tbody/tr')
    total = len(practices)

    for i, row in enumerate(practices):
        cells = [x.strip() for x in row.css('td::text').extract() if x.strip() != '']

        r = PracticeSession()
        r['student'] = self.username
        r['quiz_index'] = total - i

        # attempt to see if the date in parentheses is more specific
        # than the month-day specifier (e.g. 'hours ago'), and use it if so.
        # otherwise, just use the month-day specifier
        try:
            inner_date = row.css('td:first-child small::text').extract()[0]
            inner_date_parsed = VeritasScraper.ddp.get_date_data(inner_date)
            r['taken_on'] = inner_date_parsed['date_obj'] \
                if inner_date_parsed and "day" not in inner_date \
                else parse_datetime(cells[0])
        except IndexError:
            r['taken_on'] = parse_datetime(cells[0])

        r['question_count'] = int(cells[1])
        r['percent_correct'] = cells[2]
        r['duration'] = cells[3]

        yield r
def test_children_metadata(self):
    path = u'kind/of/magíc.mp3'
    record = recursively_create_file(self.node_settings, path)
    version = factories.FileVersionFactory()
    record.versions.add(version)
    record.save()
    res = self.send_hook(
        'osfstorage_get_children',
        {'fid': record.parent._id},
        {},
    )
    assert_equal(len(res.json), 1)
    res_data = res.json[0]
    expected_data = record.serialize()

    # Datetimes in response might not be exactly the same as in record.serialize
    # because of the way Postgres serializes dates. For example,
    # '2017-06-05T17:32:20.964950+00:00' will be
    # serialized as '2017-06-05T17:32:20.96495+00:00' by postgres
    # Therefore, we parse the dates then compare them
    expected_date_modified = parse_datetime(expected_data.pop('modified'))
    expected_date_created = parse_datetime(expected_data.pop('created'))
    res_date_modified = parse_datetime(res_data.pop('modified'))
    res_date_created = parse_datetime(res_data.pop('created'))

    assert_equal(res_date_modified, expected_date_modified)
    assert_equal(res_date_created, expected_date_created)
    assert_equal(res_data, expected_data)
def filter_logic(unlisted, timeSelect, startDate, endDate, timeTypeSelect,
                 cvssSelect, cvss, rejectedSelect, limit, skip):
    query = []
    # retrieving lists
    if rejectedSelect == "hide":
        exp = r"^(?!\*\* REJECT \*\*\s+DO NOT USE THIS CANDIDATE NUMBER.*)"
        query.append({'summary': re.compile(exp)})
    # cvss logic
    if cvssSelect != "all":
        if cvssSelect == "above":
            query.append({'cvss': {'$gt': float(cvss)}})
        if cvssSelect == "equals":
            query.append({'cvss': float(cvss)})
        if cvssSelect == "below":
            query.append({'cvss': {'$lt': float(cvss)}})
    # date logic
    if timeSelect != "all":
        startDate = parse_datetime(startDate, ignoretz=True, dayfirst=True)
        endDate = parse_datetime(endDate, ignoretz=True, dayfirst=True)
        if timeSelect == "from":
            query.append({timeTypeSelect: {'$gt': startDate}})
        if timeSelect == "until":
            query.append({timeTypeSelect: {'$lt': endDate}})
        if timeSelect == "between":
            query.append({timeTypeSelect: {'$gt': startDate, '$lt': endDate}})
        if timeSelect == "outside":
            query.append({'$or': [{timeTypeSelect: {'$lt': startDate}},
                                  {timeTypeSelect: {'$gt': endDate}}]})
    return dbLayer.getCVEs(limit=limit, skip=skip, query=query)
def get_odes_extract(db, id, api_key):
    '''
    '''
    extract, odes = data.get_extract(db, extract_id=id), None

    if extract is None:
        # Nothing by that name in the database, so ask the ODES API.
        vars = dict(id=id, api_key=api_key)
        extract_url = uritemplate.expand(odes_extracts_url, vars)
        resp = requests.get(extract_url)

        if resp.status_code in range(200, 300):
            oj = resp.json()
            odes = data.ODES(str(oj['id']), status=oj['status'], bbox=oj['bbox'],
                             links=oj.get('download_links', {}),
                             processed_at=(parse_datetime(oj['processed_at']) if oj['processed_at'] else None),
                             created_at=(parse_datetime(oj['created_at']) if oj['created_at'] else None))

        if odes is None:
            # Nothing at all for this ID anywhere.
            return None

    if odes is None:
        # A DB extract was found, but nothing in ODES - very weird!
        return get_odes_extract(db, extract.odes.id, api_key)

    # We have a known ODES, so look for it in the database.
    extract = data.get_extract(db, odes=odes)

    if extract is None:
        # Known ODES, but nothing in the DB so make one up.
        return data.Extract(None, None, None, odes, None, None, None)

    return extract
def find_timing_env_commands(self):
    content = self.get_timing_env_log_content()
    if not content:
        return [], set(['Timing env log is not found!'])

    commands, ordered_commands, errors = {}, [], set()
    for line in content.split('\n'):
        try:
            data = json.loads(line)
        except ValueError:
            continue
        cmd_hash_key = (data['command'], data['unique_nr'])
        if cmd_hash_key not in commands and data['tag'] == 'BEGIN':
            commands[cmd_hash_key] = data
            commands[cmd_hash_key]['tags'] = [data['tag']]
            ordered_commands.append(commands[cmd_hash_key])
        elif cmd_hash_key in commands and data['tag'] in commands[cmd_hash_key]['tags']:
            errors.add('Found duplicated command: {}'.format(data['command']))
        elif cmd_hash_key in commands and data['tag'] == 'END':
            commands[cmd_hash_key].update(data)
            commands[cmd_hash_key]['tags'].append(data['tag'])
        else:
            errors.add('Unknown error: {}'.format(data['command']))

    for command in ordered_commands:
        command['started_at'] = parse_datetime(command['started_at'])
        if 'finished_at' in command:
            command['finished_at'] = parse_datetime(command['finished_at'])

    return ordered_commands, errors
def validate_datetime(dt_str):
    try:
        parse_datetime(dt_str)
    except ValueError as exc:
        return str(exc)
    except TypeError:
        return _("Invalid input.")
def generate_minimal_query(self, f):
    query = []
    # retrieving lists
    if f['rejectedSelect'] == "hide":
        exp = r"^(?!\*\* REJECT \*\*\s+DO NOT USE THIS CANDIDATE NUMBER.*)"
        query.append({'summary': re.compile(exp)})
    # cvss logic
    if f['cvssSelect'] == "above":
        query.append({'cvss': {'$gt': float(f['cvss'])}})
    elif f['cvssSelect'] == "equals":
        query.append({'cvss': float(f['cvss'])})
    elif f['cvssSelect'] == "below":
        query.append({'cvss': {'$lt': float(f['cvss'])}})
    # date logic
    if f['timeSelect'] != "all":
        if f['startDate']:
            startDate = parse_datetime(f['startDate'], ignoretz=True, dayfirst=True)
        if f['endDate']:
            endDate = parse_datetime(f['endDate'], ignoretz=True, dayfirst=True)
        if f['timeSelect'] == "from":
            query.append({f['timeTypeSelect']: {'$gt': startDate}})
        elif f['timeSelect'] == "until":
            query.append({f['timeTypeSelect']: {'$lt': endDate}})
        elif f['timeSelect'] == "between":
            query.append({f['timeTypeSelect']: {'$gt': startDate, '$lt': endDate}})
        elif f['timeSelect'] == "outside":
            query.append({'$or': [{f['timeTypeSelect']: {'$lt': startDate}},
                                  {f['timeTypeSelect']: {'$gt': endDate}}]})
    return query
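# Illustrative sketch, not part of the source: the date filters above call
# parse_datetime with ignoretz=True and dayfirst=True. Assuming parse_datetime
# is dateutil.parser.parse (those keywords match its signature), day-first
# date strings are accepted and any timezone information is dropped.
# The sample value below is hypothetical.
from dateutil.parser import parse as parse_datetime

parsed = parse_datetime("03/04/2018", dayfirst=True, ignoretz=True)
assert (parsed.day, parsed.month, parsed.year) == (3, 4, 2018)
assert parsed.tzinfo is None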
def test_contributors_get_aware_datetime():
    """Get an aware datetime from a valid string."""
    iso_datetime = make_aware(parse_datetime("2016-01-24T23:15:22+0000"), tz=pytz.utc)
    # Test ISO 8601 datetime.
    assert iso_datetime == get_aware_datetime("2016-01-24T23:15:22+0000", tz=pytz.utc)
    # Test git-like datetime.
    assert iso_datetime == get_aware_datetime("2016-01-24 23:15:22 +0000", tz=pytz.utc)
    # Test just an ISO 8601 date.
    iso_datetime = make_aware(parse_datetime("2016-01-24T00:00:00+0000"), tz=pytz.utc)
    assert iso_datetime == get_aware_datetime("2016-01-24", tz=pytz.utc)
    # Test None.
    assert get_aware_datetime(None) is None
    # Test empty string.
    assert get_aware_datetime("") is None
    # Test an unparsable string.
    with pytest.raises(ArgumentTypeError):
        get_aware_datetime("THIS FAILS")
    # Test blank string.
    with pytest.raises(ArgumentTypeError):
        get_aware_datetime(" ")
def request_odes_extract(extract, request, url_for, api_key):
    '''
    '''
    env = Environment(loader=PackageLoader(__name__, 'templates'))
    args = dict(
        name=extract.name or extract.wof.name or 'an unnamed place',
        link=urljoin(util.get_base_url(request), url_for('ODES.get_extract', extract_id=extract.id)),
        extracts_link=urljoin(util.get_base_url(request), url_for('ODES.get_extracts')),
        created=extract.created
    )

    email = dict(
        email_subject=env.get_template('email-subject.txt').render(**args),
        email_body_text=env.get_template('email-body.txt').render(**args),
        email_body_html=env.get_template('email-body.html').render(**args)
    )

    params = {key: extract.envelope.bbox[i]
              for (i, key) in enumerate(('bbox_w', 'bbox_s', 'bbox_e', 'bbox_n'))}
    params.update(email)

    post_url = uritemplate.expand(odes_extracts_url, dict(api_key=api_key))
    resp = requests.post(post_url, data=params)
    oj = resp.json()

    if 'error' in oj:
        raise util.KnownUnknown("Error: {}".format(oj['error']))
    elif resp.status_code != 200:
        raise Exception("Bad ODES status code: {}".format(resp.status_code))

    return data.ODES(str(oj['id']), status=oj['status'], bbox=oj['bbox'],
                     links=oj.get('download_links', {}),
                     processed_at=(parse_datetime(oj['processed_at']) if oj['processed_at'] else None),
                     created_at=(parse_datetime(oj['created_at']) if oj['created_at'] else None))
def entry_to_event_tuple(entry):
    title = entry.title.text
    description = entry.content.text or '-'
    description = description.splitlines()[0]
    starts_at = parse_datetime(entry.when[0].start)
    ends_at = parse_datetime(entry.when[0].end)
    duration = (ends_at - starts_at).seconds / 3600.0
    return (title, description, duration)
def from_dict(cls, obj):
    return cls(
        obj['summary'],
        obj['guid'],
        parse_datetime(obj['start']['utcdate']).replace(tzinfo=pytz.timezone("America/New_York")).astimezone(pytz.utc),
        parse_datetime(obj['end']['utcdate']).replace(tzinfo=pytz.timezone("America/New_York")).astimezone(pytz.utc),
        obj['start']['allday'] == 'true'
    )
def filter_logic(f, limit, skip): query = [] # retrieving lists if f['blacklistSelect'] == "on": regexes = db.getRules('blacklist') if len(regexes) != 0: exp = "^(?!" + "|".join(regexes) + ")" query.append({'$or': [{'vulnerable_configuration': re.compile(exp)}, {'vulnerable_configuration': {'$exists': False}}, {'vulnerable_configuration': []} ]}) if f['whitelistSelect'] == "hide": regexes = db.getRules('whitelist') if len(regexes) != 0: exp = "^(?!" + "|".join(regexes) + ")" query.append({'$or': [{'vulnerable_configuration': re.compile(exp)}, {'vulnerable_configuration': {'$exists': False}}, {'vulnerable_configuration': []} ]}) if f['unlistedSelect'] == "hide": wlregexes = compile(db.getRules('whitelist')) blregexes = compile(db.getRules('blacklist')) query.append({'$or': [{'vulnerable_configuration': {'$in': wlregexes}}, {'vulnerable_configuration': {'$in': blregexes}}]}) if f['rejectedSelect'] == "hide": exp = "^(?!\*\* REJECT \*\*\s+DO NOT USE THIS CANDIDATE NUMBER.*)" query.append({'summary': re.compile(exp)}) # plugin filters query.extend(plugManager.doFilter(f, **pluginArgs())) # cvss logic if f['cvssSelect'] == "above": query.append({'cvss': {'$gt': float(f['cvss'])}}) elif f['cvssSelect'] == "equals": query.append({'cvss': float(f['cvss'])}) elif f['cvssSelect'] == "below": query.append({'cvss': {'$lt': float(f['cvss'])}}) # date logic if f['timeSelect'] != "all": if f['startDate']: startDate = parse_datetime(f['startDate'], ignoretz=True, dayfirst=True) if f['endDate']: endDate = parse_datetime(f['endDate'], ignoretz=True, dayfirst=True) if f['timeSelect'] == "from": query.append({f['timeTypeSelect']: {'$gt': startDate}}) if f['timeSelect'] == "until": query.append({f['timeTypeSelect']: {'$lt': endDate}}) if f['timeSelect'] == "between": query.append({f['timeTypeSelect']: {'$gt': startDate, '$lt': endDate}}) if f['timeSelect'] == "outside": query.append({'$or': [{f['timeTypeSelect']: {'$lt': startDate}}, {f['timeTypeSelect']: {'$gt': endDate}}]}) cve=db.getCVEs(limit=limit, skip=skip, query=query) # marking relevant records if f['whitelistSelect'] == "on": cve = whitelist_mark(cve) if f['blacklistSelect'] == "mark": cve = blacklist_mark(cve) plugManager.mark(cve, **pluginArgs()) cve = list(cve) return cve
def _assert_is_datetime(self, timestamp):
    if not timestamp:
        return False
    try:
        parse_datetime(timestamp)
    except ValueError:
        return False
    else:
        return True
def from_line(cls, line, lineidx=None):
    m = CLOCK_RE.match(line)
    start = m.group("start")
    end = m.group("end")
    if start:
        start = parse_datetime(m.group("start"), fuzzy=True)
    if end:
        end = parse_datetime(m.group("end"), fuzzy=True)
    return cls(start=start, end=end, lineidx=lineidx)
def get_group_tag_keys_and_top_values( self, project_id, group_id, environment_ids, user=None, keys=None, value_limit=TOP_VALUES_DEFAULT_LIMIT): # Similar to __get_tag_key_and_top_values except we get the top values # for all the keys provided. value_limit in this case means the number # of top values for each key, so the total rows returned should be # num_keys * limit. start, end = self.get_time_range() # First get totals and unique counts by key. keys_with_counts = self.get_group_tag_keys(project_id, group_id, environment_ids, keys=keys) # Then get the top values with first_seen/last_seen/count for each filters = { 'project_id': [project_id], } if environment_ids: filters['environment'] = environment_ids if keys is not None: filters['tags_key'] = keys if group_id is not None: filters['issue'] = [group_id] aggregations = [ ['count()', '', 'count'], ['min', SEEN_COLUMN, 'first_seen'], ['max', SEEN_COLUMN, 'last_seen'], ] conditions = [['tags_key', 'NOT IN', self.EXCLUDE_TAG_KEYS]] values_by_key = snuba.query( start, end, ['tags_key', 'tags_value'], conditions, filters, aggregations, orderby='-count', limitby=[value_limit, 'tags_key'], referrer='tagstore.__get_tag_keys_and_top_values' ) # Then supplement the key objects with the top values for each. if group_id is None: value_ctor = TagValue else: value_ctor = functools.partial(GroupTagValue, group_id=group_id) for keyobj in keys_with_counts: key = keyobj.key values = values_by_key.get(key, []) keyobj.top_values = [ value_ctor( key=keyobj.key, value=value, times_seen=data['count'], first_seen=parse_datetime(data['first_seen']), last_seen=parse_datetime(data['last_seen']), ) for value, data in six.iteritems(values) ] return keys_with_counts
def _assert_is_datetime(self, timestamp):
    """
    Internal helper to validate the type of the provided timestamp
    """
    if not timestamp:
        return False
    try:
        parse_datetime(timestamp)
    except ValueError:
        return False
    else:
        return True
def from_dict(cls, obj):
    return cls(
        obj["summary"],
        obj["guid"],
        parse_datetime(obj["start"]["utcdate"])
        .replace(tzinfo=pytz.timezone("America/New_York"))
        .astimezone(pytz.utc),
        parse_datetime(obj["end"]["utcdate"])
        .replace(tzinfo=pytz.timezone("America/New_York"))
        .astimezone(pytz.utc),
        obj["start"]["allday"] == "true",
    )
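# Side note, not from the source: attaching a pytz zone via
# datetime.replace(tzinfo=pytz.timezone("America/New_York")) uses the zone's
# default LMT offset (-04:56) rather than the correct EST/EDT offset; pytz
# documents localize() as the supported way to attach its zones. A minimal
# sketch with hypothetical values:
from datetime import datetime
import pytz

naive = datetime(2021, 7, 1, 12, 0)
eastern = pytz.timezone("America/New_York")
localized = eastern.localize(naive)        # correct offset for that date (EDT, -04:00)
replaced = naive.replace(tzinfo=eastern)   # LMT offset (-04:56), usually not intended
assert localized.utcoffset() != replaced.utcoffset()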
def _parse_args(arguments):
    if arguments['--start'] is not None:
        start_datetime = _make_utc(parse_datetime(arguments['--start']))
    else:
        start_datetime = datetime.datetime.now(pytz.UTC).replace(
            second=0, microsecond=0)

    if arguments['--end'] is not None:
        end_datetime = _make_utc(parse_datetime(arguments['--end']))
    else:
        end_datetime = start_datetime + datetime.timedelta(minutes=30)

    return arguments['--constituents'], start_datetime, end_datetime
def __get_tag_key_and_top_values(self, project_id, group_id, environment_id, key, limit=3, raise_on_empty=True): start, end = self.get_time_range() tag = u'tags[{}]'.format(key) filters = { 'project_id': [project_id], } if environment_id: filters['environment'] = [environment_id] if group_id is not None: filters['issue'] = [group_id] conditions = [[tag, '!=', '']] aggregations = [ ['uniq', tag, 'values_seen'], ['count()', '', 'count'], ['min', SEEN_COLUMN, 'first_seen'], ['max', SEEN_COLUMN, 'last_seen'], ] result, totals = snuba.query( start, end, [tag], conditions, filters, aggregations, orderby='-count', limit=limit, totals=True, referrer='tagstore.__get_tag_key_and_top_values' ) if raise_on_empty and (not result or totals.get('count', 0) == 0): raise TagKeyNotFound if group_id is None else GroupTagKeyNotFound else: if group_id is None: key_ctor = TagKey value_ctor = TagValue else: key_ctor = functools.partial(GroupTagKey, group_id=group_id) value_ctor = functools.partial(GroupTagValue, group_id=group_id) top_values = [ value_ctor( key=key, value=value, times_seen=data['count'], first_seen=parse_datetime(data['first_seen']), last_seen=parse_datetime(data['last_seen']), ) for value, data in six.iteritems(result) ] return key_ctor( key=key, values_seen=totals.get('values_seen', 0), count=totals.get('count', 0), top_values=top_values )
def search(self, search_value, search_key=None): """ Searches and returns a ShipmentTrack instance for the given keyword. If nothing found, raises :class:`NoMatchFound` exception. """ url = self.config.get_url(search_value, search_key) resource_as_xml = urllib.urlopen(url).read() resource_as_json = xml2json.xml2json(resource_as_xml) resource_as_dict = json.loads(resource_as_json) if not isinstance(resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"], dict): raise NoMatchFound("No match found for %s" % (search_value)) st = ShipmentTrack( SEARCHON=parse_datetime(resource_as_dict["SHIPMENTTRACK"]["SEARCHON"]), AWB=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["AWB"], CONSIGNEENAME=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["CONSIGNEENAME"], CONSIGNORNAME=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["CONSIGNORNAME"], CURRENTSTATUS=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["CURRENTSTATUS"], DESTINATION=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["DESTINATION"], ORIGIN=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["ORIGIN"], PICKUPDATE=parse_datetime(resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["PICKUPDATE"]), SHIPMENTREFERENCENUMBER=resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["SHIPMENTREFERENCENUMBER"], TOTALWEIGHT=Decimal(resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["TOTALWEIGHT"])) cps_raw = resource_as_dict["SHIPMENTTRACK"]["SHIPMENTREPORT"]["CHECKPOINTDETAILS"]["CHECKPOINTS"] cps_tba = [] if isinstance(cps_raw, list): for cp in cps_raw: cps_tba.append(ShipmentTrackCP( CHECKDATE=cp["CHECKDATE"], CHECKPOINT=cp["CHECKPOINT"], CHECKPOINTDESCRIPTION=cp["CHECKPOINTDESCRIPTION"], CHECKTIME=cp["CHECKTIME"], LOCATIONNAME=cp["LOCATIONNAME"], CHECKDATETIME=parse_datetime("%s %s" % (cp["CHECKDATE"], cp["CHECKTIME"])))) else: cp = cps_raw cps_tba.append(ShipmentTrackCP( CHECKDATE=cp["CHECKDATE"], CHECKPOINT=cp["CHECKPOINT"], CHECKPOINTDESCRIPTION=cp["CHECKPOINTDESCRIPTION"], CHECKTIME=cp["CHECKTIME"], LOCATIONNAME=cp["LOCATIONNAME"], CHECKDATETIME=parse_datetime("%s %s" % (cp["CHECKDATE"], cp["CHECKTIME"])))) cps_tba = sorted(cps_tba, key=lambda x: x["CHECKDATETIME"]) for cp in cps_tba: st.add_cp(cp) return st
def __init__(self, **kwargs):
    self.key = kwargs.get('key')
    if 'expiration' in kwargs:
        self.expiration = parse_datetime(kwargs['expiration'])
    else:
        self.expiration = None
def update(self, oauth_token):
    logging.debug("Updating issue %i" % self.number)
    # Record basic information about this pull request
    issue_response = raw_github_request(PULLS_BASE + '/%i' % self.number,
                                        oauth_token=oauth_token, etag=self.etag)
    if issue_response is None:
        logging.debug("Issue %i hasn't changed since last visit; skipping" % self.number)
        return
    self.pr_json = json.loads(issue_response.content)
    self.etag = issue_response.headers["ETag"]
    updated_at = (parse_datetime(self.pr_json['updated_at'])
                  .astimezone(tz.tzutc()).replace(tzinfo=None))
    self.user = self.pr_json['user']['login']
    self.updated_at = updated_at
    self.state = self.pr_json['state']

    # TODO: will miss comments if we exceed the pagination limit:
    comments_response = raw_github_request(ISSUES_BASE + '/%i/comments' % self.number,
                                           oauth_token=oauth_token, etag=self.comments_etag)
    if comments_response is not None:
        self.comments_json = json.loads(comments_response.content)
        self.comments_etag = comments_response.headers["ETag"]

    files_response = raw_github_request(PULLS_BASE + "/%i/files" % self.number,
                                        oauth_token=oauth_token, etag=self.files_etag)
    if files_response is not None:
        self.files_json = json.loads(files_response.content)
        self.files_etag = files_response.headers["ETag"]

    self.cached_last_jenkins_outcome = None
    self.last_jenkins_outcome  # force recomputation of Jenkins outcome
    self.cached_commenters = self._compute_commenters()

    # Write our modifications back to the database
    self.put()
def test_releases_request(self): now = parse_datetime('2018-03-09T01:00:00Z') project = self.create_project() release = Release.objects.create( organization_id=self.organization.id, version='version X', date_added=now, ) release.add_project(project) dts = [now + timedelta(hours=i) for i in range(4)] with responses.RequestsMock() as rsps: def snuba_response(request): body = json.loads(request.body) assert body['aggregations'] == [['count()', None, 'aggregate']] assert body['project'] == [project.id] assert body['groupby'] == ['release', 'time'] assert ['release', 'IN', ['version X']] in body['conditions'] return (200, {}, json.dumps({ 'data': [{'release': 'version X', 'time': '2018-03-09T01:00:00Z', 'aggregate': 100}], 'meta': [{'name': 'release'}, {'name': 'time'}, {'name': 'aggregate'}] })) rsps.add_callback( responses.POST, settings.SENTRY_SNUBA + '/query', callback=snuba_response) results = self.db.get_range( TSDBModel.release, [release.id], dts[0], dts[-1], rollup=3600) assert results == { release.id: [ (int(to_timestamp(d)), 100 if d == now else 0) for d in dts] }
def test_groups_request(self): now = parse_datetime('2018-03-09T01:00:00Z') dts = [now + timedelta(hours=i) for i in range(4)] project = self.create_project() group = self.create_group(project=project) GroupHash.objects.create(project=project, group=group, hash='0' * 32) group2 = self.create_group(project=project) GroupHash.objects.create(project=project, group=group2, hash='1' * 32) with responses.RequestsMock() as rsps: def snuba_response(request): body = json.loads(request.body) assert body['aggregations'] == [['count()', None, 'aggregate']] assert body['project'] == [project.id] assert body['groupby'] == ['issue', 'time'] # Assert issue->hash map is generated, but only for referenced issues assert [group.id, ['0' * 32]] in body['issues'] assert [group2.id, ['1' * 32]] not in body['issues'] return (200, {}, json.dumps({ 'data': [{'time': '2018-03-09T01:00:00Z', 'issue': 1, 'aggregate': 100}], 'meta': [{'name': 'time'}, {'name': 'issue'}, {'name': 'aggregate'}] })) rsps.add_callback( responses.POST, settings.SENTRY_SNUBA + '/query', callback=snuba_response) results = self.db.get_range(TSDBModel.group, [group.id], dts[0], dts[-1]) assert results is not None
def to_python(obj, in_dict, str_keys=None, date_keys=None, int_keys=None,
              object_map=None, bool_keys=None, dict_keys=None, **kwargs):
    """Extends a given object for API Consumption.

    :param obj: Object to extend.
    :param in_dict: Dict to extract data from.
    :param str_keys: List of in_dict keys that will be extracted as strings.
    :param date_keys: List of in_dict keys that will be extracted as datetimes.
    :param object_map: Dict of {key, obj} map, for nested object results.
    """
    d = dict()

    if str_keys:
        for in_key in str_keys:
            d[in_key] = in_dict.get(in_key)

    if date_keys:
        for in_key in date_keys:
            in_date = in_dict.get(in_key)
            try:
                out_date = parse_datetime(in_date)
            except TypeError:
                out_date = None
            d[in_key] = out_date
def to_api(in_dict, int_keys=None, date_keys=None, bool_keys=None):
    """Extends a given object for API Production."""
    # Cast all int_keys to int()
    if int_keys:
        for in_key in int_keys:
            if (in_key in in_dict) and (in_dict.get(in_key, None) is not None):
                in_dict[in_key] = int(in_dict[in_key])

    # Cast all date_keys to datetime.isoformat
    if date_keys:
        for in_key in date_keys:
            if (in_key in in_dict) and (in_dict.get(in_key, None) is not None):
                _from = in_dict[in_key]
                if isinstance(_from, basestring):
                    dtime = parse_datetime(_from)
                elif isinstance(_from, datetime):
                    dtime = _from
                in_dict[in_key] = dtime.isoformat()
            elif (in_key in in_dict) and in_dict.get(in_key, None) is None:
                del in_dict[in_key]

    # Remove all Nones
    for k, v in in_dict.items():
        if v is None:
            del in_dict[k]

    return in_dict
def test_environment_request(self): now = parse_datetime('2018-03-09T01:00:00Z') project = self.create_project() env = self.create_environment(project=project, name="prod") dts = [now + timedelta(hours=i) for i in range(4)] with responses.RequestsMock() as rsps: def snuba_response(request): body = json.loads(request.body) assert body['aggregations'] == [['count()', None, 'aggregate']] assert body['project'] == [project.id] assert body['groupby'] == ['project_id', 'time'] assert ['environment', 'IN', ['prod']] in body['conditions'] return (200, {}, json.dumps({ 'data': [{'project_id': project.id, 'time': '2018-03-09T01:00:00Z', 'aggregate': 100}], 'meta': [{'name': 'project_id'}, {'name': 'time'}, {'name': 'aggregate'}] })) rsps.add_callback( responses.POST, settings.SENTRY_SNUBA + '/query', callback=snuba_response) results = self.db.get_range(TSDBModel.project, [project.id], dts[0], dts[-1], environment_id=env.id, rollup=3600) assert results == { project.id: [ (int(to_timestamp(d)), 100 if d == now else 0) for d in dts] }
def strings_to_dates(model, dictionary):
    """Returns a new dictionary with all the mappings of `dictionary` but
    with date strings mapped to :class:`datetime.datetime` objects.

    The keys of `dictionary` are names of fields in the model specified in the
    constructor of this class. The values are values to set on these fields.
    If a field name corresponds to a field in the model which is a
    :class:`sqlalchemy.types.Date` or :class:`sqlalchemy.types.DateTime`, then
    the returned dictionary will have the corresponding
    :class:`datetime.datetime` Python object as the value of that mapping in
    place of the string.

    This function outputs a new dictionary; it does not modify the argument.
    """
    result = {}
    for fieldname, value in dictionary.iteritems():
        if is_date_field(model, fieldname) and value is not None:
            if value.strip() == '':
                result[fieldname] = None
            else:
                result[fieldname] = parse_datetime(value)
        else:
            result[fieldname] = value
    return result
def validate_query(query, request, user): """ Validate custom data input Confirms that the uploaded file is a valid CSV or tab file and, if so, returns some metadata. :param dict query: Query parameters, from client-side. :param request: Flask request :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ # do we have an uploaded file? if "option-data_upload" not in request.files: raise QueryParametersException("No file was offered for upload.") file = request.files["option-data_upload"] if not file: raise QueryParametersException("No file was offered for upload.") encoding = sniff_encoding(file) wrapped_file = io.TextIOWrapper(file, encoding=encoding) sample = wrapped_file.read(1024 * 1024) wrapped_file.seek(0) dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t")) # With validated csvs, save as is but make sure the raw file is sorted reader = csv.DictReader(wrapped_file, dialect=dialect) try: fields = reader.fieldnames except UnicodeDecodeError: raise QueryParametersException( "Uploaded file is not a well-formed CSV or TAB file.") # check if all required fields are present required = ("id", "thread_id", "subject", "author", "body", "timestamp") missing = [] for field in required: if field not in reader.fieldnames: missing.append(field) if missing: raise QueryParametersException( "The following required columns are not present in the csv file: %s" % ", ".join(missing)) try: row = reader.__next__() try: parse_datetime(row["timestamp"]) except ValueError: raise QueryParametersException( "Your 'timestamp' column does not use a recognisable format (yyyy-mm-dd hh:mm:ss is recommended)" ) except StopIteration: pass wrapped_file.detach() # Whether to strip the HTML tags strip_html = False if query.get("strip_html"): strip_html = True # return metadata - the filename is sanitised and serves no purpose at # this point in time, but can be used to uniquely identify a dataset disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]") return { "filename": disallowed_characters.sub("", file.filename), "time": time.time(), "datasource": "custom", "board": "upload", "strip_html": strip_html }
def test_result_shape(self): """ Tests that the results from the different TSDB methods have the expected format. """ now = parse_datetime('2018-03-09T01:00:00Z') project_id = 194503 dts = [now + timedelta(hours=i) for i in range(4)] with responses.RequestsMock() as rsps: def snuba_response(request): body = json.loads(request.body) aggs = body.get('aggregations', []) meta = [{ 'name': col } for col in body['groupby'] + [a[2] for a in aggs]] datum = {col['name']: 1 for col in meta} datum['project_id'] = project_id if 'time' in datum: datum['time'] = '2018-03-09T01:00:00Z' for agg in aggs: if agg[0].startswith('topK'): datum[agg[2]] = [99] return (200, {}, json.dumps({'data': [datum], 'meta': meta})) rsps.add_callback(responses.POST, settings.SENTRY_SNUBA + '/query', callback=snuba_response) results = self.db.get_most_frequent( TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]) assert has_shape(results, {1: [(1, 1.0)]}) results = self.db.get_most_frequent_series( TSDBModel.frequent_issues_by_project, [project_id], dts[0], dts[0]) assert has_shape(results, {1: [(1, {1: 1.0})]}) items = { project_id: (0, 1, 2) # {project_id: (issue_id, issue_id, ...)} } results = self.db.get_frequency_series( TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]) assert has_shape(results, {1: [(1, {1: 1})]}) results = self.db.get_frequency_totals( TSDBModel.frequent_issues_by_project, items, dts[0], dts[-1]) assert has_shape(results, {1: {1: 1}}) results = self.db.get_range(TSDBModel.project, [project_id], dts[0], dts[-1]) assert has_shape(results, {1: [(1, 1)]}) results = self.db.get_distinct_counts_series( TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]) assert has_shape(results, {1: [(1, 1)]}) results = self.db.get_distinct_counts_totals( TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]) assert has_shape(results, {1: 1}) results = self.db.get_distinct_counts_union( TSDBModel.users_affected_by_project, [project_id], dts[0], dts[-1]) assert has_shape(results, 1)
def get_group_tag_keys_and_top_values( self, project_id, group_id, environment_ids, user=None, keys=None, value_limit=TOP_VALUES_DEFAULT_LIMIT, **kwargs ): # Similar to __get_tag_key_and_top_values except we get the top values # for all the keys provided. value_limit in this case means the number # of top values for each key, so the total rows returned should be # num_keys * limit. # First get totals and unique counts by key. keys_with_counts = self.get_group_tag_keys(project_id, group_id, environment_ids, keys=keys) # Then get the top values with first_seen/last_seen/count for each filters = {"project_id": get_project_list(project_id)} if environment_ids: filters["environment"] = environment_ids if keys is not None: filters["tags_key"] = keys if group_id is not None: filters["group_id"] = [group_id] conditions = kwargs.get("conditions", []) aggregations = kwargs.get("aggregations", []) aggregations += [ ["count()", "", "count"], ["min", SEEN_COLUMN, "first_seen"], ["max", SEEN_COLUMN, "last_seen"], ] values_by_key = snuba.query( start=kwargs.get("start"), end=kwargs.get("end"), groupby=["tags_key", "tags_value"], conditions=conditions, filter_keys=filters, aggregations=aggregations, orderby="-count", limitby=[value_limit, "tags_key"], referrer="tagstore.__get_tag_keys_and_top_values", ) # Then supplement the key objects with the top values for each. if group_id is None: value_ctor = TagValue else: value_ctor = functools.partial(GroupTagValue, group_id=group_id) for keyobj in keys_with_counts: key = keyobj.key values = values_by_key.get(key, []) keyobj.top_values = [ value_ctor( key=keyobj.key, value=value, times_seen=data["count"], first_seen=parse_datetime(data["first_seen"]), last_seen=parse_datetime(data["last_seen"]), ) for value, data in six.iteritems(values) ] return keys_with_counts
def get_metadata(arxiv): """Get metadata about an arxiv publication from website. Scrapes the arXiv webpage corresponding to the paper with the `arxiv` identifier and return the metadata for the paper in a dictionary. Parameters ---------- arxiv : str ArXiv identifier. Returns ------- metadata : dict Dictionary with metadata. Notes ----- This function queries arXiv. It must not be used to crawl arXiv. It does not look at robots.txt. This function currently uses 'abs' HTML pages and not the arXiv API or https://arxiv.org/help/oa/index which is the approved way. References ---------- - https://arxiv.org - https://arxiv.org/help/robots Examples -------- >>> metadata = get_metadata('1503.00759') >>> metadata['doi'] == '10.1109/JPROC.2015.2483592' True """ arxiv = arxiv.strip() url = 'https://arxiv.org/abs/' + arxiv headers = {'User-agent': USER_AGENT} response = requests.get(url, headers=headers) tree = etree.HTML(response.content) submissions = tree.xpath('//div[@class="submission-history"]/text()') datetime_as_string = submissions[-1][5:30] isodatetime = parse_datetime(datetime_as_string).isoformat() subjects = tree.xpath('//td[@class="tablecell subjects"]/span/text()' '|' '//td[@class="tablecell subjects"]/text()') arxiv_classifications = [ match for subject in subjects for match in re.findall(r'\((.*?)\)', subject) ] metadata = { 'arxiv': arxiv, 'authornames': tree.xpath('//div[@class="authors"]/a/text()'), 'full_text_url': 'https://arxiv.org/pdf/' + arxiv + '.pdf', 'publication_date': isodatetime[:10], 'title': re.sub(r'\s+', ' ', tree.xpath('//h1/text()')[-1].strip()), 'arxiv_classifications': arxiv_classifications, } # Optional DOI doi = tree.xpath('//td[@class="tablecell doi"]/a/text()') if not doi: doi = tree.xpath('//td[@class="tablecell msc_classes"]/a/text()') if doi: metadata['doi'] = doi[0] return metadata
def _create_ludwig_dataframe(self, mode): has_heavy_data = False col_map = {} if mode == 'train': df = self.transaction.input_data.train_df elif mode == 'predict': df = self.transaction.input_data.data_frame elif mode == 'validate': df = self.transaction.input_data.validation_df elif mode == 'test': df = self.transaction.input_data.test_df else: raise Exception(f'Unknown mode specified: "{mode}"') model_definition = {'input_features': [], 'output_features': []} data = {} if self.transaction.lmd['model_order_by'] is None: timeseries_cols = [] else: timeseries_cols = list( map(lambda x: x[0], self.transaction.lmd['model_order_by'])) for col in df.columns: tf_col = get_tensorflow_colname(col) col_map[tf_col] = col # Handle malformed columns if col in self.transaction.lmd['columns_to_ignore']: continue data[tf_col] = [] col_stats = self.transaction.lmd['column_stats'][col] data_subtype = col_stats['data_subtype'] ludwig_dtype = None encoder = None cell_type = None in_memory = None height = None width = None if col in timeseries_cols: encoder = 'rnn' cell_type = 'rnn' ludwig_dtype = 'order_by_col' if data_subtype in DATA_SUBTYPES.ARRAY: encoder = 'rnn' cell_type = 'rnn' ludwig_dtype = 'sequence' elif data_subtype in (DATA_SUBTYPES.INT, DATA_SUBTYPES.FLOAT): ludwig_dtype = 'numerical' elif data_subtype in (DATA_SUBTYPES.BINARY): ludwig_dtype = 'category' elif data_subtype in (DATA_SUBTYPES.DATE): if col not in self.transaction.lmd['predict_columns']: ludwig_dtype = 'date' else: ludwig_dtype = 'category' elif data_subtype in (DATA_SUBTYPES.TIMESTAMP): ludwig_dtype = 'numerical' elif data_subtype in (DATA_SUBTYPES.SINGLE, DATA_SUBTYPES.MULTIPLE): ludwig_dtype = 'category' elif data_subtype in (DATA_SUBTYPES.IMAGE): has_heavy_data = True ludwig_dtype = 'image' encoder = 'stacked_cnn' in_memory = True height = 256 width = 256 elif data_subtype in (DATA_SUBTYPES.TEXT): ludwig_dtype = 'text' else: # @TODO Maybe regress to some other similar subtype or use the principal data type for certain values self.transaction.log.error( f'The Ludwig backend doesn\'t support the "{data_subtype}" data type !' 
) estr = f'Data subtype "{data_subtype}" no supported by Ludwig model backend' raise Exception(estr) custom_logic_continue = False for index, row in df.iterrows(): if ludwig_dtype == 'order_by_col': ts_data_point = row[col] try: ts_data_point = float(ts_data_point) except: ts_data_point = parse_datetime( ts_data_point).timestamp() data[tf_col].append(ts_data_point) elif ludwig_dtype == 'sequence': arr_str = row[col] if arr_str is not None: arr = list( map( float, arr_str.rstrip(']').lstrip('[').split( self.transaction.lmd['column_stats'][col] ['separator']))) else: arr = '' data[tf_col].append(arr) # Date isn't supported yet, so we hack around it elif ludwig_dtype == 'date': if col in data: data.pop(col) data[tf_col + '_year'] = [] data[tf_col + '_month'] = [] data[tf_col + '_day'] = [] model_definition['input_features'].append({ 'name': col + '_year', 'type': 'category' }) model_definition['input_features'].append({ 'name': col + '_month', 'type': 'category' }) model_definition['input_features'].append({ 'name': col + '_day', 'type': 'numerical' }) date = parse_datetime(row[col]) data[tf_col + '_year'].append(date.year) data[tf_col + '_month'].append(date.month) data[tf_col + '_day'].append(date.day) custom_logic_continue = True if col in timeseries_cols: timeseries_cols.remove(col) timeseries_cols.append(col + '_day') timeseries_cols.append(col + '_month') timeseries_cols.append(col + '_year') elif data_subtype in (DATA_SUBTYPES.TIMESTAMP): if row[col] is None: unix_ts = 0 else: unix_ts = parse_datetime(row[col]).timestamp() data[tf_col].append(unix_ts) elif data_subtype in (DATA_SUBTYPES.FLOAT): if type(row[col]) == str: data[tf_col].append( float(str(row[col]).replace(',', '.'))) else: data[tf_col].append(row[col]) elif data_subtype in (DATA_SUBTYPES.INT): if type(row[col]) == str: data[tf_col].append( round(float(str(row[col]).replace(',', '.')))) else: data[tf_col].append(row[col]) elif data_subtype in (DATA_SUBTYPES.IMAGE): if os.path.isabs(row[col]): data[tf_col].append(row[col]) else: data[tf_col].append(os.path.join( os.getcwd(), row[col])) else: data[tf_col].append(row[col]) if custom_logic_continue: continue if col not in self.transaction.lmd['predict_columns']: input_def = {'name': tf_col, 'type': ludwig_dtype} if encoder is not None: input_def['encoder'] = encoder if cell_type is not None: input_def['cell_type'] = cell_type if in_memory is not None: input_def['in_memory'] = in_memory if height is not None and width is not None: input_def['height'] = height input_def['width'] = width input_def['resize_image'] = True input_def['resize_method'] = 'crop_or_pad' model_definition['preprocessing'] = { 'image': { 'height': height, 'width': width, 'resize_image': True, 'resize_method': 'crop_or_pad', 'num_channels': 3 } } model_definition['input_features'].append(input_def) else: output_def = {'name': tf_col, 'type': ludwig_dtype} model_definition['output_features'].append(output_def) df = pd.DataFrame(data=data) if len(timeseries_cols) > 0: df.sort_values(timeseries_cols) return df, model_definition, timeseries_cols, has_heavy_data, col_map
def parse_date(date_str):
    return parse_datetime(date_str) if date_str is not None else None
def deserialize(self, text):
    if text is not None:
        return parse_datetime(text, fuzzy=False)
def to_python(obj, in_dict, str_keys=None, date_keys=None, int_keys=None, float_keys=None, object_map=None, bool_keys=None, dict_keys=None, **kwargs): """Extends a given object for API Consumption. :param obj: Object to extend. :param in_dict: Dict to extract data from. :param string_keys: List of in_dict keys that will be extracted as strings. :param date_keys: List of in_dict keys that will be extrad as datetimes. :param object_map: Dict of {key, obj} map, for nested object results. """ d = dict() if str_keys: for in_key in str_keys: d[in_key] = in_dict.get(in_key) if date_keys: for in_key in date_keys: in_date = in_dict.get(in_key) try: out_date = parse_datetime(in_date) except Exception as e: #raise e out_date = None d[in_key] = out_date if int_keys: for in_key in int_keys: if (in_dict is not None) and (in_dict.get(in_key) is not None): d[in_key] = int(in_dict.get(in_key)) if float_keys: for in_key in float_keys: if (in_dict is not None) and (in_dict.get(in_key) is not None): d[in_key] = float(in_dict.get(in_key)) if bool_keys: for in_key in bool_keys: if in_dict.get(in_key) is not None: d[in_key] = bool(in_dict.get(in_key)) if dict_keys: for in_key in dict_keys: if in_dict.get(in_key) is not None: d[in_key] = dict(in_dict.get(in_key)) if object_map: for (k, v) in object_map.items(): if in_dict.get(k): d[k] = v.new_from_dict(in_dict.get(k)) obj.__dict__.update(d) obj.__dict__.update(kwargs) # Save the dictionary, for write comparisons. # obj._cache = d # obj.__cache = in_dict return obj
def parse_snuba_datetime(value):
    """Parses a datetime value from snuba."""
    return parse_datetime(value)
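# Usage sketch, not from the source: assuming parse_datetime here is
# dateutil.parser.parse, an ISO-8601 timestamp as returned by Snuba becomes a
# timezone-aware datetime and round-trips through isoformat(). The timestamp
# below is illustrative only.
from dateutil.parser import parse as parse_datetime

dt = parse_datetime("2018-03-09T01:00:00+00:00")
assert dt.isoformat() == "2018-03-09T01:00:00+00:00"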
self.relationship_id = self.relationship_id self.relationship['cwe_id'] = self.relationship_id # make parser parser = make_parser() ch = CWEHandler() parser.setContentHandler(ch) # check modification date try: (f, r) = Configuration.getFeedData('cwe') except Exception as e: print(e) sys.exit("Cannot open url %s. Bad URL or not connected to the internet?" % (Configuration.getFeedURL("cwe"))) lastmodified = parse_datetime(r.headers['last-modified'], ignoretz=True) i = db.getLastModified('cwe') if i is not None and not args.f: if lastmodified == i: print("Not modified") sys.exit(0) # parse xml and store in database parser.parse(f) cweList = [] for cwe in progressbar(ch.cwe): cwe['Description'] = cwe['Description'].replace("\t\t\t\t\t", " ") if args.v: print(cwe) cweList.append(cwe)
def get_point_values(d, date_trunc=DEFAULT_RES, value_func='avg', trange=DEFAULT_RANGE, ts_as_datetime=False): conn = get_crate_connection() cursor = conn.cursor() # validate the date_trunc if date_trunc not in ['day', 'hour', 'minute', 'second']: date_trunc = DEFAULT_RES # use different queries for Number type sensors is_number = 'Number' == d.kind is_bool = 'Bool' == d.kind if not trange: trange = DEFAULT_RANGE # convert the range end = datetime.utcnow() start = datetime.utcnow() if 'today' == trange: start -= timedelta(days=1) elif 'yesterday' == trange: start -= timedelta(days=2) end -= timedelta(days=1) elif len(trange) > 7 and ' months' == trange[-7:]: start -= timedelta(days=30 * int(trange[:-7])) elif len(trange) > 6 and ' month' == trange[-6:]: start -= timedelta(days=30 * int(trange[:-6])) elif len(trange) > 5 and ' days' == trange[-5:]: start -= timedelta(days=int(trange[:-5])) elif len(trange) > 1 and 'h' == trange[-1:]: start -= timedelta(hours=int(trange[:-1])) elif len(trange) > 1 and 'm' == trange[-1:]: start -= timedelta(minutes=int(trange[:-1])) else: # can be given as <date> or <date>,<date> dr = trange.split(',') start = parse_datetime(dr[0]) if len(dr) > 1: end = parse_datetime(dr[1]) logger.info("Getting data points for range %s -- %s", start, end) if is_number: sql = """SELECT DATE_TRUNC('{}', ts) as timest, {}(double_value) FROM "volttron"."data" WHERE topic = ? AND ts > ? AND ts <= ? GROUP BY timest ORDER BY timest DESC;""".format( date_trunc, value_func) elif is_bool: # use MIN as function since we query string_value sql = """SELECT DATE_TRUNC('{}', ts) as timest, MIN(string_value), {}(double_value) FROM "volttron"."data" WHERE topic = ? AND ts > ? AND ts <= ? GROUP BY timest ORDER BY timest DESC;""".format( date_trunc, value_func) else: sql = """SELECT ts, string_value FROM "volttron"."data" WHERE topic = ? AND ts > ? AND ts <= ? ORDER BY ts DESC;""" cursor.execute(sql, ( d.topic, start, end, )) data = [] while True: result = cursor.fetchone() if result is None: break ts = result[0] value = result[1] if is_bool: if value and (value == 't' or value != '0'): value = 1 else: value = round(result[2]) if ts_as_datetime: # convert from epoch to datetime directly ts = datetime.utcfromtimestamp( ts // 1000).replace(microsecond=0).replace(tzinfo=timezone.utc) data.append([ts, value]) logger.info("Got %s data points for %s", len(data), d.entity_id) cursor.close() conn.close() return list(reversed(data))
def filter_logic(f, limit, skip): query = [] # retrieving lists if f['blacklistSelect'] == "on": regexes = db.getRules('blacklist') if len(regexes) != 0: exp = "^(?!" + "|".join(regexes) + ")" query.append({ '$or': [{ 'vulnerable_configuration': re.compile(exp) }, { 'vulnerable_configuration': { '$exists': False } }, { 'vulnerable_configuration': [] }] }) if f['whitelistSelect'] == "hide": regexes = db.getRules('whitelist') if len(regexes) != 0: exp = "^(?!" + "|".join(regexes) + ")" query.append({ '$or': [{ 'vulnerable_configuration': re.compile(exp) }, { 'vulnerable_configuration': { '$exists': False } }, { 'vulnerable_configuration': [] }] }) if f['unlistedSelect'] == "hide": wlregexes = compile(db.getRules('whitelist')) blregexes = compile(db.getRules('blacklist')) query.append({ '$or': [{ 'vulnerable_configuration': { '$in': wlregexes } }, { 'vulnerable_configuration': { '$in': blregexes } }] }) if f['rejectedSelect'] == "hide": exp = "^(?!\*\* REJECT \*\*\s+DO NOT USE THIS CANDIDATE NUMBER.*)" query.append({'summary': re.compile(exp)}) # plugin filters query.extend(plugManager.doFilter(f, **pluginArgs())) # cvss logic if f['cvssSelect'] == "above": query.append({'cvss': {'$gt': float(f['cvss'])}}) elif f['cvssSelect'] == "equals": query.append({'cvss': float(f['cvss'])}) elif f['cvssSelect'] == "below": query.append({'cvss': {'$lt': float(f['cvss'])}}) # date logic if f['timeSelect'] != "all": startDate = parse_datetime(f['startDate'], ignoretz=True, dayfirst=True) endDate = parse_datetime(f['endDate'], ignoretz=True, dayfirst=True) if f['timeSelect'] == "from": query.append({f['timeTypeSelect']: {'$gt': startDate}}) if f['timeSelect'] == "until": query.append({f['timeTypeSelect']: {'$lt': endDate}}) if f['timeSelect'] == "between": query.append( {f['timeTypeSelect']: { '$gt': startDate, '$lt': endDate }}) if f['timeSelect'] == "outside": query.append({ '$or': [{ f['timeTypeSelect']: { '$lt': startDate } }, { f['timeTypeSelect']: { '$gt': endDate } }] }) cve = db.getCVEs(limit=limit, skip=skip, query=query) # marking relevant records if f['whitelistSelect'] == "on": cve = whitelist_mark(cve) if f['blacklistSelect'] == "mark": cve = blacklist_mark(cve) plugManager.mark(cve, **pluginArgs()) cve = list(cve) return cve
def after_create(query, dataset, request): """ Hook to execute after the dataset for this source has been created In this case, it is used to save the uploaded file to the dataset's result path, and finalise the dataset metadata. :param dict query: Sanitised query parameters :param DataSet dataset: Dataset created for this query :param request: Flask request submitted for its creation """ strip_html = query.get("strip_html") file = request.files["option-data_upload"] file.seek(0) # detect encoding - UTF-8 with or without BOM encoding = sniff_encoding(file) wrapped_file = io.TextIOWrapper(file, encoding=encoding) sample = wrapped_file.read(1024 * 1024) wrapped_file.seek(0) dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t")) # With validated csvs, save as is but make sure the raw file is sorted reader = csv.DictReader(wrapped_file, dialect=dialect) with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv: # Sort by timestamp # note that this relies on the timestamp format to be sortable # but the alternative - first converting timestamps and then # sorting - would be quite intensive dataset.update_status("Sorting file by date") sorted_reader = sorted( reader, key=lambda row: row["timestamp"] if isinstance(row["timestamp"], str) else "") dataset.update_status("Writing to file") fieldnames = list(reader.fieldnames) if "unix_timestamp" not in fieldnames: fieldnames.append("unix_timestamp") writer = csv.DictWriter(output_csv, fieldnames=fieldnames) writer.writeheader() for row in sorted_reader: try: sanitised_time = parse_datetime(row["timestamp"]) row["timestamp"] = sanitised_time.strftime( "%Y-%m-%d %H:%I:%S") row["unix_timestamp"] = sanitised_time.timestamp() except (TypeError, ValueError): # bad format, skip continue if strip_html: row["body"] = strip_tags(row["body"]) writer.writerow(row) file.close() with dataset.get_results_path().open(encoding="utf-8") as input: if file.filename.endswith(".tab"): reader = csv.DictReader(input, delimiter="\t", quoting=csv.QUOTE_NONE) else: reader = csv.DictReader(input) dataset.finish(sum(1 for line in reader)) dataset.update_status("Result processed") dataset.update_version(get_software_version())
def to_python( obj, in_dict, str_keys=None, date_keys=None, int_keys=None, real_keys=None, object_map=None, array_map=None, bool_keys=None, dict_keys=None, enums=None, **kwargs): """Extends a given object for API Consumption. :param obj: Object to extend. :param in_dict: Dict to extract data from. :param string_keys: List of in_dict keys that will be extracted as strings. :param date_keys: List of in_dict keys that will be extrad as datetimes. :param object_map: Dict of {key, obj} map, for nested object results. :param array_map: Dict of {key, obj} map, for nested object results. :param bool_keys: List of in_dict keys that will be extracted as bools :param dict_keys: Dict of {key, obj} map, for nested object results. :param enums: Dict of {key, `Enum`} map, that will be extracted as `Enum` """ d = dict() if str_keys: for in_key in str_keys: private_name = '_' + in_key d[private_name] = in_dict.get(in_key) if date_keys: for in_key in date_keys: private_name = '_' + in_key in_date = in_dict.get(in_key) if in_date is not None: try: out_date = parse_datetime(in_date) except Exception as e: # if there is no date we fallback to None, exception is not required here ##raise ZangException(e) out_date = None d[private_name] = out_date else: d[private_name] = None if int_keys: for in_key in int_keys: private_name = '_' + in_key in_int = in_dict.get(in_key) if in_int is not None and in_int != '': try: out_int = int(in_int) except ValueError as e: raise ZangException(e) out_int = None d[private_name] = out_int else: d[private_name] = None if real_keys: for in_key in real_keys: if (in_dict is not None) and (in_dict.get(in_key) is not None): private_name = '_' + in_key d[private_name] = float(in_dict.get(in_key)) if bool_keys: for in_key in bool_keys: if in_dict.get(in_key) is not None: private_name = '_' + in_key value = in_dict.get(in_key) if isinstance(value, str_classes): value = value.lower() == 'true' d[private_name] = value # ENUMS if enums: for in_key in enums: class_ = enums[in_key] if (in_dict is not None) and (in_dict.get(in_key) is not None): private_name = '_' + in_key dictValue = in_dict.get(in_key) # code below smells, need to clean it up try: value = class_(dictValue) except Exception as e: try: value = class_(dictValue.title()) except Exception as e: value = class_(dictValue.capitalize()) d[private_name] = value # LISTS if dict_keys: for in_key in dict_keys: if in_dict.get(in_key) is not None: private_name = '_' + in_key d[private_name] = dict(in_dict.get(in_key)) if object_map: for (k, v) in object_map.items(): if in_dict.get(k): private_name = '_' + k d[private_name] = v.new_from_dict(in_dict.get(k)) if array_map: for (k, v) in array_map.items(): if in_dict.get(k): private_name = '_' + k d[private_name] = [v.new_from_dict(i) for i in in_dict.get(k)] obj.__dict__.update(d) obj.__dict__.update(kwargs) # Save the dictionary, for write comparisons. # obj._cache = d # obj.__cache = in_dict return obj
def execute(self, ecosystem, bucket_name, object_key, from_date=None, to_date=None): """Aggregate gathered topics and store them on S3. :param ecosystem: ecosystem name for which topics should be gathered :param bucket_name: name of the destination bucket to which topics should be stored :param object_key: name of the object under which aggregated topics should be stored :param from_date: date limitation for task result queries :param to_date: date limitation for taks result queries """ if from_date is not None: from_date = parse_datetime(from_date) if to_date is not None: to_date = parse_datetime(to_date) s3 = StoragePool.get_connected_storage('S3Data') postgres = StoragePool.get_connected_storage('PackagePostgres') base_query = postgres.session.query(WorkerResult).\ join(Analysis). \ join(Version).\ join(Package).\ join(Ecosystem).\ filter(WorkerResult.error.is_(False)).\ filter(WorkerResult.worker == 'github_details').\ filter(Ecosystem.name == ecosystem) if from_date is not None: base_query = base_query.filter(Analysis.started_at > from_date).\ order_by(desc(WorkerResult.id)) if to_date is not None: base_query = base_query.filter(Analysis.started_at < to_date).\ order_by(desc(WorkerResult.id)) start = 0 topics = [] while True: try: results = base_query.slice(start, start + 10).all() except SQLAlchemyError: postgres.session.rollback() raise if not results: break self.log.info("Collecting topics, slice offset is %s", start) start += 10 for entry in results: name = entry.package.name version = entry.version.identifier self.log.debug("Aggregating topics for %s/%s/%s", ecosystem, name, version) task_result = entry.task_result if not postgres.is_real_task_result(task_result): self.log.debug( "Result was already stored on S3, retrieving from there" ) try: task_result = s3.retrieve_task_result( ecosystem, name, version, 'github_details') except: self.log.exception( "Failed to retrieve result 'github_details' from S3 " "for %s/%s/%s", ecosystem, name, version) continue topics.append({ 'topics': task_result.get('details', {}).get('topics'), 'name': name, 'ecosystem': ecosystem, 'version': version }) report = { 'ecosystem': ecosystem, 'bucket_name': bucket_name, 'object_key': object_key, 'from_date': str(from_date), 'to_date': str(to_date), 'result': topics } self._store_topics(bucket_name, object_key, report)
def process_cve_item(item=None): if item is None: return None cve = {} cve['id'] = item['cve']['CVE_data_meta']['ID'] cve['assigner'] = item['cve']['CVE_data_meta']['ASSIGNER'] cve['Published'] = parse_datetime(item['publishedDate'], ignoretz=True) cve['Modified'] = parse_datetime(item['lastModifiedDate'], ignoretz=True) for description in item['cve']['description']['description_data']: if description['lang'] == 'en': if "summary" in cve: cve['summary'] += " {}".format(description['value']) else: cve['summary'] = description['value'] if 'impact' in item: cve['access'] = {} cve['impact'] = {} if 'baseMetricV2' in item['impact']: cve['access']['authentication'] = item['impact']['baseMetricV2'][ 'cvssV2']['authentication'] cve['access']['complexity'] = item['impact']['baseMetricV2'][ 'cvssV2']['accessComplexity'] cve['access']['vector'] = item['impact']['baseMetricV2']['cvssV2'][ 'accessVector'] cve['impact']['availability'] = item['impact']['baseMetricV2'][ 'cvssV2']['availabilityImpact'] cve['impact']['confidentiality'] = item['impact']['baseMetricV2'][ 'cvssV2']['confidentialityImpact'] cve['impact']['integrity'] = item['impact']['baseMetricV2'][ 'cvssV2']['integrityImpact'] cve['cvss'] = float( item['impact']['baseMetricV2']['cvssV2']['baseScore']) cve['cvss-time'] = parse_datetime( item['lastModifiedDate'], ignoretz=True ) # NVD JSON lacks the CVSS time which was present in the original XML format cve['cvss-vector'] = item['impact']['baseMetricV2']['cvssV2'][ 'vectorString'] else: cve['cvss'] = float(5) if 'references' in item['cve']: cve['references'] = [] for ref in item['cve']['references']['reference_data']: cve['references'].append(ref['url']) if 'configurations' in item: cve['vulnerable_configuration'] = [] cve['vulnerable_product'] = [] for cpe in item['configurations']['nodes']: if 'cpe_match' in cpe: for cpeuri in cpe['cpe_match']: if cpeuri['vulnerable']: query, version_info = get_cpe_info(cpeuri) if query != {}: query["id"] = hashlib.sha1( cpeuri["cpe23Uri"].encode("utf-8") + version_info.encode("utf-8")).hexdigest() cpe_info = db.getCPEVersionInformation(query) if cpe_info: if cpe_info["cpe_name"]: for vulnerable_version in cpe_info[ "cpe_name"]: cve = add_if_missing( cve, "vulnerable_product", vulnerable_version["cpe23Uri"]) cve = add_if_missing( cve, "vulnerable_configuration", vulnerable_version["cpe23Uri"]) else: cve = add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"]) cve = add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) else: # If the cpe_match did not have any of the version start/end modifiers, # add the CPE string as it is. 
cve = add_if_missing(cve, "vulnerable_product", cpeuri["cpe23Uri"]) cve = add_if_missing(cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) else: cve = add_if_missing(cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) if 'children' in cpe: for child in cpe['children']: if 'cpe_match' in child: for cpeuri in child['cpe_match']: if cpeuri['vulnerable']: query, version_info = get_cpe_info(cpeuri) if query != {}: query["id"] = hashlib.sha1( cpeuri["cpe23Uri"].encode("utf-8") + version_info.encode("utf-8") ).hexdigest() cpe_info = db.getCPEVersionInformation( query) if cpe_info: if cpe_info["cpe_name"]: for vulnerable_version in cpe_info[ "cpe_name"]: cve = add_if_missing( cve, "vulnerable_product", vulnerable_version[ "cpe23Uri"]) cve = add_if_missing( cve, "vulnerable_configuration", vulnerable_version[ "cpe23Uri"]) else: cve = add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"]) cve = add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) else: # If the cpe_match did not have any of the version start/end modifiers, # add the CPE string as it is. cve = add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"]) cve = add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) else: cve = add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) if 'problemtype' in item['cve']: for problem in item['cve']['problemtype']['problemtype_data']: for cwe in problem[ 'description']: # NVD JSON not clear if we can get more than one CWE per CVE (until we take the last one) - NVD-CWE-Other??? list? if cwe['lang'] == 'en': cve['cwe'] = cwe['value'] if not ('cwe' in cve): cve['cwe'] = defaultvalue['cwe'] else: cve['cwe'] = defaultvalue['cwe'] cve['vulnerable_configuration_cpe_2_2'] = [] return cve
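# Illustrative sketch (assumption): process_cve_item() relies on helpers that are not
# shown in this snippet. add_if_missing() appears to append a value to a list-valued key
# only if it is not already present, and the CPE lookup id is the SHA-1 of the cpe23Uri
# concatenated with the version-range info. A minimal version of both, with
# get_cpe_info() and the db lookup left out:
import hashlib


def add_if_missing(cve, key, value):
    if value not in cve.setdefault(key, []):
        cve[key].append(value)
    return cve


def cpe_query_id(cpe23_uri, version_info):
    return hashlib.sha1(cpe23_uri.encode("utf-8") +
                        version_info.encode("utf-8")).hexdigest()


# cpe_query_id("cpe:2.3:a:vendor:product:1.0:*:*:*:*:*:*:*", "<=1.2") yields a stable
# 40-character hex digest that can be reused as the lookup key.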
def process_cve_item(self, item=None): if item is None: return None if "ASSIGNER" not in item["cve"]["CVE_data_meta"]: item["cve"]["CVE_data_meta"]["ASSIGNER"] = None cve = { "id": item["cve"]["CVE_data_meta"]["ID"], "assigner": item["cve"]["CVE_data_meta"]["ASSIGNER"], "Published": parse_datetime(item["publishedDate"], ignoretz=True), "Modified": parse_datetime(item["lastModifiedDate"], ignoretz=True), "last-modified": parse_datetime(item["lastModifiedDate"], ignoretz=True), } for description in item["cve"]["description"]["description_data"]: if description["lang"] == "en": if "summary" in cve: cve["summary"] += " {}".format(description["value"]) else: cve["summary"] = description["value"] if "impact" in item: cve["access"] = {} cve["impact"] = {} if "baseMetricV3" in item["impact"]: cve["impact3"] = {} cve["exploitability3"] = {} cve["impact3"]["availability"] = item["impact"][ "baseMetricV3"]["cvssV3"]["availabilityImpact"] cve["impact3"]["confidentiality"] = item["impact"][ "baseMetricV3"]["cvssV3"]["confidentialityImpact"] cve["impact3"]["integrity"] = item["impact"]["baseMetricV3"][ "cvssV3"]["integrityImpact"] cve["exploitability3"]["attackvector"] = item["impact"][ "baseMetricV3"]["cvssV3"]["attackVector"] cve["exploitability3"]["attackcomplexity"] = item["impact"][ "baseMetricV3"]["cvssV3"]["attackComplexity"] cve["exploitability3"]["privilegesrequired"] = item["impact"][ "baseMetricV3"]["cvssV3"]["privilegesRequired"] cve["exploitability3"]["userinteraction"] = item["impact"][ "baseMetricV3"]["cvssV3"]["userInteraction"] cve["exploitability3"]["scope"] = item["impact"][ "baseMetricV3"]["cvssV3"]["scope"] cve["cvss3"] = float( item["impact"]["baseMetricV3"]["cvssV3"]["baseScore"]) cve["cvss3-vector"] = item["impact"]["baseMetricV3"]["cvssV3"][ "vectorString"] cve["impactScore3"] = float( item["impact"]["baseMetricV3"]["impactScore"]) cve["exploitabilityScore3"] = float( item["impact"]["baseMetricV3"]["exploitabilityScore"]) else: cve["cvss3"] = None if "baseMetricV2" in item["impact"]: cve["access"]["authentication"] = item["impact"][ "baseMetricV2"]["cvssV2"]["authentication"] cve["access"]["complexity"] = item["impact"]["baseMetricV2"][ "cvssV2"]["accessComplexity"] cve["access"]["vector"] = item["impact"]["baseMetricV2"][ "cvssV2"]["accessVector"] cve["impact"]["availability"] = item["impact"]["baseMetricV2"][ "cvssV2"]["availabilityImpact"] cve["impact"]["confidentiality"] = item["impact"][ "baseMetricV2"]["cvssV2"]["confidentialityImpact"] cve["impact"]["integrity"] = item["impact"]["baseMetricV2"][ "cvssV2"]["integrityImpact"] cve["cvss"] = float( item["impact"]["baseMetricV2"]["cvssV2"]["baseScore"]) cve["exploitabilityScore"] = float( item["impact"]["baseMetricV2"]["exploitabilityScore"]) cve["impactScore"] = float( item["impact"]["baseMetricV2"]["impactScore"]) cve["cvss-time"] = parse_datetime( item["lastModifiedDate"], ignoretz=True ) # NVD JSON lacks the CVSS time which was present in the original XML format cve["cvss-vector"] = item["impact"]["baseMetricV2"]["cvssV2"][ "vectorString"] else: cve["cvss"] = None if "references" in item["cve"]: cve["references"] = [] for ref in item["cve"]["references"]["reference_data"]: cve["references"].append(ref["url"]) if "configurations" in item: cve["vulnerable_configuration"] = [] cve["vulnerable_product"] = [] cve["vendors"] = [] cve["products"] = [] cve["vulnerable_product_stems"] = [] cve["vulnerable_configuration_stems"] = [] for cpe in item["configurations"]["nodes"]: if "cpe_match" in cpe: for cpeuri in cpe["cpe_match"]: if 
"cpe23Uri" not in cpeuri: continue if cpeuri["vulnerable"]: query, version_info = self.get_cpe_info(cpeuri) if query != {}: query["id"] = hashlib.sha1( cpeuri["cpe23Uri"].encode("utf-8") + version_info.encode("utf-8")).hexdigest() cpe_info = getCPEVersionInformation(query) if cpe_info: if cpe_info["cpe_name"]: for vulnerable_version in cpe_info[ "cpe_name"]: cve = self.add_if_missing( cve, "vulnerable_product", vulnerable_version["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration", vulnerable_version["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(vulnerable_version[ "cpe23Uri"]), ) vendor, product = self.get_vendor_product( vulnerable_version["cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem(vulnerable_version[ "cpe23Uri"]), ) else: cve = self.add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(cpeuri["cpe23Uri"]), ) vendor, product = self.get_vendor_product( cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem(cpeuri["cpe23Uri"]), ) else: # If the cpe_match did not have any of the version start/end modifiers, # add the CPE string as it is. cve = self.add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(cpeuri["cpe23Uri"]), ) vendor, product = self.get_vendor_product( cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem(cpeuri["cpe23Uri"]), ) else: cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(cpeuri["cpe23Uri"]), ) if "children" in cpe: for child in cpe["children"]: if "cpe_match" in child: for cpeuri in child["cpe_match"]: if "cpe23Uri" not in cpeuri: continue if cpeuri["vulnerable"]: query, version_info = self.get_cpe_info( cpeuri) if query != {}: query["id"] = hashlib.sha1( cpeuri["cpe23Uri"].encode("utf-8") + version_info.encode("utf-8") ).hexdigest() cpe_info = getCPEVersionInformation( query) if cpe_info: if cpe_info["cpe_name"]: for vulnerable_version in cpe_info[ "cpe_name"]: cve = self.add_if_missing( cve, "vulnerable_product", vulnerable_version[ "cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration", vulnerable_version[ "cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem( vulnerable_version[ "cpe23Uri"]), ) ( vendor, product, ) = self.get_vendor_product( vulnerable_version[ "cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem( vulnerable_version[ "cpe23Uri"]), ) else: cve = self.add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, 
"vulnerable_configuration_stems", self.stem( cpeuri["cpe23Uri"]), ) ( vendor, product, ) = self.get_vendor_product( cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem( cpeuri["cpe23Uri"]), ) else: # If the cpe_match did not have any of the version start/end modifiers, # add the CPE string as it is. if "cpe23Uri" not in cpeuri: continue cve = self.add_if_missing( cve, "vulnerable_product", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(cpeuri["cpe23Uri"]), ) vendor, product = self.get_vendor_product( cpeuri["cpe23Uri"]) cve = self.add_if_missing( cve, "vendors", vendor) cve = self.add_if_missing( cve, "products", product) cve = self.add_if_missing( cve, "vulnerable_product_stems", self.stem(cpeuri["cpe23Uri"]), ) else: if "cpe23Uri" not in cpeuri: continue cve = self.add_if_missing( cve, "vulnerable_configuration", cpeuri["cpe23Uri"], ) cve = self.add_if_missing( cve, "vulnerable_configuration_stems", self.stem(cpeuri["cpe23Uri"]), ) if "problemtype" in item["cve"]: for problem in item["cve"]["problemtype"]["problemtype_data"]: for cwe in problem[ "description"]: # NVD JSON not clear if we can get more than one CWE per CVE (until we take the last one) - # NVD-CWE-Other??? list? if cwe["lang"] == "en": cve["cwe"] = cwe["value"] if not ("cwe" in cve): cve["cwe"] = defaultvalue["cwe"] else: cve["cwe"] = defaultvalue["cwe"] cve["vulnerable_configuration_cpe_2_2"] = [] return cve
def update_collections(): file_prefix = "nvdcve-2.0-" file_suffix = ".xml.gz" file_mod = "modified" file_rec = "recent" getfile = file_prefix + file_mod + file_suffix try: (f, r) = Configuration.getFile(Configuration.getFeedURL('cve') + getfile) except: sys.exit( "Cannot open url %s. Bad URL or not connected to the internet?" % (Configuration.getFeedURL("cve") + getfile)) i = cve_db.getInfo("cves") last_modified = parse_datetime(r.headers['last-modified'], ignoretz=True) if i is not None: if last_modified == i['last-modified']: logger.info("Not modified") return "Not modified" cve_db.setColUpdate("cves", last_modified) parser = make_parser() ch = CVEHandler() parser.setContentHandler(ch) parser.parse(f) for item in ch.cves: # check if the CVE already exists. x = cve_db.getCVE(item['id']) # if so, update the entry. if x: if 'cvss' not in item: item['cvss'] = None if 'cwe' not in item: item['cwe'] = defaultvalue['cwe'] cve_db.updateCVE(item) else: cve_db.insertCVE(item) # get the 'recent' file getfile = file_prefix + file_rec + file_suffix try: (f, r) = Configuration.getFile(Configuration.getFeedURL('cve') + getfile) except: sys.exit( "Cannot open url %s. Bad URL or not connected to the internet?" % (Configuration.getFeedURL("cve") + getfile)) parser = make_parser() ch = CVEHandler() parser.setContentHandler(ch) parser.parse(f) for item in progressbar(ch.cves): # check if the CVE already exists. x = cve_db.getCVE(item['id']) # if so, update the entry. if x: if args.v: logger.info("item found : " + item['id']) if 'cvss' not in item: item['cvss'] = None else: item['cvss'] = float(item['cvss']) if 'cwe' not in item: item['cwe'] = defaultvalue['cwe'] cve_db.updateCVE(item) # if not, create it. else: cve_db.insertCVE(item) return 'success'
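# Illustrative sketch (not the project's Configuration helper): update_collections()
# skips re-parsing a feed when the HTTP Last-Modified header matches the value stored
# from the previous run. The same check with plain requests and dateutil:
import requests
from dateutil.parser import parse as parse_datetime


def feed_is_unchanged(url, stored_last_modified):
    resp = requests.head(url, timeout=30)
    resp.raise_for_status()
    last_modified = parse_datetime(resp.headers["Last-Modified"], ignoretz=True)
    return stored_last_modified is not None and last_modified == stored_last_modified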
class TomlTokenizer(object):
    PATTERNS = (
        ('bool', re.compile('(true|false)'), lambda x: x == 'true'),
        ('comment', re.compile(r'(#[\s\S]*)'), lambda x: x[1:].strip()),
        ('id', re.compile(r'([_a-zA-Z][a-zA-Z0-9_]*)'), None),
        ('section', re.compile(
            r'(\[[_a-zA-Z][a-zA-Z0-9_]*(\.[_a-zA-Z][a-zA-Z0-9_]*)*\])'),
         lambda x: x[1:-1].strip()),
        ('string', re.compile('("[^"]*")'), lambda x: unescape(x[1:-1].strip())),
        ('whitespace', re.compile(r'(\s+)'), lambda x: None),
        ('literal', re.compile(r'([,\[\]=])'), None),
        ('datetime', re.compile(
            r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}([-+]\d{2}:?\d{2}|Z))'),
         lambda x: parse_datetime(x)),
        ('float', re.compile(r'(\d+\.\d+)'), lambda x: float(x)),
        ('int', re.compile(r'(\d+)'), lambda x: int(x)),
    )

    LOGGER_NAME = 'tomless.tokenizer'

    @classmethod
    def tokenize_line(cls, line, line_no):
        offset = 0
        logger = logging.getLogger(cls.LOGGER_NAME)
        logger.debug('tokenizing line {} {}'.format(line_no, line))
        line = line.strip()
        while offset < len(line):
            current_offset = offset
            for t_type, pattern, processor in cls.PATTERNS:
                logger.debug('try pattern {} at offset {}'.format(t_type, offset))
                match = pattern.match(line, offset)
                if not match:
                    logger.debug('no match')
                    continue
                content = match.group(0)
                content_length = len(content)
                val = processor(content) if processor else content
                if t_type != 'whitespace':
                    yield TomlToken(val if t_type == 'literal' else t_type,
                                    val, line_no, offset)
                offset += content_length
                logger.debug('matched pattern {} {} ({})'.format(
                    t_type, content, content_length))
            if current_offset == offset:
                raise Exception('lex error at line {} {}: {}'.format(
                    line_no, current_offset, line))
        logger.debug('check eol:{} {} {}'.format(offset, len(line), line[offset:]))

    @classmethod
    def tokenize_content(cls, content):
        for i, line in enumerate(content.strip().splitlines()):
            for token in cls.tokenize_line(line, i + 1):
                yield token

    @classmethod
    def tokenize_file(cls, filename):
        content = ''
        with open(filename) as f:
            content = f.read()
        return list(cls.tokenize_content(content))
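# Illustrative sketch (assumption): TomlToken is not defined in the snippet above; a
# namedtuple with (type, value, line, column) fields matches how tokenize_line() builds
# tokens. Minimal usage on an in-memory TOML fragment:
from collections import namedtuple

TomlToken = namedtuple('TomlToken', ['type', 'value', 'line', 'column'])

sample = """
# server settings
[server]
port = 8080
started = 2021-06-01T12:00:00Z
"""

# for token in TomlTokenizer.tokenize_content(sample):
#     print(token)
# would emit 'comment', 'section', 'id', '=', 'int' and 'datetime' tokens, with the
# datetime value already converted by parse_datetime.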
def get_snuba_translators(filter_keys, is_grouprelease=False): """ Some models are stored differently in snuba, eg. as the environment name instead of the the environment ID. Here we create and return forward() and reverse() translation functions that perform all the required changes. forward() is designed to work on the filter_keys and so should be called with a map of {column: [key1, key2], ...} and should return an updated map with the filter keys replaced with the ones that Snuba expects. reverse() is designed to work on result rows, so should be called with a row in the form {column: value, ...} and will return a translated result row. Because translation can potentially rely on combinations of different parts of the result row, I decided to implement them as composable functions over the row to be translated. This should make it simpler to add any other needed translations as long as you can express them as forward(filters) and reverse(row) functions. """ # Helper lambdas to compose translator functions identity = (lambda x: x) compose = (lambda f, g: lambda x: f(g(x))) replace = (lambda d, key, val: d.update({key: val}) or d) forward = identity reverse = identity map_columns = { 'environment': (Environment, 'name', lambda name: None if name == '' else name), 'tags[sentry:release]': (Release, 'version', identity), } for col, (model, field, fmt) in six.iteritems(map_columns): fwd, rev = None, None ids = filter_keys.get(col) if not ids: continue if is_grouprelease and col == "tags[sentry:release]": # GroupRelease -> Release translation is a special case because the # translation relies on both the Group and Release value in the result row. # # We create a map of {grouprelease_id: (group_id, version), ...} and the corresponding # reverse map of {(group_id, version): grouprelease_id, ...} # NB this does depend on `issue` being defined in the query result, and the correct # set of issues being resolved, which is outside the control of this function. gr_map = GroupRelease.objects.filter(id__in=ids).values_list( "id", "group_id", "release_id") ver = dict( Release.objects.filter(id__in=[x[2] for x in gr_map]).values_list( "id", "version")) fwd_map = { gr: (group, ver[release]) for (gr, group, release) in gr_map } rev_map = dict(reversed(t) for t in six.iteritems(fwd_map)) fwd = (lambda col, trans: lambda filters: replace( filters, col, [trans[k][1] for k in filters[col]]))(col, fwd_map) rev = ( lambda col, trans: lambda row: replace( # The translate map may not have every combination of issue/release # returned by the query. row, col, trans.get((row["issue"], row[col]))))(col, rev_map) else: fwd_map = { k: fmt(v) for k, v in model.objects.filter( id__in=ids).values_list("id", field) } rev_map = dict(reversed(t) for t in six.iteritems(fwd_map)) fwd = (lambda col, trans: lambda filters: replace( filters, col, [trans[k] for k in filters[col] if k]))(col, fwd_map) rev = (lambda col, trans: lambda row: replace( row, col, trans[row[col]]) if col in row else row)(col, rev_map) if fwd: forward = compose(forward, fwd) if rev: reverse = compose(reverse, rev) # Extra reverse translator for time column. reverse = compose( reverse, lambda row: replace(row, "time", int(to_timestamp(parse_datetime(row["time"])))) if "time" in row else row, ) return (forward, reverse)
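# Illustrative sketch (standalone): the translator machinery above is function
# composition over dicts — forward() rewrites filter values, reverse() rewrites result
# rows. The same pattern with a plain id -> name map:
identity = lambda x: x
compose = lambda f, g: lambda x: f(g(x))
replace = lambda d, key, val: d.update({key: val}) or d

id_to_name = {1: 'production', 2: 'staging'}
name_to_id = {v: k for k, v in id_to_name.items()}

forward = compose(identity,
                  lambda filters: replace(filters, 'environment',
                                          [id_to_name[k] for k in filters['environment']]))
reverse = compose(identity,
                  lambda row: replace(row, 'environment', name_to_id[row['environment']])
                  if 'environment' in row else row)

assert forward({'environment': [1, 2]}) == {'environment': ['production', 'staging']}
assert reverse({'environment': 'staging', 'count': 3}) == {'environment': 2, 'count': 3}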
def construct_package_query(cls, input_json): """Construct the query to retrieve detailed information of given package.""" # TODO: reduce cyclomatic complexity # see https://fabric8-analytics.github.io/dashboard/fabric8-analytics-data-model.cc.D.html # issue: https://github.com/fabric8-analytics/fabric8-analytics-data-model/issues/232 pkg_name = input_json.get('package') ecosystem = input_json.get('ecosystem') pkg_name_tokens = re.split(r'\W+', pkg_name) prp_package = "" drop_prop = "" drop_props = [] # TODO: refactor into the separate module str_package = "pkg = g.V().has('ecosystem','{ecosystem}').has('name', '{pkg_name}')." \ "tryNext().orElseGet{{g.V()." \ "has('vertex_label','Count').choose(has('{ecosystem}_pkg_count')," \ "sack(assign).by('{ecosystem}_pkg_count').sack(sum).by(constant(" \ "1)).property('{ecosystem}_pkg_count',sack())," \ "property('{ecosystem}_pkg_count',1)).iterate();" \ "graph.addVertex('ecosystem', '{ecosystem}', 'name', " \ "'{pkg_name}', 'vertex_label', 'Package'); }};" \ "pkg.property('last_updated', {last_updated});".format( ecosystem=ecosystem, pkg_name=pkg_name, last_updated=str(time.time())) cur_latest_ver, cur_libio_latest_ver = get_current_version( ecosystem, pkg_name) cur_date = (datetime.utcnow()).strftime('%Y%m%d') last_updated_flag = 'false' latest_version = cls.sanitize_text_for_query( input_json.get('latest_version')) if latest_version: # If latest version dont have cve, then it becomes the latest non cve version as well non_cve_ver = get_latest_version_non_cve(ecosystem, pkg_name, latest_version) prp_package += "pkg.property('latest_non_cve_version', '{}');".format( non_cve_ver) prp_package += "pkg.property('latest_version', '{}');".format( latest_version) if latest_version != cur_latest_ver: prp_package += "pkg.property('latest_version_last_updated', '{}');".format( cur_date) last_updated_flag = 'true' # Get Github Details if 'github_details' in input_json.get('analyses', {}): gh_details = input_json.get('analyses').get('github_details').get( 'details', {}) gh_prs_last_year_opened = str( gh_details.get('updated_pull_requests', {}).get('year', {}).get('opened', -1)) gh_prs_last_month_opened = str( gh_details.get('updated_pull_requests', {}).get('month', {}).get('opened', -1)) gh_prs_last_year_closed = str( gh_details.get('updated_pull_requests', {}).get('year', {}).get('closed', -1)) gh_prs_last_month_closed = str( gh_details.get('updated_pull_requests', {}).get('month', {}).get('closed', -1)) gh_issues_last_year_opened = str( gh_details.get('updated_issues', {}).get('year', {}).get('opened', -1)) gh_issues_last_month_opened = str( gh_details.get('updated_issues', {}).get('month', {}).get('opened', -1)) gh_issues_last_year_closed = str( gh_details.get('updated_issues', {}).get('year', {}).get('closed', -1)) gh_issues_last_month_closed = str( gh_details.get('updated_issues', {}).get('month', {}).get('closed', -1)) gh_forks = str(gh_details.get('forks_count', -1)) gh_refreshed_on = gh_details.get('updated_on') gh_stargazers = str(gh_details.get('stargazers_count', -1)) gh_open_issues_count = str(gh_details.get('open_issues_count', -1)) gh_subscribers_count = str(gh_details.get('subscribers_count', -1)) gh_contributors_count = str( gh_details.get('contributors_count', -1)) topics = gh_details.get('topics', []) # TODO: refactor into the separate module prp_package += "pkg.property('gh_prs_last_year_opened', {gh_prs_last_year_opened});" \ "pkg.property('gh_prs_last_month_opened', {gh_prs_last_month_opened});" \ "pkg.property('gh_prs_last_year_closed', 
{gh_prs_last_year_closed});" \ "pkg.property('gh_prs_last_month_closed', {gh_prs_last_month_closed});" \ "pkg.property('gh_issues_last_year_opened', " \ "{gh_issues_last_year_opened});" \ "pkg.property('gh_issues_last_month_opened', " \ "{gh_issues_last_month_opened});" \ "pkg.property('gh_issues_last_year_closed', " \ "{gh_issues_last_year_closed});" \ "pkg.property('gh_issues_last_month_closed', " \ "{gh_issues_last_month_closed});" \ "pkg.property('gh_forks', {gh_forks});" \ "pkg.property('gh_refreshed_on', '{gh_refreshed_on}');" \ "pkg.property('gh_stargazers', {gh_stargazers});" \ "pkg.property('gh_open_issues_count', {gh_open_issues_count});" \ "pkg.property('gh_subscribers_count', {gh_subscribers_count});" \ "pkg.property('gh_contributors_count', {gh_contributors_count});".format( gh_prs_last_year_opened=gh_prs_last_year_opened, gh_prs_last_month_opened=gh_prs_last_month_opened, gh_prs_last_year_closed=gh_prs_last_year_closed, gh_prs_last_month_closed=gh_prs_last_month_closed, gh_issues_last_year_opened=gh_issues_last_year_opened, gh_issues_last_month_opened=gh_issues_last_month_opened, gh_issues_last_year_closed=gh_issues_last_year_closed, gh_issues_last_month_closed=gh_issues_last_month_closed, gh_forks=gh_forks, gh_stargazers=gh_stargazers, gh_refreshed_on=gh_refreshed_on, gh_open_issues_count=gh_open_issues_count, gh_subscribers_count=gh_subscribers_count, gh_contributors_count=gh_contributors_count) # Add github topics if topics: drop_props.append('topics') str_package += " ".join([ "pkg.property('topics', '{}');".format(t) for t in topics if t ]) # Add tokens for a package if pkg_name_tokens: drop_props.append('tokens') str_package += " ".join([ "pkg.property('tokens', '{}');".format(t) for t in pkg_name_tokens if t ]) # Get Libraries.io data if 'libraries_io' in input_json.get('analyses', {}): v2 = input_json['analyses']['libraries_io'].get('schema', {}).get('version', '0-0-0') \ >= '2-0-0' details = input_json['analyses']['libraries_io'].get('details', {}) libio_dependents_projects = details.get('dependents', {}).get('count', -1) libio_dependents_repos = details.get('dependent_repositories', {}).get('count', -1) releases = details.get('releases', {}) libio_total_releases = int(releases.get('count', -1)) libio_latest_version = libio_latest_published_at = '' if libio_total_releases > 0: if v2: libio_latest = releases.get('recent', [{}])[-1] # last is latest libio_latest_published_at = libio_latest.get( 'published_at', '') libio_latest_version = libio_latest.get('number', '') else: libio_latest_published_at = releases.get('latest', {}).get( 'published_at', '') libio_latest_version = releases.get('latest', {}).get('version', '') if libio_latest_version != cur_libio_latest_ver and last_updated_flag != 'true': prp_package += "pkg.property('latest_version_last_updated', '{}');" \ .format(cur_date) if libio_latest_published_at: t = libio_latest_published_at p = parse_datetime(t).timetuple() if t else '' published_at = str(time.mktime(p)) if p else '' prp_package += "pkg.property('libio_latest_release', '{}');".format( published_at) if details.get('dependent_repositories', {}).get('top'): drop_props.append('libio_usedby') for key, val in details.get('dependent_repositories', {}).get('top', {}).items(): prp_package += "pkg.property('libio_usedby', '{key}:{val}');".format( key=key, val=val) prp_package += "pkg.property('libio_dependents_projects', " \ "'{libio_dependents_projects}');" \ "pkg.property('libio_dependents_repos', '{libio_dependents_repos}');" \ 
"pkg.property('libio_total_releases', '{libio_total_releases}');" \ "pkg.property('libio_latest_version', '{libio_latest_version}');".format( libio_dependents_projects=libio_dependents_projects, libio_dependents_repos=libio_dependents_repos, libio_total_releases=libio_total_releases, libio_latest_version=libio_latest_version) # Update EPV Github Release Date based on libraries_io data if v2: # 'recent' is list of {'number':n, 'published_at':p} including the latest for release in releases.get('recent', []): rel_published = release.get('published_at', '') parsed_dt = parse_datetime( rel_published).timetuple() if rel_published else '' timestamp = time.mktime(parsed_dt) if parsed_dt else '' prp_package += "g.V().has('pecosystem','{ecosystem}').has('pname'," \ "'{pkg_name}').has('version','{version}')." \ "property('gh_release_date',{gh_rel});".format( ecosystem=ecosystem, pkg_name=pkg_name, version=release.get('number', ''), gh_rel=str(timestamp)) else: if libio_latest_published_at: gh_release = time.mktime( parse_datetime(libio_latest_published_at).timetuple()) prp_package += "g.V().has('pecosystem','{ecosystem}').has('pname'," \ "'{pkg_name}')." \ "has('version','{libio_latest_version}')." \ "property('gh_release_date', {gh_rel});".format( pkg_name=pkg_name, ecosystem=ecosystem, libio_latest_version=libio_latest_version, gh_rel=str(gh_release)) for version, release in releases.get('latest', {}).get('recent', {}).items(): prp_package += "g.V().has('pecosystem','{ecosystem}').has('pname'," \ "'{pkg_name}').has('version','{version}')." \ "property('gh_release_date',{gh_rel});".format( ecosystem=ecosystem, pkg_name=pkg_name, version=version, gh_rel=str(time.mktime(parse_datetime(release).timetuple()))) # Refresh the properties whereever applicable if len(drop_props) > 0: drop_prop += "g.V().has('ecosystem','{ecosystem}').has('name'," \ "'{pkg_name}').properties('{p}').drop().iterate();".format( ecosystem=ecosystem, pkg_name=pkg_name, p="','".join(drop_props)) return drop_prop + str_package, prp_package
def from_iso8601(value): return parse_datetime(value)
def execute(self, ecosystem, bucket_name, object_key, from_date=None, to_date=None):
    """Aggregate gathered topics and store them on S3.

    :param ecosystem: ecosystem name for which topics should be gathered
    :param bucket_name: name of the destination bucket to which topics should be stored
    :param object_key: name of the object under which aggregated topics should be stored
    :param from_date: date limitation for task result queries
    :param to_date: date limitation for task result queries
    """
    if from_date is not None:
        from_date = parse_datetime(from_date)
    if to_date is not None:
        to_date = parse_datetime(to_date)

    s3 = StoragePool.get_connected_storage('S3Data')
    # TODO: this will need to be changed once we introduce package level flows
    postgres = StoragePool.get_connected_storage('BayesianPostgres')

    base_query = postgres.session.query(WorkerResult).\
        join(Analysis).\
        filter(WorkerResult.error.is_(False)).\
        filter(WorkerResult.worker == 'github_details')

    if from_date is not None:
        base_query = base_query.filter(Analysis.started_at > from_date).\
            order_by(desc(WorkerResult.id))

    if to_date is not None:
        base_query = base_query.filter(Analysis.started_at < to_date).\
            order_by(desc(WorkerResult.id))

    start = 0
    topics = []
    while True:
        results = base_query.slice(start, start + 10).all()
        if not results:
            break

        self.log.info("Collecting topics, slice offset is %s", start)
        start += 10

        for entry in results:
            name = entry.package.name
            version = entry.package.version
            task_result = entry.task_result
            if not postgres.is_real_task_result(task_result):
                task_result = s3.retrieve_task_result(ecosystem, name, version,
                                                      'github_details')

            topics.append({
                'topics': task_result.get('details', {}).get('topics'),
                'name': name,
                'ecosystem': ecosystem,
                'version': version
            })

    report = {
        'ecosystem': ecosystem,
        'bucket_name': bucket_name,
        'object_key': object_key,
        'from_date': str(from_date),
        'to_date': str(to_date),
        'result': topics
    }
    self._store_topics(bucket_name, object_key, report)
def __get_tag_key_and_top_values( self, project_id, group_id, environment_id, key, limit=3, raise_on_empty=True, **kwargs ): tag = u"tags[{}]".format(key) filters = {"project_id": get_project_list(project_id)} if environment_id: filters["environment"] = [environment_id] if group_id is not None: filters["group_id"] = [group_id] conditions = kwargs.get("conditions", []) aggregations = kwargs.get("aggregations", []) conditions.append([tag, "!=", ""]) aggregations += [ ["uniq", tag, "values_seen"], ["count()", "", "count"], ["min", SEEN_COLUMN, "first_seen"], ["max", SEEN_COLUMN, "last_seen"], ] result, totals = snuba.query( start=kwargs.get("start"), end=kwargs.get("end"), groupby=[tag], conditions=conditions, filter_keys=filters, aggregations=aggregations, orderby="-count", limit=limit, totals=True, referrer="tagstore.__get_tag_key_and_top_values", ) if raise_on_empty and (not result or totals.get("count", 0) == 0): raise TagKeyNotFound if group_id is None else GroupTagKeyNotFound else: if group_id is None: key_ctor = TagKey value_ctor = TagValue else: key_ctor = functools.partial(GroupTagKey, group_id=group_id) value_ctor = functools.partial(GroupTagValue, group_id=group_id) top_values = [ value_ctor( key=key, value=value, times_seen=data["count"], first_seen=parse_datetime(data["first_seen"]), last_seen=parse_datetime(data["last_seen"]), ) for value, data in six.iteritems(result) ] return key_ctor( key=key, values_seen=totals.get("values_seen", 0), count=totals.get("count", 0), top_values=top_values, )
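# Illustrative sketch (hypothetical stand-in types): __get_tag_key_and_top_values()
# picks its constructors up front — plain TagKey/TagValue for project-wide queries, or
# functools.partial(...) with group_id pre-bound for per-group queries — so one code
# path builds all result objects. The pre-binding trick in isolation:
import functools
from collections import namedtuple

TagValue = namedtuple('TagValue', ['key', 'value'])
GroupTagValue = namedtuple('GroupTagValue', ['key', 'value', 'group_id'])

group_id = 1234
value_ctor = TagValue if group_id is None else functools.partial(GroupTagValue,
                                                                 group_id=group_id)

print(value_ctor(key='browser', value='Firefox'))
# -> GroupTagValue(key='browser', value='Firefox', group_id=1234)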
def get_group_tag_keys_and_top_values( self, project_id, group_id, environment_id, user=None, keys=None, value_limit=TOP_VALUES_DEFAULT_LIMIT): # Similar to __get_tag_key_and_top_values except we get the top values # for all the keys provided. value_limit in this case means the number # of top values for each key, so the total rows returned should be # num_keys * limit. start, end = self.get_time_range() # First get totals and unique counts by key. keys_with_counts = self.get_group_tag_keys(project_id, group_id, environment_id, keys=keys) # Then get the top values with first_seen/last_seen/count for each filters = { 'project_id': [project_id], } if environment_id: filters['environment'] = [environment_id] if keys is not None: filters['tags_key'] = keys if group_id is not None: filters['issue'] = [group_id] aggregations = [ ['count()', '', 'count'], ['min', SEEN_COLUMN, 'first_seen'], ['max', SEEN_COLUMN, 'last_seen'], ] values_by_key = snuba.query( start, end, ['tags_key', 'tags_value'], None, filters, aggregations, orderby='-count', limitby=[value_limit, 'tags_key'], referrer='tagstore.__get_tag_keys_and_top_values') # Then supplement the key objects with the top values for each. if group_id is None: value_ctor = TagValue else: value_ctor = functools.partial(GroupTagValue, group_id=group_id) for keyobj in keys_with_counts: key = keyobj.key values = values_by_key.get(key, []) keyobj.top_values = [ value_ctor( key=keyobj.key, value=value, times_seen=data['count'], first_seen=parse_datetime(data['first_seen']), last_seen=parse_datetime(data['last_seen']), ) for value, data in six.iteritems(values) ] return keys_with_counts
def decode(self, string): return parse_datetime(string)
def gitstats_per_user(path, recursive=False, since=None, until=None, authors_emails={}, use_paths=None, filterby_emails=None): """ :param str path: Path to analyse. If the recursive is false, it should be the path o a repository. If the recursive is true, it should be the parent path of several repositories :param bool recursive: Indicate if it should read the stats from the path directory or the subdirectories in the path. :return: A dictionary with the format { 'author': [{'date':..., 'files updated':..., 'insertions':..., 'deletions':...}], ... } """ if use_paths is None: directories = [path] if not recursive else find_gitrepos(path) else: directories = use_paths authors = {} emails = {} for directory in directories: print(directory) command = ["git", "log", "--shortstat", "--all"] if since: command += ['--since', since.strftime('%Y.%m.%d')] if until: command += ['--until', until.strftime('%Y.%m.%d')] p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=directory) output, _ = p.communicate() output = output.decode() for commit in output.split('commit ')[1:]: lines = commit.split('\n') deletions = 0 insertions = 0 files_changed = 0 for row in lines: if row.startswith('Author:'): author = row[8:] author = parse("{} <{}>", author) author, email = author.fixed emails[author] = email if email in authors_emails: author = authors_emails[email] elif row.startswith('Date:'): date = parse_datetime(row[8:]) elif ' changed' in row: data = row.strip().split(',') for v in data: v = v.strip() res = parse("{:d} deletions(-)", v) if res: deletions = res.fixed[0] else: res = parse("{:d} deletion(-)", v) if res: deletions = res.fixed[0] res = parse("{:d} files changed", v) if res: files_changed = res.fixed[0] else: res = parse("{:d} file changed", v) if res: files_changed = res.fixed[0] res = parse("{:d} insertions(+)", v) if res: insertions = res.fixed[0] else: res = parse("{:d} insertion(+)", v) if res: insertions = res.fixed[0] if filterby_emails and email not in filterby_emails: continue if author not in authors: authors[author] = [] authors[author].append({ 'date': date, 'files changed': files_changed, 'deletions': deletions, 'insertions': insertions }) for author, commits in authors.items(): authors[author] = sorted(commits, key=lambda x: x['date']) return authors, directories, emails
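# Illustrative sketch: the --shortstat handling above leans on the `parse` package
# (parse.parse(format, text).fixed) to read the summary line of each commit. The same
# extraction on a single line, covering both singular and plural forms:
from parse import parse


def parse_shortstat(line):
    stats = {'files changed': 0, 'insertions': 0, 'deletions': 0}
    for chunk in line.strip().split(','):
        chunk = chunk.strip()
        for fmt, key in (("{:d} files changed", 'files changed'),
                         ("{:d} file changed", 'files changed'),
                         ("{:d} insertions(+)", 'insertions'),
                         ("{:d} insertion(+)", 'insertions'),
                         ("{:d} deletions(-)", 'deletions'),
                         ("{:d} deletion(-)", 'deletions')):
            res = parse(fmt, chunk)
            if res:
                stats[key] = res.fixed[0]
    return stats


# parse_shortstat(" 3 files changed, 10 insertions(+), 2 deletions(-)")
# -> {'files changed': 3, 'insertions': 10, 'deletions': 2}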
def run(self, input_data, modify_light_metadata): """ # Runs the stats generation phase # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column # A lot of information about the data distribution and quality will also be logged to the server in this phase """ header = input_data.columns non_null_data = {} all_sampled_data = {} for column in header: non_null_data[column] = [] all_sampled_data[column] = [] empty_count = {} column_count = {} # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error population_size = len(input_data.data_array) if population_size < 50: sample_size = population_size else: sample_size = int( calculate_sample_size( population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL)) if sample_size > 3000 and sample_size > population_size / 8: sample_size = min(round(population_size / 8), 3000) # get the indexes of randomly selected rows given the population size input_data_sample_indexes = random.sample(range(population_size), sample_size) self.log.info( 'population_size={population_size}, sample_size={sample_size} {percent:.2f}%' .format(population_size=population_size, sample_size=sample_size, percent=(sample_size / population_size) * 100)) for sample_i in input_data_sample_indexes: row = input_data.data_array[sample_i] for i, val in enumerate(row): column = header[i] value = cast_string_to_python_type(val) if not column in empty_count: empty_count[column] = 0 column_count[column] = 0 if value == None: empty_count[column] += 1 else: non_null_data[column].append(value) all_sampled_data[column].append(value) column_count[column] += 1 stats = {} col_data_dict = {} for i, col_name in enumerate(non_null_data): col_data = non_null_data[col_name] # all rows in just one column full_col_data = all_sampled_data[col_name] data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type( col_data, i, input_data.data_array, col_name) if column_status == 'Column empty': if modify_light_metadata: self.transaction.lmd['malformed_columns']['names'].append( col_name) self.transaction.lmd['malformed_columns'][ 'indices'].append(i) continue if data_type == DATA_TYPES.DATE: for i, element in enumerate(col_data): if str(element) in [ str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null' ]: col_data[i] = None else: try: col_data[i] = int( parse_datetime(element).timestamp()) except: self.log.warning( 'Could not convert string to date and it was expected, current value {value}' .format(value=element)) col_data[i] = None if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE: newData = [] for value in col_data: if value != '' and value != '\r' and value != '\n': newData.append(value) col_data = [ clean_float(i) for i in newData if str(i) not in [ '', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null' ] ] y, x = np.histogram(col_data, 50, density=False) x = (x + np.roll(x, -1))[:-1] / 2.0 x = x.tolist() y = y.tolist() xp = [] if len(col_data) > 0: max_value = max(col_data) min_value = min(col_data) mean = np.mean(col_data) median = np.median(col_data) var = np.var(col_data) skew = st.skew(col_data) kurtosis = st.kurtosis(col_data) inc_rate = 0.1 initial_step_size = abs(max_value - min_value) / 100 xp += [min_value] i = min_value + initial_step_size while i < max_value: xp += [i] i_inc = abs(i - 
min_value) * inc_rate i = i + i_inc else: max_value = 0 min_value = 0 mean = 0 median = 0 var = 0 skew = 0 kurtosis = 0 xp = [] is_float = True if max( [1 if int(i) != i else 0 for i in col_data]) == 1 else False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "mean": mean, "median": median, "variance": var, "skewness": skew, "kurtosis": kurtosis, "max": max_value, "min": min_value, "is_float": is_float, "histogram": { "x": x, "y": y }, "percentage_buckets": xp } elif data_type == DATA_TYPES.CATEGORICAL: all_values = [] for row in input_data.data_array: all_values.append(row[i]) histogram = Counter(all_values) all_possible_values = histogram.keys() col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "histogram": { "x": list(histogram.keys()), "y": list(histogram.values()) } #"percentage_buckets": list(histogram.keys()) } # @TODO This is probably wrong, look into it a bit later else: # see if its a sentence or a word is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False dictionary, histogram = self._get_words_dictionary( col_data, is_full_text) # if no words, then no dictionary if len(col_data) == 0: dictionary_available = False dictionary_lenght_percentage = 0 dictionary = [] else: dictionary_available = True dictionary_lenght_percentage = len(dictionary) / len( col_data) * 100 # if the number of uniques is too large then treat is a text if dictionary_lenght_percentage > 10 and len( col_data) > 50 and is_full_text == False: dictionary = [] dictionary_available = False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "dictionary": dictionary, "dictionaryAvailable": dictionary_available, "dictionaryLenghtPercentage": dictionary_lenght_percentage, "histogram": histogram } stats[col_name] = col_stats stats[col_name]['data_type_dist'] = data_type_dist stats[col_name]['data_subtype_dist'] = data_subtype_dist stats[col_name]['column'] = col_name stats[col_name]['empty_cells'] = empty_count[col_name] stats[col_name]['empty_percentage'] = empty_count[ col_name] * 100 / column_count[col_name] if 'separator' in additional_info: stats[col_name]['separator'] = additional_info['separator'] col_data_dict[col_name] = col_data for i, col_name in enumerate(all_sampled_data): if col_name in self.transaction.lmd['malformed_columns']['names']: continue stats[col_name].update( self._compute_duplicates_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_empty_cells_score(stats, all_sampled_data, col_name)) #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_data_type_dist_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_z_score(stats, col_data_dict, col_name)) stats[col_name].update( self._compute_lof_score(stats, col_data_dict, col_name)) stats[col_name].update( self._compute_similariy_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_value_distribution_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_consistency_score(stats, col_name)) stats[col_name].update( self._compute_redundancy_score(stats, col_name)) stats[col_name].update( self._compute_variability_score(stats, col_name)) stats[col_name].update( self._compute_data_quality_score(stats, col_name)) total_rows = len(input_data.data_array) if modify_light_metadata: self.transaction.lmd['column_stats'] = stats self.transaction.lmd['data_preparation'][ 
'total_row_count'] = total_rows self.transaction.lmd['data_preparation']['test_row_count'] = len( input_data.test_indexes) self.transaction.lmd['data_preparation']['train_row_count'] = len( input_data.train_indexes) self.transaction.lmd['data_preparation'][ 'validation_row_count'] = len(input_data.validation_indexes) self._log_interesting_stats(stats) return stats
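# Illustrative sketch: for DATE columns the stats phase above coerces each sampled cell
# to epoch seconds, mapping empty/placeholder markers and unparseable strings to None.
# The same cleaning step in isolation, assuming a dateutil-style parse_datetime:
from dateutil.parser import parse as parse_datetime

NULL_MARKERS = {'', 'None', 'False', 'nan', 'NaN', 'NA', 'null'}


def date_cell_to_epoch(value):
    if str(value) in NULL_MARKERS:
        return None
    try:
        return int(parse_datetime(str(value)).timestamp())
    except (ValueError, OverflowError):
        return None


# [date_cell_to_epoch(v) for v in ['2019-05-01', '', 'not a date']]
# -> [<epoch seconds for 2019-05-01>, None, None]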
def _handle_tweet(url): """http*://twitter.com/*/statuses/*""" tweet_url = "https://api.twitter.com/1.1/statuses/show.json?id=%s&include_entities=false&tweet_mode=extended" test = re.match(r"https?://.*?twitter\.com\/(\w+)/status(es)?/(\d+)", url) if not test: return # matches for unique tweet id string infourl = tweet_url % test.group(3) bearer_token = config.get("twitter_bearer") if not bearer_token: log.info( "Use util/twitter_application_auth.py to request a bearer token for tweet handling" ) return _parse_tweet_from_src(url) headers = {'Authorization': 'Bearer ' + bearer_token} data = bot.get_url(infourl, headers=headers) if not data: log.warning("Empty response from Twitter api") return tweet = data.json() if 'errors' in tweet: for error in tweet['errors']: log.warning("Error reading tweet (code %s) %s" % (error['code'], error['message'])) return text = tweet['full_text'].strip() user = tweet['user']['screen_name'] name = tweet['user']['name'].strip() verified = tweet['user']['verified'] retweets = tweet['retweet_count'] favorites = tweet['favorite_count'] created_date = parse_datetime(tweet['created_at']) def twit_timestr(dt): """A coarse timestr function""" months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ] diff = datetime.now(tzutc()) - dt if diff.days > 30 * 6: return "%i %s %i" % (dt.day, months[dt.month - 1], dt.year) elif diff.days > 30: return "%i %s" % (dt.day, months[dt.month - 1]) elif diff.days: return "%id" % diff.days elif diff.seconds > 3600: return "%ih" % (diff.seconds / 3600) elif diff.seconds > 60: return "%im" % (diff.seconds / 60) else: return "now" user = "******".format(user) if verified: user = "******".format(user) tweet = "{0} ({1}) {2}: {3} [♻ {4} ♥ {5}]".format( name, user, twit_timestr(created_date), text, retweets, favorites) return tweet
def share_daily_price(request, start_date, end_date=None): api_confirm_data = __validate_api_key(request) if api_confirm_data: return api_confirm_data # check for setting to do scraping ! if settings.MY_SETTINGS.get("use_scraper"): errors: list = [] # try to scrap data, validate date ! try: from_date: str = str(parse_datetime(start_date).date()) if end_date: to_date: str = str(parse_datetime(end_date).date()) # api call for grabbing data for range of dates ! share_data = nepse_web_scraper.get_nepse_data( start_date=from_date, end_date=to_date) else: # api call for grabbing data for single date ! share_data = nepse_web_scraper.get_nepse_data_for_date( date=from_date) # return json ! return JsonResponse(share_data, encoder="utf-8") except Exception as e: # handle error ! print(e) errors.append("Invalid start or end date") return JsonResponse({"errors": errors}) else: # grab data from db and render ! from_date: str = str(parse_datetime(start_date).date()) company_transaction_cache = share_manager_models.ShareCompanyDetail.objects.all( ).select_related("company_name") errors: list = [] try: if end_date: try: to_date: str = str(parse_datetime(end_date).date()) company_transaction = company_transaction_cache.filter( Q(company_transaction_date__gte=from_date) & Q(company_transaction_date__lte=to_date)) except Exception as e: print(e) errors.append("invalid date") return JsonResponse({"errors": errors}) else: company_transaction = company_transaction_cache.filter( company_transaction_date=from_date) company_transaction_serializer = share_manager_serializers.ShareCompanyDetailSerializer( company_transaction, many=True, ) return JsonResponse({"data": company_transaction_serializer.data}) except Exception as e: print(e) errors.append( "value has an invalid date format. It must be in YYYY-MM-DD format." ) return JsonResponse({"errors": errors})
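# Illustrative sketch: the view above accepts loosely formatted start/end dates and
# normalises them to 'YYYY-MM-DD' strings with parse_datetime(...).date() before
# querying or scraping. The same validation step on its own, assuming parse_datetime is
# dateutil.parser.parse:
from dateutil.parser import parse as parse_datetime


def normalise_date(raw):
    """Return the date part as 'YYYY-MM-DD', or None if the input cannot be parsed."""
    try:
        return str(parse_datetime(raw).date())
    except (ValueError, TypeError):
        return None


# normalise_date('2020/01/15') -> '2020-01-15'; normalise_date('not-a-date') -> None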