import re

import click
import requests
from dateutil.parser import parse as dateutilparse
from elasticsearch import Elasticsearch
from github import Github

# settings, FQDN_REGEX, get_last_updated_timestamp, get_valid_domains,
# get_parsed_url and get_extracted_fields are project-level helpers.


def fetch_issues(state, since):
    """Fetch webcompat issues from GitHub."""
    GITHUB_OWNER = settings.GITHUB_OWNER
    GITHUB_REPO = settings.GITHUB_REPO
    g = Github(settings.GITHUB_API_TOKEN)
    org = g.get_organization(GITHUB_OWNER)
    repo = org.get_repo(GITHUB_REPO)
    kwargs = {"state": state}

    # Only fetch issues updated since the last run (or the explicit `since`).
    last_updated_timestamp = get_last_updated_timestamp()
    if since or last_updated_timestamp:
        kwargs["since"] = dateutilparse(since or last_updated_timestamp)
    issues = repo.get_issues(**kwargs)

    es = Elasticsearch([settings.ES_URL], **settings.ES_KWARGS)
    # ignore=400 makes index creation a no-op if the index already exists.
    es.indices.create(index=settings.ES_WEBCOMPAT_INDEX, ignore=400)
    for i in issues:
        try:
            click.echo("Fetching issue: {}".format(i.id))
            # Prepare the ES document from the raw GitHub payload.
            body = i.raw_data
            headers = {
                "Authorization": "token {}".format(settings.GITHUB_API_TOKEN)
            }
            response = requests.get(body["events_url"], headers=headers)
            response.raise_for_status()
            events_raw = response.json()
            # Scan the issue title and body for domain names.
            domains = set()
            domains.update(re.findall(FQDN_REGEX, i.title))
            domains.update(re.findall(FQDN_REGEX, i.body))
            body.update({"events": events_raw})
            body.update({"domains": list(domains)})
            body.update({"valid_domains": get_valid_domains(list(domains))})
            body.update({"parsed_url": get_parsed_url(i.body)})
            body.update(get_extracted_fields(i.body))
            es.index(
                index=settings.ES_WEBCOMPAT_INDEX,
                doc_type="webcompat_issue",
                id=i.number,
                body=body,
            )
        except Exception as e:
            click.echo(str(e), err=True)
            continue
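# A minimal invocation sketch for fetch_issues, assuming it is exposed as a
# click command; the command name, option names and defaults here are
# hypothetical, not taken from the original CLI.
import click


@click.command()
@click.option("--state", default="open",
              type=click.Choice(["open", "closed", "all"]))
@click.option("--since", default=None,
              help="Only fetch issues updated after this ISO-8601 timestamp.")
def cli(state, since):
    fetch_issues(state, since)


if __name__ == "__main__":
    cli()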
import datetime

import whois
from dateutil.parser import parse as dateutilparse


def checkDomainAge(addr):
    try:
        registration = whois.whois(addr)
        created = registration.creation_date
        if created is not None:
            # python-whois may return a list of dates or a plain string.
            if isinstance(created, list):
                created = created[0]
            if isinstance(created, str):
                # Some registries report dates like "before Aug-1996".
                if created.startswith("before "):
                    created = created[7:]
                created = dateutilparse(created)
            if isinstance(created, datetime.date):
                age = (datetime.datetime.now() - created).days
                if age > 180:
                    return "older"
    except whois.parser.PywhoisError:
        pass
    # Fall through on lookup failure or a domain younger than ~6 months.
    return "< 6 month"
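# Usage sketch for checkDomainAge, assuming the python-whois package is
# installed; the domains below are purely illustrative.
for domain in ("mozilla.org", "example.com"):
    print(domain, "->", checkDomainAge(domain))  # "older" or "< 6 month"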
import datetime
import re
import time

from dateutil.parser import parse as dateutilparse
from nltk.stem import PorterStemmer

# clean_str is a project-level helper that normalises raw text.


def cast_val(value, directive):
    if directive == "Integer":
        # Treat boolean-like strings as 0/1.
        if value.lower() == "false":
            return 0
        elif value.lower() == "true":
            return 1
        else:
            try:
                return int(value)
            except ValueError:
                return None
    elif directive == "Float":
        try:
            # Strip everything but digits, dot and minus sign first.
            return float(re.sub(r"[^0-9.\-]", "", value))
        except ValueError:
            # Fall back to a direct conversion (may raise).
            return float(value)
    elif directive == "Time":
        if len(value) == 10 and sum(c.isdigit() for c in value) == 10:
            # 10-digit string: epoch seconds.
            return int(
                time.mktime(
                    datetime.datetime.fromtimestamp(int(value)).timetuple()))
        elif len(value) == 13 and sum(c.isdigit() for c in value) == 13:
            # 13-digit string: epoch milliseconds.
            return int(
                time.mktime(
                    datetime.datetime.fromtimestamp(
                        int(value) / 1000).timetuple()))
        else:
            # Anything else: let dateutil parse it fuzzily.
            return int(
                time.mktime(dateutilparse(value, fuzzy=True).timetuple()))
    elif directive == "Text" or directive == "Phrase":
        return [
            PorterStemmer().stem(word)
            for word in clean_str(value).split(" ")
        ]
    elif directive == "Categorical":
        return value
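# Illustrative calls to cast_val; the Text/Phrase branch is omitted here
# because it depends on the project-level clean_str helper.
assert cast_val("true", "Integer") == 1
assert cast_val("42", "Integer") == 42
assert cast_val("3.5kg", "Float") == 3.5
print(cast_val("1577836800", "Time"))         # 10-digit epoch seconds
print(cast_val("Jan 1, 2020 12:00", "Time"))  # free-form date string
print(cast_val("red", "Categorical"))         # returned unchanged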
import os

from dateutil.parser import parse as dateutilparse

# e, f, cur_dir, end_sigil, known_md and blogs are defined by the enclosing
# loop (e and f look like an os.walk tuple and a filename).

metadata = {}
metadata["timestamp"] = 0
with open(os.path.join(e[0], f), 'r') as fd:
    metadata['relpath'] = os.path.relpath(os.path.join(e[0], f), cur_dir)
    # Walk the file bottom-up until the metadata end marker.
    for line in reversed(fd.readlines()):
        if line.count(end_sigil) > 0:
            break
        if line.count(':') == 0:
            continue
        key, content = line.split(':', 1)
        if key == 'Tags':
            metadata[key] = [t.strip() for t in content.split(',')]
        elif key in known_md:
            metadata[key] = content.strip()
if "Date" in metadata:
    ts = dateutilparse(metadata["Date"]).timestamp()
    print("Parsed Date: {}".format(ts))
    metadata["timestamp"] = ts
blogs.append(metadata)


def tags_to_s(tags):
    if not tags:
        tags = ['untagged']
    return "".join(["'", "', '".join(tags), "'"])


def bloglink(b):
    return "[{}]({})".format(b["Title"], b["relpath"])
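# Quick illustration of the two helpers with a hand-built metadata dict
# (real entries come from the parsing loop above).
b = {"Title": "Hello", "relpath": "posts/hello.md", "Tags": ["python", "blog"]}
print(tags_to_s(b["Tags"]))  # 'python', 'blog'
print(tags_to_s(None))       # 'untagged'
print(bloglink(b))           # [Hello](posts/hello.md)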
import datetime
from collections import defaultdict

from dateutil.parser import parse as dateutilparse

# _LUA_PAGE and _LUA_SINGLE (string.Template instances) and _get_entry are
# defined elsewhere in the module.


def _parse_entries(hardict):
    """
    Parse all entries, grouping those that have a page reference into
    page blocks. This will also handle requests occurring outside the
    scope of a page, retaining the original call timing.

    Args:
        hardict (dict): HAR dictionary

    Returns:
        luastr (str): Lua code snippet with page blocks
    """
    # Make sure pages and events are sorted in start order, since this
    # is not guaranteed by the HAR standard. This also handles events
    # that occur outside of the context of a page.
    pages = hardict["log"].get("pages", [])
    data = sorted(pages + hardict["log"]["entries"],
                  key=lambda dat: dateutilparse(dat["startedDateTime"]))

    # We want to save the time deltas until the *next* entry in the list.
    if len(data) > 1:
        dtimes = [dateutilparse(dat["startedDateTime"]) for dat in data]
        dtimes = [dtimes[i + 1] - dtimes[i] for i in range(len(dtimes) - 1)]
    else:
        dtimes = []
    # The last entry has nothing following it, so give it a zero delta.
    dtimes.append(datetime.timedelta(0))

    # We make a page->event mapping for quick lookup.
    entries = defaultdict(list)
    for entry in sorted(hardict["log"]["entries"],
                        key=lambda ev: dateutilparse(ev["startedDateTime"])):
        entries[entry.get("pageref", None)].append(entry)

    lua = []
    for idat, dat in enumerate(data):
        comment = dat.get("comment", "")
        if "id" in dat:
            # A page.
            title = dat["title"]
            pageref = dat["id"]
            comment = "%s (HAR pageref '%s')%s" % (
                title, pageref,
                " (Comment: %s)" % comment if comment else "")
            body = []
            entry_time = 0
            for entrydict in entries[pageref]:
                body.append(_get_entry(entrydict, batch=True))
                entry_time += entrydict["time"] if entrydict["time"] > 0 else 0
            if body:
                lua.append(
                    _LUA_PAGE.safe_substitute(comment=comment,
                                              pageref=pageref,
                                              body=",\n\n".join(body)))
            dtime = dtimes[idat]
            if dtime.microseconds > 0:
                # We should sleep before triggering the next page in
                # order to best emulate the user case we recorded. But
                # since the batch requests block, we must remove the
                # time that has already passed from page load.
                onload = dat["pageTimings"].get("onLoad", -1)
                onload = onload if onload and onload >= 0 else 0
                comment = dat["pageTimings"].get("comment", "")
                # Note: setting 10 ms as the minimum sleep and assuming
                # entry time and onload time are independent of each
                # other.
                sleeptime = max(10, dtime.microseconds - entry_time - onload)
                lua.extend([
                    "-- pause until next page%s." %
                    ((" (Comment: %s)" % comment) if comment else ""),
                    "client.sleep(%s, 1000)" % sleeptime
                ])
        elif "pageref" not in dat:
            # An entry outside the scope of a page.
            comment = "Request outside page%s" % (
                " (Comment: %s)" % comment if comment else "")
            lua.append(
                _LUA_SINGLE.safe_substitute(comment=comment,
                                            body=_get_entry(dat)))
    return "\n".join(lua)
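# A minimal HAR-shaped dict to exercise _parse_entries; only the fields the
# function actually reads are present, and _get_entry/_LUA_PAGE must be
# defined elsewhere in the module for this to run.
har = {
    "log": {
        "pages": [{
            "id": "page_1",
            "title": "Example page",
            "startedDateTime": "2020-01-01T00:00:00.000Z",
            "pageTimings": {"onLoad": 120},
        }],
        "entries": [{
            "pageref": "page_1",
            "startedDateTime": "2020-01-01T00:00:00.100Z",
            "time": 80,
        }],
    }
}
print(_parse_entries(har))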
"-o", "--older", help= "Abort those PV's whose workflow started more that this many days ago. To abort all PV's, specify 0.", default=7, type=int) args = parser.parse_args() if not args.url.endswith('bpl'): print( "The URL needs to point to the mgmt bpl; for example, http://arch.slac.stanford.edu/mgmt/bpl. ", args.url) sys.exit(1) neverConnectedPVs = requests.get(args.url + '/getNeverConnectedPVs').json() for neverConnectedPV in neverConnectedPVs: abort = False if args.older == 0: abort = True elif "startOfWorkflow" in neverConnectedPV: startOfWorkflow = dateutilparse( neverConnectedPV["startOfWorkflow"]) if (datetime.datetime.now(tzlocal()) - startOfWorkflow).total_seconds() >= (args.older * 86400): abort = True if abort: print("Aborting PV %s " % neverConnectedPV['pvName']) aresp = requests.get(args.url + '/abortArchivingPV', params={"pv": neverConnectedPV['pvName']}) aresp.raise_for_status() time.sleep(0.25)