def generic_results_processor(ptype, frame, rows):
    path_processor = None # guard against a NameError if no key below matches
    if 'path-map-file' in frame:
        mapping = parse_map_file(frame)
        path_processor = partial(process_mapped_path, mapping)
    elif 'prefix' in frame:
        path_processor = partial(process_prefixed_path, frame['prefix'])
    elif 'path-map' in frame:
        path_processor = partial(process_mapped_path, frame['path-map'])
    ensure(path_processor, "generic results processing requires a 'prefix', 'path-map' or 'path-map-file' key.")

    def _process(row):
        try:
            path, datestr, count = row
            identifier = path_processor(path)
            if identifier is None:
                return None # raise ValueError?
            return {
                'views': int(count),
                'date': _str2dt(datestr),
                'identifier': identifier,
            }
        except ValueError as err:
            LOG.info("skipping row, bad value: %s" % str(err))
        except Exception as err:
            # deliberately broad: one bad row shouldn't kill the whole batch
            LOG.exception("unhandled exception processing row: %s", str(err), extra={"row": row})

    return list(filter(None, map(_process, rows)))
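# a minimal usage sketch (hypothetical frame and rows, not real GA data):
#   frame = {'prefix': '/events'}
#   rows = [('/events/foobar', '20170101', '12'),
#           ('/events/foobar', 'not-a-date', 'pants')] # logged and skipped
#   generic_results_processor('event', frame, rows)
#   # => [{'views': 12, 'date': ..., 'identifier': 'foobar'}]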
def process_prefixed_path(prefix, path):
    path = normalise_path(path)
    ensure(path.startswith(prefix), "path does not start with given prefix (%r): %s" % (prefix, path), ValueError)
    # we could just dispense with the prefix and discard the first segment ...
    prefix_len = len(prefix)
    path = path[prefix_len:].strip().strip('/') # /events/foobar => foobar
    identifier = path.split('/', 1)[0] # foobar/the-baz-in-bar-fooed-at-the-star => foobar
    return identifier
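# for example, assuming normalise_path leaves these (hypothetical) paths untouched:
#   process_prefixed_path('/events', '/events/foobar')          # => 'foobar'
#   process_prefixed_path('/events', '/events/foobar/the-baz')  # => 'foobar'
#   process_prefixed_path('/events', '/other/foobar')           # raises ValueError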
def writefile(xid, content, fname):
    path = join(settings.DUMP_PATH, xid)
    utils.ensure(utils.mkdirs(path), "failed to create path %s" % path)
    path = join(path, fname) # ll: /tmp/elife-metrics/pmc-asdfasdfasdf-482309849230/log
    if isinstance(content, str):
        content = content.encode('utf8')
    with open(path, 'wb') as fh:
        fh.write(content)
    return path
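# for example, assuming settings.DUMP_PATH is '/tmp/elife-metrics':
#   writefile('pmc-asdfasdfasdf-482309849230', '{"some": "response"}', 'log')
#   # => '/tmp/elife-metrics/pmc-asdfasdfasdf-482309849230/log'
# str content is utf-8 encoded before writing; bytes are written as-is.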
def generic_query_processor(ptype, frame):
    # NOTE: ptype is unused, it's just to match a query processor function's signature
    ptype_filter = None
    if frame.get('pattern'):
        ptype_filter = frame['pattern']
    elif frame.get('prefix') and frame.get('path-list'):
        ptype_filter = generic_ga_filter_w_paths(frame['prefix'], frame['path-list'])
    elif frame.get('prefix'):
        ptype_filter = generic_ga_filter(frame['prefix'])
    elif frame.get('path-map'):
        ptype_filter = generic_ga_filter_w_paths('', frame['path-map'].keys())
    ensure(ptype_filter, "building a query requires a 'pattern', 'prefix' or 'path-map' key in the frame data")
    return ptype_filter
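# precedence is 'pattern', then 'prefix' + 'path-list', then 'prefix', then
# 'path-map'. a sketch with a hypothetical frame:
#   generic_query_processor('event', {'prefix': '/events'})
#   # => whatever filter generic_ga_filter('/events') builds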
def fetch(pmcid_list):
    ensure(len(pmcid_list) <= MAX_PER_PAGE,
           "no more than %s results can be processed per-request. requested: %s" % (MAX_PER_PAGE, len(pmcid_list)))
    headers = {'accept': 'application/json'}
    params = {
        # NCBI linknames are named '{dbfrom}_{dbto}_...', so 'pmc_pmc_citedby'
        # links *from* pmc, not pubmed
        'dbfrom': 'pmc',
        'linkname': 'pmc_pmc_citedby',
        'id': lmap(norm_pmcid, pmcid_list),
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'retmode': 'json'
    }
    return handler.requests_get(PM_URL, params=params, headers=headers)
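# a usage sketch with hypothetical pmcids:
#   resp = fetch(['PMC4559886', 'PMC3980123'])
#   resp.json() # elink with retmode=json returns a struct keyed by 'linksets'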
def load_fn(dotted_path):
    try:
        dotted_path = dotted_path.strip().lower().replace('-', '_') # basic path normalisation
        package, funcname = dotted_path.rsplit('.', 1) # 'os.path.join' => 'os.path', 'join'
        package = importlib.import_module(package)
        ensure(hasattr(package, funcname),
               "could not find function %r in package %r for given path: %s" % (funcname, package, dotted_path))
        return getattr(package, funcname)
    except ImportError as err:
        # package doesn't exist
        LOG.debug(str(err))
    except AssertionError as err:
        # package exists but not function
        LOG.debug(str(err))
    return None
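# for example:
#   load_fn('os.path.join')       # => <function join ...>
#   load_fn('os.path.no_such_fn') # => None, failure logged at debug level
#   load_fn('no.such.package')    # => None, failure logged at debug level
# note: the lower() above means functions with uppercase names can't be addressed.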
def query_ga(ptype, query, results_pp=MAX_GA_RESULTS, replace_cache_files=False):
    ensure(is_inrange(results_pp, 1, MAX_GA_RESULTS),
           "`results_pp` must be an integer between 1 and %s" % MAX_GA_RESULTS)
    sd, ed = query['start_date'], query['end_date']
    LOG.info("querying GA for %ss between %s and %s" % (ptype, sd, ed))
    dump_path = ga_core.output_path(ptype, sd, ed)
    # TODO: this settings.TESTING check is a code smell.
    if os.path.exists(dump_path) and not settings.TESTING:
        if not replace_cache_files:
            LOG.info("(cache hit)")
            with open(dump_path, 'r') as fh:
                return json.load(fh)
        # otherwise, fall through and replace the cache file with fresh results
    query['max_results'] = results_pp
    query['start_index'] = 1
    response = ga_core.query_ga(query)
    if not settings.TESTING:
        ga_core.write_results(response, dump_path)
    return response
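# a usage sketch, assuming the query struct carries its dates under the keys used above:
#   query = {'start_date': sd, 'end_date': ed, ...} # plus whatever ga_core expects
#   query_ga('event', query)                            # first call fetches and caches
#   query_ga('event', query, replace_cache_files=True)  # forces a re-fetch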
def update_article(row):
    data = {
        'doi': row['DOI'],
        'pmcid': row['PMCID'],
        'pmid': row['PMID'] or None,
    }
    ensure(data['doi'].startswith(settings.DOI_PREFIX), "refusing to create/update non-journal article: %s" % row)
    if not data['pmid']:
        LOG.warning("no pmid for %s" % data['doi'])
    # the doi values in the csv data look perfect and I've never had a problem with them,
    # but we only do this once per new production machine and it doesn't hurt to check.
    utils.doi2msid(data['doi'], allow_subresource=False)
    return create_or_update(models.Article, data, ['doi'], create=True, update=True, update_check=True)
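# a hypothetical row, as it might appear in the csv data:
#   row = {'DOI': '10.7554/eLife.09560', 'PMCID': 'PMC4559886', 'PMID': '26354291'}
#   update_article(row) # creates or updates the matching models.Article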
def parse_entry(entry):
    "parses a single search result from scopus"
    try:
        citedby_link = first(lfilter(lambda d: d["@ref"] == "scopus-citedby", entry['link']))
        ensure('prism:doi' in entry, "entry is missing 'doi'!", ParseError)
        ensure('citedby-count' in entry, "entry is missing 'citedby-count'!", ParseError)
        ensure(isint(entry['citedby-count']), "citedby count isn't an integer", ParseError)
        if isinstance(entry['prism:doi'], list):
            # multiple dois; keep the first one that parses as a journal msid
            weird_key = "$"
            for struct in entry['prism:doi']:
                doi = struct[weird_key]
                if utils.doi2msid(doi, safe=True, allow_subresource=False):
                    entry['prism:doi'] = doi
                    break
        utils.doi2msid(entry['prism:doi'], allow_subresource=False) # throws AssertionError
        return {
            'doi': entry['prism:doi'],
            'num': int(entry['citedby-count']),
            'source': models.SCOPUS,
            'source_id': citedby_link['@href']
        }
    # errors handled here won't be caught by handler.capture_parse_error
    except AssertionError:
        LOG.warning("discarding scopus citation: failed to parse doi", extra={'response': entry})
        return {'bad': entry}
    except ParseError:
        LOG.warning("discarding scopus citation: failed to parse entry", extra={'response': entry})
        return {'bad': entry}
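# a minimal, hypothetical scopus entry that would parse cleanly:
#   entry = {
#       'link': [{'@ref': 'scopus-citedby', '@href': 'https://www.scopus.com/...'}],
#       'prism:doi': '10.7554/eLife.09560',
#       'citedby-count': '42',
#   }
#   parse_entry(entry)
#   # => {'doi': '10.7554/eLife.09560', 'num': 42,
#   #     'source': models.SCOPUS, 'source_id': 'https://www.scopus.com/...'}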
def build_ga_query(ptype, start_date=None, end_date=None, history_data=None):
    """As we go further back in history the query will change as known epochs overlap.
    These overlaps will truncate the current period to the epoch boundaries."""
    ensure(is_ptype(ptype), "bad page type")

    # if dates given, ensure they are date objects
    start_date and ensure(is_date(start_date), "bad start date")
    end_date and ensure(is_date(end_date), "bad end date")

    # if history data provided, ensure it validates
    if history_data:
        history_data = history.type_object.validate(history_data)

    # extract just the page type we're after
    ptype_history = history_data or history.ptype_history(ptype)
    frame_list = ptype_history['frames'] # frames are ordered oldest to newest (asc)

    earliest_date = frame_list[0]['starts']
    latest_date = frame_list[-1]['ends']
    start_date = start_date or earliest_date
    end_date = end_date or latest_date
    ensure(start_date <= end_date, "start date %r cannot be greater than end date %r" % (start_date, end_date))

    # only those frames that overlap our start/end dates
    frame_list = interesting_frames(start_date, end_date, frame_list)

    # each timeframe requires its own pattern generation, post-processing and normalisation
    query_list = [(frame, build_ga_query__queries_for_frame(ptype, frame, start_date, end_date)) for frame in frame_list]
    return query_list
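# a usage sketch: the result pairs each overlapping frame with its queries.
#   from datetime import date
#   for frame, queries in build_ga_query('event', date(2016, 1, 1), date(2017, 12, 31)):
#       ... # run each query, post-process results per-frame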
def _fetch_pmids(doi):
    # article doesn't have a pmcid for whatever reason,
    # go fetch one using the doi:
    # https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
    LOG.info("fetching pmcid for doi %s" % doi)
    params = {
        'ids': doi,
        'tool': 'elife-metrics',
        'email': settings.CONTACT_EMAIL,
        'format': 'json',
    }
    resp = requests.get(PMID_URL, params=params)
    resp.raise_for_status()
    data = resp.json()
    # ll:
    # {
    #     "status": "ok",
    #     "responseDate": "2017-01-31 13:35:10",
    #     "request": "ids=10.7554%2FeLife.09560;format=json",
    #     "records": [
    #         {
    #             "pmcid": "PMC4559886",
    #             "pmid": "26354291",
    #             "doi": "10.7554/eLife.09560",
    #             "versions": [
    #                 {
    #                     "pmcid": "PMC4559886.1",
    #                     "current": "true"
    #                 }
    #             ]
    #         }
    #     ]
    # }
    ensure(data['status'] == 'ok', "response is not ok! %s" % data)
    return subdict(data['records'][0], ['pmid', 'pmcid'])
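# for example, using the doi from the sample response above:
#   _fetch_pmids('10.7554/eLife.09560')
#   # => {'pmid': '26354291', 'pmcid': 'PMC4559886'}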
def page_views(pid, ptype, period=DAY):
    ensure(is_pid(pid), "bad page identifier", ValueError)
    ensure(is_ptype(ptype), "bad page type", ValueError)
    ensure(is_period(period), "bad period", ValueError)
    try:
        pobj = models.Page.objects.get(identifier=pid, type=ptype)
        dispatch = {
            DAY: daily_page_views,
            MONTH: monthly_page_views
        }
        return dispatch[period](pobj)
    except models.Page.DoesNotExist:
        return None
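# for example, assuming 'event' is a known ptype and 'foobar' a known page:
#   page_views('foobar', 'event')               # daily views for the page
#   page_views('foobar', 'event', period=MONTH) # monthly views
#   page_views('missing', 'event')              # => None, page doesn't exist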
def parse_map_file(frame, contents=None):
    contents and ensure(isinstance(contents, str), "'contents' must be a string")

    def _parse_line(line):
        "the file is a simple 'cat nginx-redirect-file | grep prefix > outfile'"
        line = line.strip()
        if not line:
            return None
        path, redirect = line.split("' '")
        path, redirect = path.strip(" '"), redirect.strip(" ';")
        prefix = frame['redirect-prefix']
        ensure(redirect.startswith(prefix), "redirect doesn't start with redirect-prefix: %s" % line)
        # /inside-elife/foobar => foobar
        bits = redirect.strip('/').split('/', 1) # '/inside-elife/foobar' -> 'inside-elife/foobar' -> ['inside-elife', 'foobar']
        redirect = models.LANDING_PAGE if len(bits) == 1 else bits[1]
        return (path, redirect)

    if contents:
        contents = contents.splitlines()
    else:
        path = os.path.join(settings.GA_PTYPE_SCHEMA_PATH, frame['path-map-file'])
        with open(path, 'r') as fh:
            contents = fh.readlines()
    return OrderedDict(lfilter(None, lmap(_parse_line, contents)))
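# a sketch with hypothetical map lines, assuming frame['redirect-prefix'] is '/inside-elife':
#   contents = "'/old-path' '/inside-elife/foobar';\n'/old-home' '/inside-elife';"
#   parse_map_file(frame, contents)
#   # => OrderedDict([('/old-path', 'foobar'), ('/old-home', models.LANDING_PAGE)])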
def ptype_history(ptype, history=None):
    history = history or load_from_file()
    ensure(ptype in history, "no historical data found: %s" % ptype, ValueError)
    return history[ptype]
def enplumpen(artid):
    "takes an article id like e01234 and returns a DOI like 10.7554/eLife.01234"
    if isint(artid):
        return msid2doi(artid)
    ensure(artid[0] == 'e', 'cannot convert article id %s to doi' % artid)
    return '10.7554/eLife.' + artid[1:] # replace only the leading 'e', not every 'e'
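# for example:
#   enplumpen('e01234') # => '10.7554/eLife.01234'
#   enplumpen(1234)     # => msid2doi(1234), presumably '10.7554/eLife.01234'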