def import_products(provider_name, import_input):
    if provider_name in ["bibtex", "product_id_strings"]:
        logger.debug(u"in import_products with provider_name {provider_name}".format(
            provider_name=provider_name))
    else:
        logger.debug(u"in import_products with provider_name {provider_name}: {import_input}".format(
            provider_name=provider_name, import_input=import_input))

    aliases = []

    # pull in standard items, if we were passed any of these
    if provider_name == "product_id_strings":
        aliases = get_aliases_from_product_id_strings(import_input)
    elif provider_name == "bibtex":
        provider_module = ProviderFactory.get_provider("bibtex")
        aliases = provider_module.member_items(import_input)
    else:
        try:
            provider_module = ProviderFactory.get_provider(provider_name)
            aliases = provider_module.member_items(import_input)
        except ImportError:
            logger.debug(u"in import_products, got ImportError")

    # logger.debug(u"returning from import_products with aliases {aliases}".format(
    #     aliases=aliases))

    return aliases
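# A minimal usage sketch for import_products (not from the original codebase).
# The provider name and input string are made-up examples; any provider known
# to ProviderFactory would work the same way.
def example_import_products():
    aliases = import_products("pubmed", "22374085")  # hypothetical PMID query
    for (namespace, nid) in aliases:
        logger.debug(u"imported alias {namespace}:{nid}".format(
            namespace=namespace, nid=nid))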
def setUp(self):
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [
        ("wikipedia", {})
    ]
    self.d = None

    # set up the test redis database. We're using db number 8
    self.r = tiredis.from_url("redis://localhost:6379", db=8)
    self.r.flushdb()

    provider_queues = {}
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    for provider in providers:
        provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name + "_queue")

    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        provider_queues,
        [backend.PythonQueue("couch_queue")],
        self.r)

    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {},
        "last_modified": datetime.datetime(2013, 1, 1)
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"

    self.db = setup_postgres_for_unittests(db, app)
def provider_method_wrapper(tiid, provider, method_name):
    # logger.info(u"{:20}: in provider_method_wrapper with {tiid} {provider_name} {method_name} with {aliases}".format(
    #     "wrapper", tiid=tiid, provider_name=provider.provider_name, method_name=method_name, aliases=input_aliases_dict))

    product = Product.query.get(tiid)
    if not product:
        logger.warning(u"Empty product in provider_run for tiid {tiid}".format(
            tiid=tiid))
        return None

    input_alias_tuples = product.aliases_for_providers

    try:
        method = getattr(provider, method_name)
    except AttributeError:
        # we were passed a provider name rather than a provider object
        provider = ProviderFactory.get_provider(provider)
        method = getattr(provider, method_name)

    provider_name = provider.provider_name
    worker_name = provider_name + "_worker"

    try:
        method_response = method(input_alias_tuples)
    except ProviderError as e:
        method_response = None
        logger.info(u"{:20}: **ProviderError {tiid} {method_name} {provider_name}, Exception type {exception_type} {exception_arguments}".format(
            worker_name,
            tiid=tiid,
            provider_name=provider_name.upper(),
            method_name=method_name.upper(),
            exception_type=type(e).__name__,
            exception_arguments=e.args))
def test_get_providers_filters_by_aliases(self):
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "aliases")
    provider_names = [provider.__class__.__name__ for provider in providers]
    assert_equals(set(provider_names), set(['Pubmed', 'Mendeley']))
def test_get_providers(self):
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    provider_names = [provider.__class__.__name__ for provider in providers]
    assert_equals(set(provider_names), set(['Mendeley', 'Wikipedia', "Pubmed"]))
def setUp(self):
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]
    self.d = None

    # set up the test redis database, using REDIS_UNITTEST_DATABASE_NUMBER
    self.r = tiredis.from_url("redis://localhost:6379", db=REDIS_UNITTEST_DATABASE_NUMBER)
    self.r.flushdb()

    provider_queues = {}
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    for provider in providers:
        provider_queues[provider.provider_name] = backend.PythonQueue(
            provider.provider_name + "_queue")

    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        provider_queues,
        [backend.PythonQueue("couch_queue")],
        self.r)

    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {},
        "last_modified": datetime.datetime(2013, 1, 1)
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"

    self.db = setup_postgres_for_unittests(db, app)
def importer_post(provider_name):
    """
    Gets aliases associated with a query from a given provider.
    """
    input_string = request.json["input"]

    if provider_name == "pmids":
        provider_name = "pubmed"
    elif provider_name == "dois":
        provider_name = "crossref"
    elif provider_name == "urls":
        provider_name = "webpage"

    try:
        provider = ProviderFactory.get_provider(provider_name)
    except ImportError:
        abort_custom(404, "an importer for provider '{provider_name}' was not found".format(
            provider_name=provider_name))

    try:
        aliases = provider.member_items(input_string)
    except ProviderItemNotFoundError:
        abort_custom(404, "item not found")
    except (ProviderTimeout, ProviderServerError):
        abort_custom(503, "timeout error, might be transient")
    except ProviderError:
        abort(500, "internal error from provider")

    tiids_aliases_map = item_module.create_tiids_from_aliases(aliases, myredis)
    logger.debug(u"in importer_post with {tiids_aliases_map}".format(
        tiids_aliases_map=tiids_aliases_map))

    products_dict = format_into_products_dict(tiids_aliases_map)

    resp = make_response(json.dumps({"products": products_dict}, sort_keys=True, indent=4), 200)
    return resp
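# Sketch of exercising importer_post through the Flask test client, assuming the
# app object is in scope and the route is POST /importer/<provider_name> (both
# assumptions; the real URL rule may differ).
def example_importer_post():
    client = app.test_client()
    resp = client.post("/importer/pmids",
                       data=json.dumps({"input": "22374085"}),  # hypothetical PMID
                       content_type="application/json")
    print resp.status_code, resp.data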
def prep_collection_items(aliases):
    logger.info("got a list of aliases; creating new items for them.")
    try:
        # remove unprintable characters and change lists to tuples
        clean_aliases = [(clean_id(namespace), clean_id(nid)) for [namespace, nid] in aliases]
    except ValueError:
        # log the raw input; clean_aliases was never assigned if we got a ValueError
        logger.error("bad input to POST /collection (requires [namespace, id] pairs): {input}".format(
            input=str(aliases)))
        abort(404, "POST /collection requires a list of [namespace, id] pairs.")

    logger.debug("POST /collection got list of aliases; creating new items for {aliases}".format(
        aliases=str(clean_aliases)))

    (tiids, items) = create_or_find_items_from_aliases(clean_aliases)

    logger.debug("POST /collection saving a group of {num} new items: {items}".format(
        num=len(items), items=str(items)))

    # batch upload the new docs to the db;
    # make sure they are there before the provider updates start happening
    for doc in mydao.db.update(items):
        pass

    # for each item, set the number of providers that need to run before the update is done,
    # and put the item on the update queue
    for item in items:
        myredis.set_num_providers_left(
            item["_id"],
            ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))
        myredis.add_to_alias_queue(item["_id"], item["aliases"])

    return tiids
def provider_memberitems(provider_name):
    """
    Starts a memberitems update for a specified provider, using a supplied file.

    Returns a hash of the file's contents, which is needed to get the memberitems
    output. To get output, poll GET /provider/<provider_name>/memberitems/<hash>?method=async
    """
    # logger.debug("Query POSTed to {provider_name}/memberitems with request headers '{headers}'".format(
    #     provider_name=provider_name,
    #     headers=request.headers))

    file = request.files["file"]
    logger.debug("In provider_memberitems got file")
    logger.debug("filename = " + file.filename)
    query = file.read().decode("utf-8")

    provider = ProviderFactory.get_provider(provider_name)
    memberitems = MemberItems(provider, myredis)
    query_hash = memberitems.start_update(query)

    response_dict = {"query_hash": query_hash}
    resp = make_response(json.dumps(response_dict), 201)  # created
    resp.mimetype = "application/json"
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp
def create_item(namespace, nid, myredis, mydao):
    logger.debug("In create_item with alias " + str((namespace, nid)))
    item = make()

    namespace = clean_id(namespace)
    nid = clean_id(nid)

    item["aliases"][namespace] = [nid]
    item["aliases"] = canonical_aliases(item["aliases"])

    # set this so we know when it's still updating later on
    myredis.set_num_providers_left(
        item["_id"],
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

    mydao.save(item)
    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(
        id=item["_id"],
        alias=str((namespace, nid))))

    mixpanel.track("Create:Item", {"Namespace": namespace})

    try:
        return item["_id"]
    except AttributeError:
        abort(500)
def test_get_providers_filters_by_biblio(self):
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "biblio")
    provider_names = [provider.__class__.__name__ for provider in providers]
    assert_equals(set(provider_names), set(['Pubmed']))
def collection_update(cid=""):
    # first, get the tiids in this collection
    try:
        collection = mydao.get(cid)
        tiids = collection["alias_tiids"].values()
    except Exception:
        logger.exception("couldn't get tiids for collection '{cid}'".format(cid=cid))
        abort(404, "couldn't get tiids for this collection...maybe it doesn't exist?")

    # put each of them on the update queue
    for tiid in tiids:
        logger.debug("In collection_update with tiid " + tiid)

        # set this so we know when it's still updating later on
        myredis.set_num_providers_left(
            tiid,
            ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

        item_doc = mydao.get(tiid)
        try:
            myredis.add_to_alias_queue(item_doc["_id"], item_doc["aliases"])
        except (KeyError, TypeError):
            logger.debug("couldn't get item_doc for {tiid}. Skipping its update".format(tiid=tiid))

    resp = make_response("true", 200)
    resp.mimetype = "application/json"
    return resp
def setUp(self):
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [
        ("wikipedia", {})
    ]

    # hacky way to delete the "ti" db, then make it fresh again for each test
    temp_dao = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))
    temp_dao.delete_db(os.getenv("CLOUDANT_DB"))
    self.d = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))

    # do the same thing for the redis db: set up the test redis database. We're using db number 8
    self.r = tiredis.from_url("redis://localhost:6379", db=8)
    self.r.flushdb()

    provider_queues = {}
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    for provider in providers:
        provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name + "_queue")

    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        provider_queues,
        [backend.PythonQueue("couch_queue")],
        self.r)

    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {}
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"
def provider_run(aliases_dict, tiid, method_name, provider_name):
    provider = ProviderFactory.get_provider(provider_name)

    # logger.info(u"in provider_run for {provider}".format(
    #     provider=provider.provider_name))

    (success, estimated_wait_seconds) = rate.acquire(provider_name, block=False)
    # add up to 3 random seconds to spread the retries out
    estimated_wait_seconds += random.random() * 3
    if not success:
        logger.warning(u"RATE LIMIT HIT in provider_run for {provider} {method_name} {tiid}, retrying".format(
            provider=provider.provider_name, method_name=method_name, tiid=tiid))
        provider_run.retry(args=[aliases_dict, tiid, method_name, provider_name],
                           countdown=estimated_wait_seconds,
                           max_retries=10)

    timeout_seconds = 30
    try:
        with timeout.Timeout(timeout_seconds):
            response = provider_method_wrapper(tiid, aliases_dict, provider, method_name)
    except timeout.Timeout:
        msg = u"TIMEOUT in provider_run for {provider} {method_name} {tiid} after {timeout_seconds} seconds".format(
            provider=provider.provider_name, method_name=method_name, tiid=tiid,
            timeout_seconds=timeout_seconds)
        # logger.warning(msg)  # message is written elsewhere
        raise ProviderTimeout(msg)

    return response
def sniffer(item_aliases, provider_config=default_settings.PROVIDERS):
    (genre, host) = item_module.decide_genre(item_aliases)

    all_metrics_providers = [provider.provider_name
                             for provider in ProviderFactory.get_providers(provider_config, "metrics")]

    if (genre == "article") and (host != "arxiv"):
        run = [[("aliases", provider)] for provider in ["mendeley", "crossref", "pubmed", "altmetric_com"]]
        run += [[("biblio", provider) for provider in ["crossref", "pubmed", "mendeley", "webpage"]]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]
    elif (host == "arxiv") or ("doi" in item_aliases):
        run = [[("aliases", provider)] for provider in [host, "altmetric_com"]]
        run += [[("biblio", provider) for provider in [host, "mendeley"]]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = [host]
        if relevant_providers == ["unknown"]:
            relevant_providers = ["webpage"]
        run = [[("aliases", provider)] for provider in relevant_providers]
        run += [[("biblio", provider) for provider in relevant_providers]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]

    return run
def get_metric_names(providers_config):
    full_metric_names = []
    providers = ProviderFactory.get_providers(providers_config)
    for provider in providers:
        metric_names = provider.metric_names()
        for metric_name in metric_names:
            full_metric_names.append(provider.provider_name + ':' + metric_name)
    return full_metric_names
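# Illustration of the "provider:metric_name" strings that get_metric_names
# returns, using default_settings.PROVIDERS (assumed importable) as the config.
def example_get_metric_names():
    full_metric_names = get_metric_names(default_settings.PROVIDERS)
    # expect entries shaped like "wikipedia:mentions" or "mendeley:readers"
    for full_metric_name in full_metric_names:
        (provider_name, metric_name) = full_metric_name.split(":", 1)
        print provider_name, metric_name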
def test_03_init_aliases(self):
    providers = ProviderFactory.get_providers(self.config)
    pat = ProvidersAliasThread(providers, self.d)
    assert hasattr(pat, "stop")
    assert hasattr(pat, "stopped")
    assert hasattr(pat, "first")
    assert pat.queue is not None
def providers():
    metadata = ProviderFactory.get_all_metadata()
    metadata_list = []
    for k, v in metadata.iteritems():
        v["name"] = k
        metadata_list.append(v)
    return json_resp_from_thing(metadata_list)
def test_get_providers_filters_by_metrics(self):
    # since all the providers do metrics, the "metrics" arg changes nothing
    providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "metrics")
    provider_names = [provider.__class__.__name__ for provider in providers]
    assert_equals(set(provider_names), set(['Mendeley', 'Wikipedia', "Pubmed"]))
def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
    # default to nothing
    aliases_providers = []
    biblio_providers = []
    metrics_providers = []

    all_metrics_providers = [provider.provider_name
                             for provider in ProviderFactory.get_providers(provider_config, "metrics")]

    (genre, host) = item_module.decide_genre(item_aliases)

    has_enough_alias_urls = ("url" in item_aliases)
    if has_enough_alias_urls:
        if ("doi" in item_aliases):
            has_enough_alias_urls = (len([url for url in item_aliases["url"]
                                          if url.startswith("http://dx.doi.org")]) > 0)

    if (genre == "article"):
        if not "mendeley" in aliases_providers_run:
            aliases_providers = ["mendeley"]
        elif not "crossref" in aliases_providers_run:
            # do this before pubmed because it might tease a doi from a url
            aliases_providers = ["crossref"]
        elif not "pubmed" in aliases_providers_run:
            aliases_providers = ["pubmed"]
        else:
            metrics_providers = all_metrics_providers
            biblio_providers = ["crossref", "pubmed", "webpage"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = [host]
        if relevant_providers == ["unknown"]:
            relevant_providers = ["webpage"]

        # the aliases are done if all the relevant providers have already run,
        # or if the item already has enough urls
        if has_enough_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
            metrics_providers = all_metrics_providers
            biblio_providers = relevant_providers
        else:
            aliases_providers = relevant_providers

    return {
        "aliases": aliases_providers,
        "biblio": biblio_providers,
        "metrics": metrics_providers
    }
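# Sketch of consuming the sniffer's phase dict. Since the method takes cls, it
# presumably lives on a class; ItemSaver below is a hypothetical name for that
# owning class, and the alias dict is a made-up example.
def example_sniffer():
    item_aliases = {"doi": ["10.1371/journal.pone.0000000"]}  # hypothetical DOI
    to_run = ItemSaver.sniffer(item_aliases, aliases_providers_run=[])
    for method_name in ("aliases", "biblio", "metrics"):
        for provider_name in to_run[method_name]:
            print method_name, provider_name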
def test_get_all_metric_names(self):
    response = ProviderFactory.get_all_metric_names(self.TEST_PROVIDER_CONFIG)
    expected = [
        'wikipedia:mentions',
        'mendeley:country',
        'pubmed:pmc_citations_reviews',
        'mendeley:discipline',
        'pubmed:f1000',
        'mendeley:career_stage',
        'pubmed:pmc_citations_editorials',
        'mendeley:readers',
        'pubmed:pmc_citations',
        'mendeley:groups'
    ]
    assert_equals(response, expected)
def provider_memberitems(provider_name):
    """
    Makes a description string (like bibtex) into a dict of strings describing items.
    """
    provider = ProviderFactory.get_provider(provider_name)
    items_dict = provider.parse(request.json["descr"])
    resp = make_response(
        json.dumps({"memberitems": items_dict}, sort_keys=True, indent=4),
        200)
    return resp
def provider_memberitems(provider_name):
    query = request.values.get('query', '')
    logger.debug("In provider_memberitems with " + query)

    provider = ProviderFactory.get_provider(provider_name)
    logger.debug("provider: " + provider.provider_name)
    memberitems = provider.member_items(query, cache_enabled=False)

    resp = make_response(json.dumps(memberitems, sort_keys=True, indent=4), 200)
    resp.mimetype = "application/json"
    return resp
def provider_biblio(provider_name, id):
    provider = ProviderFactory.get_provider(provider_name)
    if id == "example":
        id = provider.example_id[1]
        url = "http://localhost:8080/" + provider_name + "/biblio?%s"
    else:
        url = None

    biblio = provider.get_biblio_for_id(id, url, cache_enabled=False)

    resp = make_response(json.dumps(biblio, sort_keys=True, indent=4))
    resp.mimetype = "application/json"
    return resp
def test_alias_queue(self):
    self.d.create_new_db_and_connect(self.testing_db_name)
    providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])

    response = self.client.post('/item/doi/' + quote_plus(TEST_DRYAD_DOI))
    tiid = json.loads(response.data)

    # now get it back out
    response = self.client.get('/item/' + tiid)
    print tiid
    assert_equals(response.status_code, 200)

    resp_dict = json.loads(response.data)
    assert_equals(
        set(resp_dict.keys()),
        set([u'tiid', u'created', u'last_requested', u'metrics',
             u'last_modified', u'biblio', u'id', u'aliases']))
    assert_equals(unicode(TEST_DRYAD_DOI), resp_dict["aliases"]["doi"][0])

    # test that the view works
    res = self.d.view("aliases")
    assert len(res["rows"]) == 1, res
    assert_equals(TEST_DRYAD_DOI, res["rows"][0]["value"]["aliases"]["doi"][0])

    # see if the item is on the queue
    my_alias_queue = AliasQueue(self.d)
    assert isinstance(my_alias_queue.queue, list)
    assert_equals(len(my_alias_queue.queue), 1)

    # get our item from the queue
    my_item = my_alias_queue.first()
    assert_equals(my_item.aliases.doi[0], TEST_DRYAD_DOI)

    # do the update using the backend
    alias_thread = ProvidersAliasThread(providers, self.d)
    alias_thread.run(run_only_once=True)

    # get the item back out again and bask in the awesome
    response = self.client.get('/item/' + tiid)
    resp_dict = json.loads(response.data)
    print tiid
    print response.data
    assert_equals(
        resp_dict["aliases"]["title"][0],
        "data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides")
    print resp_dict
    assert_equals(resp_dict["biblio"]["data"]["year"], "2010")
def setUp(self):
    # set up the api test client
    self.app = api.app
    self.app.testing = True
    self.client = self.app.test_client()

    # set up the database
    self.testing_db_name = "metrics_queue_test"
    self.old_db_name = self.app.config["DB_NAME"]
    self.app.config["DB_NAME"] = self.testing_db_name
    self.d = dao.Dao(self.testing_db_name, self.app.config["DB_URL"],
                     self.app.config["DB_USERNAME"], self.app.config["DB_PASSWORD"])

    self.providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])
def get_metric_value_lists(items):
    (ordered_fieldnames, rows) = make_csv_rows(items)
    metric_values = {}
    for metric_name in ProviderFactory.get_all_metric_names():
        if metric_name in ordered_fieldnames:
            if metric_name in ["tiid", "title", "doi"]:
                pass
            else:
                values = [row[metric_name] for row in rows]
                values = [value if value else 0 for value in values]
                # treat "Yes" as 1 for normalizations
                values = [1 if value == "Yes" else value for value in values]
                metric_values[metric_name] = sorted(values, reverse=True)
        else:
            metric_values[metric_name] = [0 for row in rows]
    return metric_values
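# Sketch of reading get_metric_value_lists output: one descending-sorted list of
# values per known metric name, zero-filled for metrics absent from the rows.
# The items argument would be whatever make_csv_rows accepts.
def example_metric_value_lists(items):
    metric_values = get_metric_value_lists(items)
    for (metric_name, values) in metric_values.items():
        print metric_name, values[:3]  # the three highest values per metric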
def rq_metrics_for_all_live_profiles(args):
    url_slug = args.get("url_slug", None)
    tiid = args.get("tiid", None)
    no_rq = args.get("no_rq", False)
    limit = args.get("limit", 5)
    if url_slug:
        limit = 1
    queue_number = 0

    q = db.session.query(Product.tiid).select_from(Profile)
    q = q.filter(Product.removed == None)
    q = q.join(Profile.products)
    if url_slug:
        q = q.filter(Profile.url_slug == url_slug)
    elif tiid:
        q = q.filter(Product.tiid == tiid)
    else:
        from totalimpactwebapp.profile import default_free_trial_days
        min_created_date = datetime.datetime.utcnow() - datetime.timedelta(days=default_free_trial_days)
        q = q.filter(or_(Profile.is_advisor != None,
                         Profile.stripe_id != None,
                         Profile.created >= min_created_date))
        # q = q.filter(Profile.next_refresh <= datetime.datetime.utcnow())
    q = q.order_by(Product.last_refresh_finished)  # oldest first
    q = q.limit(limit)
    print "q=", q

    all_metrics_provider_names = [p.provider_name
                                  for p in ProviderFactory.get_providers(default_settings.PROVIDERS, "metrics")]

    for tiid in q.all():
        print "tiid", tiid
        for provider_name in all_metrics_provider_names:
            print "putting {} on rq queue to run metrics through {}".format(
                tiid, provider_name)
            if no_rq:
                print "asked for no-rq, so calling right now"
                provider_method_wrapper(tiid, provider_name, "metrics")
            else:
                job = ti_queues[queue_number].enqueue_call(
                    func=provider_method_wrapper,
                    args=(tiid, provider_name, "metrics"),
                    timeout=60 * 10,
                    result_ttl=0  # number of seconds
                )
                job.save()
def create_item(namespace, nid):
    logger.debug("In create_item with alias " + str((namespace, nid)))
    item = ItemFactory.make()

    # set this so we know when it's still updating later on
    myredis.set_num_providers_left(
        item["_id"],
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

    item["aliases"][namespace] = [nid]
    mydao.save(item)

    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(
        id=item["_id"], alias=str((namespace, nid))))

    try:
        return item["_id"]
    except AttributeError:
        abort(500)
def provider_memberitems(provider_name):
    """
    Makes a file into a dict of strings describing items.
    """
    mixpanel.track("Trigger:Import", {"Provider": provider_name}, request)

    file = request.files['file']
    logger.debug("In " + provider_name + "/memberitems, got file: filename=" + file.filename)
    entries_str = file.read().decode("utf-8")

    provider = ProviderFactory.get_provider(provider_name)
    items_dict = provider.parse(entries_str)

    resp = make_response(json.dumps(items_dict), 200)
    resp.mimetype = "application/json"
    resp.headers['Access-Control-Allow-Origin'] = "*"
    return resp
def main():
    mydao = None
    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))

    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear the alias_queue:
    # import redis, os
    # myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    # myredis.delete(["aliasqueue"])

    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for i in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[i] = PythonQueue(i + "_couch_queue")
        couch_worker = CouchWorker(couch_queues[i], myredis, mydao)
        couch_worker.spawn_and_loop()
        logger.info(u"launched backend couch worker with {i}_couch_queue".format(
            i=i))

    polling_interval = 0.1  # seconds to wait between polls when talking to a provider

    provider_queues = {}
    providers = ProviderFactory.get_providers(default_settings.PROVIDERS)
    for provider in providers:
        provider_queues[provider.provider_name] = PythonQueue(provider.provider_name + "_queue")
        provider_worker = ProviderWorker(
            provider,
            polling_interval,
            alias_queue,
            provider_queues[provider.provider_name],
            couch_queues,
            ProviderWorker.wrapper,
            myredis)
        provider_worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop()  # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit):
        # this approach is per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
def provider_aliases(provider_name, id):
    provider = ProviderFactory.get_provider(provider_name)
    if id == "example":
        id = provider.example_id[1]
        url = "http://localhost:8080/" + provider_name + "/aliases?%s"
    else:
        url = None

    try:
        new_aliases = provider._get_aliases_for_id(id, url, cache_enabled=False)
    except NotImplementedError:
        new_aliases = []

    all_aliases = [(provider.example_id[0], id)] + new_aliases

    resp = make_response(json.dumps(all_aliases, sort_keys=True, indent=4))
    resp.mimetype = "application/json"
    return resp
def start_item_update(tiids, myredis, mydao, sleep_in_seconds=0):
    # put each of them on the update queue
    for tiid in tiids:
        logger.debug("In start_item_update with tiid " + tiid)

        # set this so we know when it's still updating later on
        myredis.set_num_providers_left(
            tiid,
            ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

        item_doc = mydao.get(tiid)
        try:
            myredis.add_to_alias_queue(item_doc["_id"], item_doc["aliases"])
        except (KeyError, TypeError):
            logger.debug("couldn't get item_doc for {tiid}. Skipping its update".format(
                tiid=tiid))

        time.sleep(sleep_in_seconds)
def provider_memberitems_get(provider_name, query):
    provider = ProviderFactory.get_provider(provider_name)
    memberitems = MemberItems(provider, myredis)

    method = request.args.get("method", "sync")
    try:
        ret = getattr(memberitems, "get_" + method)(query)
    except ProviderItemNotFoundError:
        abort(404)
    except ProviderError:
        abort(500)

    if ret:
        if ret["error"]:
            abort(503)  # crossref lookup error, might be transient

    resp = make_response(json.dumps(ret, sort_keys=True, indent=4), 200)
    resp.mimetype = "application/json"
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp
def provider_memberitems_get(provider_name, query):
    """
    Gets aliases associated with a query from a given provider.
    """
    query = unicode_helpers.remove_nonprinting_characters(query)

    provider = ProviderFactory.get_provider(provider_name)
    try:
        items_dict = provider.member_items(query)
    except ProviderItemNotFoundError:
        abort_custom(404, "item not found")
    except (ProviderTimeout, ProviderServerError):
        abort_custom(503, "crossref lookup error, might be transient")
    except ProviderError:
        abort(500, "internal error from provider")

    resp = make_response(
        json.dumps({"memberitems": items_dict}, sort_keys=True, indent=4),
        200)
    return resp
def provider_memberitems_get(provider_name, query):
    """
    Gets aliases associated with a query from a given provider.
    """
    mixpanel.track("Trigger:Import", {"Provider": provider_name}, request)

    try:
        provider = ProviderFactory.get_provider(provider_name)
        ret = provider.member_items(query)
    except ProviderItemNotFoundError:
        abort(404)
    except (ProviderTimeout, ProviderServerError):
        abort(503)  # crossref lookup error, might be transient
    except ProviderError:
        abort(500)

    resp = make_response(
        json.dumps({"memberitems": ret}, sort_keys=True, indent=4),
        200)
    resp.mimetype = "application/json"
    resp.headers['Access-Control-Allow-Origin'] = "*"
    return resp
def setUp(self):
    self.provider = ProviderFactory.get_provider(self.provider_name)
    self.old_http_get = Provider.http_get
def start_item_update(tiid, aliases_dict, myredis):
    logger.debug(u"In start_item_update with {tiid}, /biblio_print {aliases_dict}".format(
        tiid=tiid, aliases_dict=aliases_dict))
    myredis.set_num_providers_left(
        tiid,
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))
    myredis.add_to_alias_queue(tiid, aliases_dict)
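# Minimal sketch of queuing a single item for update, mirroring how the unit
# tests here build their redis connection (db number 8 is the test database).
# The tiid and alias dict are placeholders.
def example_start_item_update():
    myredis = tiredis.from_url("redis://localhost:6379", db=8)
    start_item_update("abcd", {"pmid": ["111"]}, myredis)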
def provider():
    ret = ProviderFactory.get_all_metadata()
    resp = make_response(json.dumps(ret, sort_keys=True, indent=4), 200)
    return resp
from totalimpact import json_sqlalchemy
from totalimpact import db

# Master lock to ensure that only a single thread can write
# to the DB at one time to avoid document conflicts

import logging
logger = logging.getLogger('ti.item')

# print out extra debugging
# logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

all_static_meta = ProviderFactory.get_all_static_meta()


class NotAuthenticatedError(Exception):
    pass


def delete_item(tiid):
    item_object = Item.from_tiid(tiid)
    db.session.delete(item_object)
    try:
        db.session.commit()
    except (IntegrityError, FlushError) as e:
        db.session.rollback()
        logger.warning(u"Fails Integrity check in delete_item for {tiid}, rolling back. Message: {message}".format(
            tiid=tiid,
def test_get_all_metadata(self):
    md = ProviderFactory.get_all_metadata(self.TEST_PROVIDER_CONFIG)
    print md["pubmed"]
    assert_equals(md["pubmed"]['url'], 'http://pubmed.gov')
def test_get_all_static_meta(self):
    sm = ProviderFactory.get_all_static_meta(self.TEST_PROVIDER_CONFIG)
    expected = 'The number of citations by papers in PubMed Central'
    assert_equals(sm["pubmed:pmc_citations"]["description"], expected)
def test_get_provider(self):
    provider = ProviderFactory.get_provider("wikipedia")
    assert_equals(provider.__class__.__name__, "Wikipedia")