def cache_api_response(my_saved_object):
    entity_term = my_saved_object.entity_title
    entity_term = entity_term.replace(u" ", u"_")

    url = u"https://gtr-api.herokuapp.com/search/{}?automated=true&nocache=true".format(entity_term)
    r = requests.get(url)
    print r
    print url
    my_saved_object.api_response = r.json()
    flag_modified(my_saved_object, "api_response")  # required, to force sqlalchemy to update because jsonb

    url = u"https://gtr-api.herokuapp.com/search/{}?oa=true&automated=true&nocache=true".format(entity_term)
    r = requests.get(url)
    print r
    print url
    my_saved_object.api_response_oa_only = r.json()
    flag_modified(my_saved_object, "api_response_oa_only")  # required, to force sqlalchemy to update because jsonb

    my_saved_object.collected = datetime.datetime.utcnow()
    db.session.merge(my_saved_object)
    safe_commit(db)
    print ".",
    return
def package_create(jusp_id, institution_id, package_type):
    jisc_package_id = u"package-jiscels{}".format(jusp_id)
    package_id = u"package-n8els_{}_{}".format(jusp_id, package_type.replace(" ", ""))
    package_name = u"Elsevier n8 ({})".format(package_type)
    scenario_id = u"scenario-n8els_{}_{}".format(jusp_id, package_type.replace(" ", ""))
    scenario_name = u"n8 ({})".format(package_type)

    my_package = Package.query.get(package_id)
    if not my_package:
        print u"package {} doesn't exist, making".format(package_id)
        my_package = Package(
            package_id=package_id,
            publisher="Elsevier",
            package_name=package_name,
            created=datetime.datetime.utcnow().isoformat(),
            institution_id=institution_id,
            is_demo=False,
            currency="GBP")
        db.session.add(my_package)
        print my_package
        safe_commit(db)

    if package_type == "own pta":
        copy_into_n8_package(old_package_id=jisc_package_id, new_package_id=package_id, copy_perpetual_access=True)
    elif package_type == "group pta":
        copy_into_n8_package(old_package_id=jisc_package_id, new_package_id=package_id, copy_perpetual_access=False)
    elif package_type == "uk pta":
        copy_into_n8_package(old_package_id=jisc_package_id, new_package_id=package_id, copy_perpetual_access=False)

    my_scenario = SavedScenario.query.get(scenario_id)
    if not my_scenario:
        print u"scenario {} doesn't exist, making".format(scenario_id)
        my_scenario = SavedScenario(False, scenario_id, None)
        my_scenario.package_id = package_id
        my_scenario.created = datetime.datetime.utcnow().isoformat()
        db.session.add(my_scenario)
        safe_commit(db)

    print "updating settings, including big deal cost from jisc package"
    big_deal_price = get_sql_answer(db, "select big_deal_cost from jump_account_package where package_id = '{}';".format(jisc_package_id))

    dict_to_save = my_scenario.to_dict_saved_from_db()
    dict_to_save["name"] = scenario_name
    dict_to_save["configs"]["cost_bigdeal"] = big_deal_price
    dict_to_save["configs"]["cost_bigdeal_increase"] = 2
    dict_to_save["configs"]["include_social_networks"] = True  # set to true
    dict_to_save["configs"]["weight_authorship"] = 0  # 100
    dict_to_save["configs"]["weight_citation"] = 0  # 10
    save_raw_scenario_to_db(scenario_id, dict_to_save, None)
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()
    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
def add_new_log_temp_profile(my_temp_person, request=None):
    if LogTempProfile.query.get(my_temp_person.orcid_id):
        return
    new_log = LogTempProfile(my_temp_person, request)
    db.session.add(new_log)
    safe_commit(db)
def set_coauthors(self):
    # commit first, to make sure fresh session etc
    safe_commit(db)

    # now go for it
    print u"running coauthors for {}".format(self.orcid_id)
    coauthor_orcid_id_query = u"""select distinct orcid_id
        from product
        where doi in (select doi from product where orcid_id='{}')""".format(self.orcid_id)
    rows = db.engine.execute(text(coauthor_orcid_id_query))
    orcid_ids = [row[0] for row in rows]
    coauthors = Person.query.filter(Person.orcid_id.in_(orcid_ids)).all()

    resp = {}
    for coauthor in coauthors:
        if coauthor.id != self.id:
            resp[coauthor.orcid_id] = {
                "name": coauthor.full_name,
                "id": coauthor.id,
                "orcid_id": coauthor.orcid_id,
                "openness_perc": coauthor.display_openness_perc,
                "engagement_perc": coauthor.display_engagement_perc,
                "buzz_perc": coauthor.display_buzz_perc
            }

    self.coauthors = resp
def add_new_log(my_temp_person, request=None):
    if LogTempProfile.query.get(my_temp_person.orcid_id):
        return
    new_log = LogTempProfile(my_temp_person, request)
    db.session.add(new_log)
    safe_commit(db)
def from_bq_overwrite_data(db_tablename, bq_tablename):
    temp_data_filename = 'data_export.csv'

    column_names = from_bq_to_local_file(temp_data_filename, bq_tablename, header=False)
    print "column_names", column_names
    print "\n"

    cursor = db.session.connection().connection.cursor()
    cursor.execute(u"truncate {};".format(db_tablename))

    # replace quoted tabs with just a tab, because the quote is there by mistake
    # temp_data_cleaned_filename = 'data_export_cleaned.csv'
    # o = open(temp_data_cleaned_filename, "w")
    # data = open(temp_data_filename).read()
    # o.write(re.sub("\t", "|", re.sub("|", " ", data)))
    # o.close()

    with open(temp_data_filename, "rb") as f:
        cursor.copy_from(f, db_tablename, sep='\t', columns=column_names, null="")

    # this commit is necessary
    safe_commit(db)
def add_endpoint(my_request):
    if not my_request.pmh_url:
        return None

    endpoint_with_this_id = Endpoint.query.filter(Endpoint.repo_request_id == my_request.id).first()
    if endpoint_with_this_id:
        print u"one already matches {}".format(my_request.id)
        return None

    raw_endpoint = my_request.pmh_url
    clean_endpoint = raw_endpoint.strip()
    clean_endpoint = clean_endpoint.strip("?")
    # strip any trailing ?verb=... query string, case-insensitively
    clean_endpoint = re.sub(u"\?verb=.*$", "", clean_endpoint, flags=re.IGNORECASE)
    print u"raw endpoint is {}, clean endpoint is {}".format(raw_endpoint, clean_endpoint)

    matching_endpoint = Endpoint()
    matching_endpoint.pmh_url = clean_endpoint

    repo_matches = my_request.matching_repositories()
    if repo_matches:
        matching_repo = repo_matches[0]
        print u"yay! for {} {} matches repository {}".format(
            my_request.institution_name, my_request.repo_name, matching_repo)
    else:
        print u"no matching repository for {}: {}".format(
            my_request.institution_name, my_request.repo_name)
        matching_repo = Repository()

    # overwrite stuff with request
    matching_repo.institution_name = my_request.institution_name
    matching_repo.repository_name = my_request.repo_name
    matching_repo.home_page = my_request.repo_home_page
    matching_endpoint.repo_unique_id = matching_repo.id
    matching_endpoint.email = my_request.email
    matching_endpoint.repo_request_id = my_request.id
    matching_endpoint.ready_to_run = True
    matching_endpoint.set_identify_and_initial_query()

    db.session.merge(matching_endpoint)
    db.session.merge(matching_repo)
    print u"added {} {}".format(matching_endpoint, matching_repo)
    print u"see at url http://unpaywall.org/sources/repository/{}".format(matching_endpoint.id)
    safe_commit(db)
    print "saved"

    print "now sending email"
    # get the endpoint again, so it gets with all the meta info etc
    matching_endpoint = Endpoint.query.get(matching_endpoint.id)
    matching_endpoint.contacted_text = "automated welcome email"
    matching_endpoint.contacted = datetime.datetime.utcnow().isoformat()
    safe_commit(db)
    send_announcement_email(matching_endpoint)

    print "email sent"

    return matching_endpoint
def add_pubs_or_update_crossref(pubs):
    if not pubs:
        return []

    pubs_by_id = dict((p.id, p) for p in pubs)

    existing_pub_ids = set([
        id_tuple[0] for id_tuple in
        db.session.query(Pub.id).filter(Pub.id.in_(list(pubs_by_id.keys()))).all()
    ])

    pubs_to_add = [p for p in pubs if p.id not in existing_pub_ids]
    pubs_to_update = [p for p in pubs if p.id in existing_pub_ids]

    if pubs_to_add:
        logger.info("adding {} pubs".format(len(pubs_to_add)))
        db.session.add_all(pubs_to_add)

    if pubs_to_update:
        row_dicts = [{'id': p.id, 'crossref_api_raw_new': p.crossref_api_raw_new} for p in pubs_to_update]

        logger.info("updating {} pubs".format(len(pubs_to_update)))
        db.session.bulk_update_mappings(Pub, row_dicts)

    safe_commit(db)
    return pubs_to_add
def commit_repo(repo):
    try:
        db.session.commit()
    except DataError:
        print "error committing repo, rolling back and setting save error for ", repo
        db.session.rollback()
        repo.set_save_error()
        safe_commit(db)
def get_pub_from_biblio(biblio, run_with_hybrid=False, skip_all_hybrid=False):
    my_pub = lookup_product(**biblio)
    if run_with_hybrid:
        my_pub.run_with_hybrid()
        safe_commit(db)
    else:
        my_pub.recalculate()

    return my_pub
def add_endpoint(my_request):
    if not my_request.pmh_url:
        return None

    endpoint_with_this_id = Endpoint.query.filter(Endpoint.repo_request_id == my_request.id).first()
    if endpoint_with_this_id:
        print u"one already matches {}".format(my_request.id)
        return None

    raw_endpoint = my_request.pmh_url
    clean_endpoint = raw_endpoint.strip()
    clean_endpoint = clean_endpoint.strip("?")
    # strip any trailing ?verb=... query string, case-insensitively
    clean_endpoint = re.sub(u"\?verb=.*$", "", clean_endpoint, flags=re.IGNORECASE)
    print u"raw endpoint is {}, clean endpoint is {}".format(raw_endpoint, clean_endpoint)

    matching_endpoint = Endpoint()
    matching_endpoint.pmh_url = clean_endpoint

    repo_matches = my_request.matching_repositories()
    if repo_matches:
        matching_repo = repo_matches[0]
        print u"yay! for {} {} matches repository {}".format(
            my_request.institution_name, my_request.repo_name, matching_repo)
    else:
        print u"no matching repository for {}: {}".format(
            my_request.institution_name, my_request.repo_name)
        matching_repo = Repository()

    # overwrite stuff with request
    matching_repo.institution_name = my_request.institution_name
    matching_repo.repository_name = my_request.repo_name
    matching_repo.home_page = my_request.repo_home_page
    matching_endpoint.repo_unique_id = matching_repo.id
    matching_endpoint.email = my_request.email
    matching_endpoint.repo_request_id = my_request.id
    matching_endpoint.ready_to_run = True
    matching_endpoint.set_identify_and_initial_query()

    db.session.merge(matching_endpoint)
    db.session.merge(matching_repo)
    print u"added {} {}".format(matching_endpoint, matching_repo)
    print u"see at url http://unpaywall.org/sources/repository/{}".format(matching_endpoint.id)
    safe_commit(db)
    print "saved"

    print "now sending email"
    # get the endpoint again, so it gets with all the meta info etc
    matching_endpoint = Endpoint.query.get(matching_endpoint.id)
    matching_endpoint.contacted_text = "automated welcome email"
    matching_endpoint.contacted = datetime.datetime.utcnow().isoformat()
    safe_commit(db)
    send_announcement_email(matching_endpoint)

    print "email sent"

    return matching_endpoint
def maint(self, **kwargs):
    # endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response == None, Endpoint.error == None).all()
    endpoints = Endpoint.query.filter(Endpoint.repo_request_id != None).all()
    shuffle(endpoints)
    for my_endpoint in endpoints:
        my_endpoint.run_diagnostics()
        logger.info(u"my_endpoint: {}".format(my_endpoint))
        db.session.merge(my_endpoint)
        safe_commit(db)
def post_gs_cache(**kwargs):
    my_doi = clean_doi(kwargs["doi"])
    q = Gs.query.filter(Gs.doi == my_doi, Gs.landing_page_url == kwargs["landing_page_url"])
    my_gs = q.first()
    if not my_gs:
        my_gs = Gs(**kwargs)
        db.session.add(my_gs)
        safe_commit(db)
    return my_gs
def update_refsets():
    from models.person import Person

    print u"getting the badge percentile refsets...."

    # only get out the badge objects
    q = db.session.query(Person).options(Load(Person).load_only("campaign", "orcid_id"))
    q = q.options(orm.noload('*'))
    q = q.options(orm.subqueryload("badges"))

    # limit to just what we want for the refset
    q = refine_refset_query(q)

    # and do the get
    rows = q.all()

    print u"query finished, now set the values in the lists"
    refset_list_dict = defaultdict(list)
    for person in rows:
        for badge in person.badges:
            # print "BADGE", badge
            # handle the nones below, with the zeros
            if badge.value != None:
                refset_list_dict[badge.name].append(badge.value)

    num_in_refset = num_people_in_refset()

    for name, unsorted_values in refset_list_dict.iteritems():
        print u"refreshing refset {}".format(name)

        assigner = get_badge_assigner(name)
        if assigner.pad_percentiles_with_zeros:
            # pad with zeros for all the people who didn't get the badge
            unsorted_values.extend([0] * (num_in_refset - len(unsorted_values)))

        # now sort
        refset_list_dict[name] = sorted(unsorted_values)

        # now pick out the cutoffs, minimum value at each of 100
        cutoffs = []
        for sublist in chunk_into_n_sublists(refset_list_dict[name], 100):
            sublist_values = sublist
            if sublist_values:
                cutoffs.append(min(sublist_values))

        this_badge_refset = Refset(name=name, cutoffs=cutoffs)
        print u"saving refset {} with cutoffs {}".format(name, cutoffs)
        db.session.merge(this_badge_refset)

    # and finally save it all
    safe_commit(db)
def get_pub_from_biblio(biblio, force_refresh=False):
    my_pub = lookup_product_in_db(**biblio)
    if not my_pub:
        my_pub = build_publication(**biblio)

    if force_refresh or not my_pub.evidence:
        my_pub.refresh()
        db.session.merge(my_pub)
        safe_commit(db)

    return my_pub
def get_pubs_from_biblio(biblios, force_refresh=False):
    threads = []
    returned_pubs = []
    for biblio in biblios:
        process = Thread(target=thread_result_wrapper,
                         args=[get_pub_from_biblio, (biblio, force_refresh), returned_pubs])
        process.start()
        threads.append(process)

    for process in threads:
        process.join(timeout=30)

    safe_commit(db)
    return returned_pubs
def modify_profile_endpoint(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()

    product_id = request.json["product"]["id"]
    my_product = next(my_product for my_product in my_person.products if my_product.id == product_id)
    url = request.json["product"]["fulltext_url"]
    my_product.set_oa_from_user_supplied_fulltext_url(url)

    my_person.recalculate_openness()

    safe_commit(db)
    return json_resp(my_person.to_dict())
def maint(self, **kwargs):
    if parsed_args.id:
        endpoints = Endpoint.query.filter(Endpoint.id == parsed_args.id).all()
    else:
        # endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response == None, Endpoint.error == None).all()
        endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response == None).all()
    shuffle(endpoints)
    for my_endpoint in endpoints:
        my_endpoint.run_diagnostics()
        db.session.merge(my_endpoint)
        safe_commit(db)
        logger.info(u"merged and committed my_endpoint: {}".format(my_endpoint))
def get_or_make_person(**kwargs):
    res = None

    if 'name' in kwargs and kwargs["name"] == "UNKNOWN":
        # pypi sets unknown people to have the name "UNKNOWN"
        # we don't want to make tons of these, it's just one 'person'.
        res = db.session.query(Person).filter(Person.name == "UNKNOWN").first()

    if 'name' in kwargs and kwargs["name"] == "ORPHANED":
        # cran sets this when the maintainer is gone.
        # we don't want to make tons of these, it's just one 'person'.
        res = db.session.query(Person).filter(Person.name == "ORPHANED").first()

    if res is not None:
        return res

    or_filters = []

    if "github_login" in kwargs and kwargs["github_login"]:
        or_filters.append(Person.github_login == kwargs["github_login"])
    elif "email" in kwargs and kwargs["email"]:
        or_filters.append(Person.email == kwargs["email"])
    elif "name" in kwargs and kwargs["name"]:
        incoming_parsed_name = HumanName(kwargs["name"])
        dict_for_matching = {
            "first": incoming_parsed_name.first,
            "last": incoming_parsed_name.last
        }
        or_filters.append(Person.parsed_name.contains(dict_for_matching))

    if or_filters:
        query = db.session.query(Person).filter(or_(*or_filters))
        persons = query.all()
        res = find_best_match(persons, **kwargs)

    if res is not None:
        return res
    else:
        print u"minting a new person using {}".format(kwargs)
        new_person = force_make_person(**kwargs)

        # need this commit to handle matching people added previously in this chunk
        db.session.add(new_person)
        safe_commit(db)
        return new_person
def add_all_new_packages(package_class):
    all_current_package_id_rows = db.session.query(package_class.id).all()
    all_current_package_ids = [row[0] for row in all_current_package_id_rows]

    all_names = package_class.get_all_live_package_names()

    for package_name in all_names:
        new_package = package_class(project_name=package_name)
        if new_package.id not in all_current_package_ids:
            print "\n\nadded new package:", new_package.id
            # new_package.refresh()
            db.session.add(new_package)
            safe_commit(db)

    print len(all_names)
def delete_person(orcid_id):
    Person.query.filter_by(orcid_id=orcid_id).delete()
    badge.Badge.query.filter_by(orcid_id=orcid_id).delete()
    product.Product.query.filter_by(orcid_id=orcid_id).delete()

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
def set_person_email(orcid_id, email, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.email = email
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
def add_pmh_record(self, **kwargs): endpoint_id = kwargs.get("id", None) record_id = kwargs.get("recordid") my_repo = Endpoint.query.get(endpoint_id) print "my_repo", my_repo my_pmh_record = my_repo.get_pmh_record(record_id) my_pmh_record.mint_pages() # for my_page in my_pmh_record.pages: # print "my_page", my_page # my_page.scrape() my_pmh_record.delete_old_record() db.session.merge(my_pmh_record) # print my_pmh_record.pages safe_commit(db)
def pull_from_orcid(orcid_id, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.refresh(refsets, high_priority=high_priority)
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
def save_openness_log(my_person):
    # make a new log
    new_openness_log = LogOpenness()
    new_openness_log.set_openness_columns(my_person)

    # see if we already have a log the same as this. if so, nothing to do, return.
    q = LogOpenness.query.filter_by(orcid_id=my_person.orcid_id).order_by(LogOpenness.created.desc())
    most_recent_log = q.first()
    if most_recent_log:
        if new_openness_log.has_same_openness(most_recent_log):
            print u"no new openness to log for {}".format(my_person.orcid_id)
            return

    # nope! is worth logging. finish adding attributes and store in db
    new_openness_log.id = shortuuid.uuid()[0:10]
    new_openness_log.created = datetime.datetime.utcnow().isoformat()
    new_openness_log.orcid_id = my_person.orcid_id
    db.session.add(new_openness_log)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on new_openness_log {}".format(new_openness_log.orcid_id)
    print u"logged new openness for {}".format(my_person.orcid_id)
    return
def link_twitter(orcid_id, twitter_creds):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.twitter_creds = twitter_creds

    oauth = OAuth1Session(
        os.getenv('TWITTER_CONSUMER_KEY'),
        client_secret=os.getenv('TWITTER_CONSUMER_SECRET'),
        resource_owner_key=twitter_creds["oauth_token"],
        resource_owner_secret=twitter_creds["oauth_token_secret"]
    )
    url = "https://api.twitter.com/1.1/account/verify_credentials.json?include_email=true"
    r = oauth.get(url)
    full_twitter_profile = r.json()
    # print "we got this back from Twitter!", full_twitter_profile

    full_twitter_profile.update(twitter_creds)
    my_person.twitter_creds = full_twitter_profile
    if my_person.email is None:
        my_person.email = full_twitter_profile["email"]
    my_person.twitter = full_twitter_profile["screen_name"]

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)

    return my_person
def load_campaign(filename, campaign=None, limit=None):
    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")

    print "found {} ORCID lines".format(len(lines))
    print len(lines)

    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None
        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, high_priority=False)

        my_person.campaign = campaign
        my_person.email = email
        my_person.twitter = twitter
        db.session.merge(my_person)
        commit_success = safe_commit(db)
        if not commit_success:
            print u"COMMIT fail on {}".format(my_person.orcid_id)
        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
def _copy_raw_to_s3(self, filename, package_id, num_rows=None, error=None, error_details=None):
    if u"." in filename:
        suffix = u".{}".format(filename.split(u".")[-1])
    else:
        suffix = u""

    object_name = "{}_{}{}".format(package_id, self.file_type_label(), suffix)
    bucket_name = self._raw_s3_bucket()
    s3_client.upload_file(filename, bucket_name, object_name)

    with get_db_cursor() as cursor:
        command = "delete from jump_raw_file_upload_object where package_id = '{}' and file = '{}'".format(
            package_id, self.file_type_label())
        cursor.execute(command)

    if error and not error_details:
        error_details_dict = {
            "no_useable_rows": "No usable rows found.",
            "error_reading_file": "Error reading this file. Try opening this file, save in .xlsx format, and upload that."
        }
        error_details = error_details_dict.get(
            error,
            "Error processing file. Please email this file to [email protected] so the Unsub team can look into the problem."
        )

    new_object = RawFileUploadObject(
        package_id=package_id,
        file=self.file_type_label(),
        bucket_name=bucket_name,
        object_name=object_name,
        num_rows=num_rows,
        error=error,
        error_details=error_details)

    db.session.add(new_object)
    safe_commit(db)

    return "s3://{}/{}".format(bucket_name, object_name)
def refresh_profile(orcid_id, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.refresh(high_priority=high_priority)
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
    return my_person
def update_refsets():
    print u"getting the badge percentile refsets...."

    refset_list_dict = defaultdict(list)
    q = db.session.query(
        Badge.name,
        Badge.value,
    )
    q = q.filter(Badge.value != None)
    rows = q.all()

    print u"query finished, now set the values in the lists"

    for row in rows:
        if row[1]:
            refset_list_dict[row[0]].append(row[1])

    num_in_refset = num_people_in_db()

    for name, unsorted_values in refset_list_dict.iteritems():
        print u"refreshing refset {}".format(name)

        assigner = get_badge_assigner(name)
        if assigner.pad_percentiles_with_zeros:
            # pad with zeros for all the people who didn't get the badge
            unsorted_values.extend([0] * (num_in_refset - len(unsorted_values)))

        # now sort
        # for testing!!!
        refset_list_dict[name] = sorted(unsorted_values)
        # refset_list_dict[name] = sorted(unsorted_values[0:200])

        # now pick out the cutoffs, minimum value at each of 100
        cutoffs = []
        for sublist in chunk_into_n_sublists(refset_list_dict[name], 100):
            sublist_values = sublist
            if sublist_values:
                cutoffs.append(min(sublist_values))

        this_badge_refset = Refset(name=name, cutoffs=cutoffs)
        print u"saving refset {} with cutoffs {}".format(name, cutoffs)
        db.session.merge(this_badge_refset)

    # and finally save it all
    safe_commit(db)
def set_scores(self):
    self.pagerank = 0
    self.num_downloads = 0
    self.num_citations = 0

    for pp in self.get_person_packages():
        # only count up academic packages
        if pp.package.is_academic:
            # only count up impact for packages in our main language
            if pp.package.language == self.main_language:
                if pp.person_package_pagerank:
                    self.pagerank += pp.person_package_pagerank
                if pp.person_package_num_downloads:
                    self.num_downloads += pp.person_package_num_downloads
                if pp.person_package_num_citations:
                    self.num_citations += pp.person_package_num_citations

    safe_commit(db)
def make_person(orcid_id, high_priority=False):
    my_person = Person(orcid_id=orcid_id)
    db.session.add(my_person)
    print u"\nmade new person for {}".format(orcid_id)

    my_person.refresh(refsets, high_priority=high_priority)

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
    return my_person
def add_pmh_record(self, **kwargs):
    endpoint_id = kwargs.get("id", None)
    record_id = kwargs.get("recordid")
    my_repo = Endpoint.query.get(endpoint_id)
    print "my_repo", my_repo
    my_pmh_record = my_repo.get_pmh_record(record_id)
    print "my_pmh_record", my_pmh_record

    my_pmh_record.mint_pages()

    # for my_page in my_pmh_record.pages:
    #     print "my_page", my_page
    #     my_page.scrape()

    db.session.merge(my_pmh_record)
    # print my_pmh_record.pages

    safe_commit(db)
def worker_run(self, **kwargs):
    chunk_size = kwargs.get("chunk", 100)
    limit = kwargs.get("limit", None)
    queue_no = kwargs.get("queue", 0)

    if limit is None:
        limit = float("inf")

    index = 0
    num_updated = 0
    start_time = time()

    while num_updated < limit:
        new_loop_start_time = time()

        objects = self.fetch_queue_chunk(chunk_size, queue_no)

        if not objects:
            sleep(5)
            continue

        for o in objects:
            o.refresh()

        finish_batch_text = u'''
            update {queue_table}
            set finished = now(), started = null, priority = null
            where id = any(:ids)'''.format(queue_table=self.table_name(None))

        finish_batch_command = text(finish_batch_text).bindparams(
            ids=[o.id for o in objects])

        db.session.execute(finish_batch_command)

        commit_start_time = time()
        safe_commit(db) or logger.info(u"COMMIT fail")
        logger.info(u"commit took {} seconds".format(elapsed(commit_start_time, 2)))

        index += 1
        num_updated += chunk_size
        self.print_update(new_loop_start_time, len(objects), limit, start_time, index)
def run(retry_apis):
    start = time()

    journal_ids = db.session.query(Journal.issn_l).filter(
        or_(
            missing_field_filter(Journal.api_raw_crossref, retry_apis),
            missing_field_filter(Journal.api_raw_issn, retry_apis),
        )
    ).all()

    logger.info('trying to update {} journals'.format(len(journal_ids)))

    chunk_size = 50
    for i in range(0, len(journal_ids), chunk_size):
        id_chunk = journal_ids[i:i + chunk_size]
        journals = Journal.query.filter(Journal.issn_l.in_(id_chunk)).all()

        for journal in journals:
            # try all issns, issn-l first
            issns = set(journal.issns)
            issns.discard(journal.issn_l)
            issns = [journal.issn_l] + list(issns)

            if journal.api_raw_crossref is None or (retry_apis and journal.api_raw_crossref == {}):
                logger.info('getting crossref api response for {}'.format(journal.issn_l))
                journal.api_raw_crossref = get_first_response(call_crossref_api, issns) or {}

            if journal.api_raw_issn is None or (retry_apis and journal.api_raw_issn == {}):
                logger.info('getting issn api response for {}'.format(journal.issn_l))
                journal.api_raw_issn = get_first_response(call_issn_api, issns) or {}

            db.session.merge(journal)

        safe_commit(db)
        db.session.remove()

    logger.info('finished update in {}'.format(timedelta(seconds=elapsed(start))))
def load_campaign(filename, campaign=None, limit=None):
    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")

    print "found {} ORCID lines".format(len(lines))
    print len(lines)

    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None
        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, store_in_db=True)

        my_person.campaign = campaign
        my_person.email = email
        my_person.twitter = twitter
        db.session.merge(my_person)
        commit_success = safe_commit(db)
        if not commit_success:
            print u"COMMIT fail on {}".format(my_person.orcid_id)
        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
def save_email(orcid_id, contents):
    email = LogEmail()
    email.id = shortuuid.uuid()[0:10]
    email.sent = datetime.datetime.utcnow().isoformat()
    email.orcid_id = orcid_id
    email.contents = dict(contents)
    db.session.add(email)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on email {}".format(email.orcid_id)
def save_email(orcid_id, contents):
    email = Email()
    email.id = shortuuid.uuid()[0:10]
    email.sent = datetime.datetime.utcnow().isoformat()
    email.orcid_id = orcid_id
    email.contents = dict(contents)
    db.session.add(email)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on email {}".format(email.orcid_id)
def tweeted_quickly(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()

    if not my_person:
        print u"returning 404: orcid profile {} does not exist".format(orcid_id)
        abort_json(404, "That ORCID profile doesn't exist")

    my_person.tweeted_quickly = True
    success = safe_commit(db)
    return json_resp({"resp": "success"})
def delete_person(orcid_id):
    # also need to delete all the badges and products
    product.Product.query.filter_by(orcid_id=orcid_id).delete()
    badge.Badge.query.filter_by(orcid_id=orcid_id).delete()

    # and now delete the person. have to do this after deleting the stuff above.
    Person.query.filter_by(orcid_id=orcid_id).delete()

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
def update_fn(self, cls, method_name, objects, index=1):

    # we are in a fork! dispose of our engine.
    # will get a new one automatically
    # if is pooling, need to do .dispose() instead
    db.engine.dispose()

    start = time()
    num_obj_rows = len(objects)

    # logger.info(u"{pid} {repr}.{method_name}() got {num_obj_rows} objects in {elapsed} seconds".format(
    #     pid=os.getpid(),
    #     repr=cls.__name__,
    #     method_name=method_name,
    #     num_obj_rows=num_obj_rows,
    #     elapsed=elapsed(start)
    # ))

    for count, obj in enumerate(objects):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        # logger.info(u"***")
        logger.info(u"*** #{count} starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows * index),
            repr=obj,
            method_name=method_name
        ))

        method_to_run()

        logger.info(u"finished {repr}.{method_name}(). took {elapsed} seconds".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        ))

        # for handling the queue
        if not (method_name == "update" and obj.__class__.__name__ == "Pub"):
            obj.finished = datetime.datetime.utcnow().isoformat()
        # db.session.merge(obj)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))

    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
def save_repo_request_rows(rows):

    with open('out.csv', 'wb') as f:

        w = csv.DictWriter(f, fieldnames=RepoRequest.list_fieldnames(), encoding='utf-8-sig')

        for row in rows[1:]:  # skip header row
            my_repo_request = RepoRequest()
            my_repo_request.set_id_seed(row[0])
            column_num = 0
            for fieldname in RepoRequest.list_fieldnames():
                if fieldname != "id":
                    setattr(my_repo_request, fieldname, row[column_num])
                    column_num += 1

            w.writerow(my_repo_request.to_dict())
            print u"adding repo request {}".format(my_repo_request)
            db.session.merge(my_repo_request)

        safe_commit(db)
def make_person(dirty_orcid_id, high_priority=False):
    orcid_id = clean_orcid(dirty_orcid_id)
    my_person = Person(orcid_id=orcid_id)
    db.session.add(my_person)
    print u"\nin make_person: made new person for {}".format(orcid_id)

    my_person.refresh(high_priority=high_priority)

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)

    if my_person.invalid_orcid:
        raise OrcidDoesNotExist

    return my_person
def update_fn(cls, method_name, obj_id_list, shortcut_data=None, index=1):

    # we are in a fork! dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).options(orm.undefer('*')).filter(cls.id.in_(obj_id_list))
    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for count, obj in enumerate(obj_rows):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\n***\n{count}: starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows * index),
            repr=obj,
            method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

        commit_success = safe_commit(db)
        if not commit_success:
            print u"COMMIT fail"

    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
def create_person(dirty_orcid, campaign=None):
    try:
        orcid_id = clean_orcid(dirty_orcid)
    except NoOrcidException:
        print u"\n\nWARNING: no valid orcid_id in {}; skipping\n\n".format(dirty_orcid)
        raise

    my_person = add_or_overwrite_person_from_orcid_id(orcid_id, high_priority=False)
    if campaign:
        my_person.campaign = campaign

    db.session.add(my_person)
    success = safe_commit(db)
    if not success:
        print u"ERROR! committing {}".format(my_person.orcid_id)
def just_add_twitter(filename, limit=None, create=True):
    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")

    print "found {} ORCID lines".format(len(lines))

    if limit:
        lines = lines[:limit]

    total_start = time()
    for line in lines:
        loop_start = time()
        email = None
        twitter = None
        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        if twitter:
            twitter = twitter.replace("@", "")

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(orcid_id)
            print u"added twitter {} to {}".format(twitter, orcid_id)
        else:
            print u"no person found with id {}".format(orcid_id)

    print "loaded {} profiles in {}s\n".format(len(lines), elapsed(total_start))
def create_person(dirty_orcid, campaign=None, store_in_db=False):
    try:
        orcid_id = clean_orcid(dirty_orcid)
    except NoOrcidException:
        print u"\n\nWARNING: no valid orcid_id in {}; skipping\n\n".format(dirty_orcid)
        raise

    if store_in_db:
        print u"storing in db"
        my_person = make_person(orcid_id, store_in_db=True)
        if campaign:
            my_person.campaign = campaign
        db.session.add(my_person)
        success = safe_commit(db)
        if not success:
            print u"ERROR! committing {}".format(my_person.orcid_id)
    else:
        print u"NOT storing in db"
        my_person = make_person(orcid_id, store_in_db=False)
        print my_person
def add_or_overwrite_person_from_orcid_id(orcid_id, high_priority=False):
    # if one already there, use it and overwrite. else make a new one.
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    if my_person:
        db.session.merge(my_person)
        print u"\nusing already made person for {}".format(orcid_id)
    else:
        # make a person with this orcid_id
        my_person = Person(orcid_id=orcid_id)
        db.session.add(my_person)
        print u"\nmade new person for {}".format(orcid_id)

    my_person.refresh(high_priority=high_priority)

    # now write to the db
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(my_person.orcid_id)
    return my_person
def refresh_fulltext(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.recalculate_openness()
    safe_commit(db)
    return json_resp(my_person.to_dict())
def get_unpaywall_events(self, rows=100):
    insights_client = geoip2.webservice.Client(
        os.getenv("MAXMIND_CLIENT_ID"), os.getenv("MAXMIND_API_KEY"))

    tar_gz_filename = "today-{}.tsv.gz".format(self.first_day)

    execute("rm {}".format(tar_gz_filename), check=False)  # clear it if there is already one there

    command_template = """curl --no-include -o {} -L -H "X-Papertrail-Token: {}" https://papertrailapp.com/api/v1/archives/{}/download"""
    command = command_template.format(tar_gz_filename, os.getenv("PAPERTRAIL_API_KEY"), self.first_day)
    execute(command)

    if execute("ls -lh {}".format(tar_gz_filename), check=False):
        execute("zgrep [email protected] {} > unpaywall_events.txt".format(tar_gz_filename), capture=True, check=False)
    else:
        # no file. get the files for all the hours instead
        execute("rm unpaywall_events.txt", check=False)  # clear it if there is already one there, because appending
        for hour in range(24):
            day_with_hour = "{}-{:02d}".format(self.first_day, hour)
            command = command_template.format(tar_gz_filename, os.getenv("PAPERTRAIL_API_KEY"), day_with_hour)
            execute(command)
            execute("zgrep [email protected] {} >> unpaywall_events.txt".format(tar_gz_filename), capture=True, check=False)

    # writing into database
    fh = open("unpaywall_events.txt", "r")

    if execute("ls -lh unpaywall_events.txt", check=False):
        num_this_loop = 0

        for line in fh:
            # only keep lines that are the right kind of log lines
            if line and not (u"[email protected]" in line
                             and u'\toadoi\t' in line
                             and u'\theroku/router\t' in line
                             and u'at=info method=GET path="/10' in line):
                continue

            columns = line.split("\t")
            collected = columns[1]
            if not collected.startswith("20"):
                # not a valid timestamp, skip this line
                continue

            # at=info method=GET path="/[email protected]" host=api.oadoi.org request_id=7ae3022c-0dcd-44b7-ae7e-a888d8843d4f fwd="70.666.777.999" dyno=web.6 connect=1ms service=40ms status=200 bytes=774 protocol=https \n
            try:
                doi = re.findall('path="/(.*)\[email protected]', line)[0]
                doi = doi.lower()
                id = re.findall('request_id=(.*?) ', line)[0]
                ip = re.findall('fwd="(.*)"', line)[0]
            except IndexError:
                # skip this line, it doesn't have a doi or ip or whatever, continue to next line
                continue

            # print collected, doi, ip, id
            unpaywall_obj = UnpaywallEvent(doi=doi, ip=ip, collected=collected)
            db.session.merge(unpaywall_obj)

            insights = IpInsights.query.filter(IpInsights.ip == ip).first()
            if not insights:
                try:
                    response_insights = insights_client.insights(ip)
                except ValueError:
                    # this is what it throws if bad ip address
                    response_insights = None

                if response_insights:
                    insight_dict = response_insights.raw
                    for key in ["city", "country", "continent", "registered_country"]:
                        if key in insight_dict and "names" in insight_dict[key]:
                            insight_dict[key]["name"] = insight_dict[key]["names"]["en"]
                            del insight_dict[key]["names"]
                    for key in ["subdivisions"]:
                        if key in insight_dict:
                            my_list = []
                            for item in insight_dict[key]:
                                if "names" in item:
                                    item["name"] = item["names"]["en"]
                                    del item["names"]
                                my_list.append(item)
                            insight_dict[key] = my_list

                    insights = IpInsights(ip=ip, insights=insight_dict)
                    db.session.merge(insights)

            num_this_loop += 1

            if num_this_loop > rows:
                logger.info(u"committing")
                safe_commit(db)
                num_this_loop = 0

    logger.info(u"done everything, saving last ones")
    safe_commit(db)
def run_through_dois(filename=None, reverse=None, loggly=False):
    total_start = time()
    i = 0
    output_dicts = []

    fh = open(filename, "r")
    lines = fh.readlines()

    if reverse:
        logger.info(u"reverse!")
        lines.reverse()
        i = -1 * len(lines)

    dois = []
    for line in lines:
        dois.append(line.strip())
        # line = line.replace('"', '')
        # if u"," in line:
        #     split_line = line.split(",")
        #     if loggly:
        #         dois.append(split_line[1])
        #     else:
        #         dois.append(split_line[0])
        # else:
        #     dois.append(line.strip())

    # deduplicate, preserving order
    duplicated_dois = dois
    dois = []
    for doi in duplicated_dois:
        if doi not in dois:
            dois.append(doi)

    logger.info(u"length of deduped doi list: {}".format(len(dois)))

    for doi in dois:
        try:
            my_doi = clean_doi(doi)
        except NoDoiException:
            logger.info(u"bad doi: {}".format(doi))
            continue

        if not my_doi:
            logger.info(u"bad doi: {}".format(doi))
            continue

        my_pub = Oab.query.get(my_doi)
        if not my_pub:
            my_pub = Oab()
            db.session.add(my_pub)
        my_pub.id = my_doi

        my_doi_url = "http://doi.org/{}".format(my_doi)
        my_doi_url_encoded = urllib.quote_plus(my_doi_url)

        api_url = "https://api.openaccessbutton.org/availability?url={}".format(my_doi_url_encoded)
        headers = {"content-type": "application/json"}
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success with oab! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.api = r.json()
            flag_modified(my_pub, "api")
        else:
            logger.info(u"problem with oab, status_code {}".format(r.status_code))

        dissemin_url = "http://dissem.in/api/{}".format(my_doi)
        r = requests.get(dissemin_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success! with dissemin! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.dissemin = r.json()
            flag_modified(my_pub, "dissemin")
        else:
            logger.info(u"problem with dissemin, status_code {}".format(r.status_code))

        safe_commit(db)
        i += 1
        logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start, 2)))

    fh.close()
def call_pmh_endpoint(self, first=None, last=None, chunk_size=50, scrape=False):
    start_time = time()
    records_to_save = []
    num_records_updated = 0
    loop_counter = 0
    self.error = None

    (pmh_input_record, pmh_records, error) = self.get_pmh_input_record(first, last)

    if error:
        self.error = u"error in get_pmh_input_record: {}".format(error)
        return

    while pmh_input_record:
        loop_counter += 1
        # create the record
        my_pmh_record = pmh_record.PmhRecord()

        # set its vars
        my_pmh_record.repo_id = self.id_old  # delete once endpoint_ids are all populated
        my_pmh_record.endpoint_id = self.id
        my_pmh_record.rand = random()
        my_pmh_record.populate(pmh_input_record)

        if is_complete(my_pmh_record):
            my_pages = my_pmh_record.mint_pages()
            my_pmh_record.pages = my_pages
            # logger.info(u"made {} pages for id {}: {}".format(len(my_pages), my_pmh_record.id, [p.url for p in my_pages]))
            if scrape:
                for my_page in my_pages:
                    my_page.scrape_if_matches_pub()
            records_to_save.append(my_pmh_record)
            db.session.merge(my_pmh_record)
            # logger.info(u"my_pmh_record {}".format(my_pmh_record))
        else:
            logger.info(u"pmh record is not complete")
            # print my_pmh_record
            pass

        if len(records_to_save) >= chunk_size:
            num_records_updated += len(records_to_save)
            last_record = records_to_save[-1]
            # logger.info(u"last record saved: {} for {}".format(last_record.id, self.id))
            safe_commit(db)
            records_to_save = []

        if loop_counter % 100 == 0:
            logger.info(u"iterated through 100 more items, loop_counter={} for {}".format(loop_counter, self.id))

        pmh_input_record = self.safe_get_next_record(pmh_records)

    # make sure to get the last ones
    if records_to_save:
        num_records_updated += len(records_to_save)
        last_record = records_to_save[-1]
        logger.info(u"saving {} last ones, last record saved: {} for {}, loop_counter={}".format(
            len(records_to_save), last_record.id, self.id, loop_counter))
        safe_commit(db)
    else:
        logger.info(u"finished loop, but no records to save, loop_counter={}".format(loop_counter))

    # if num_records_updated > 0:
    if True:
        logger.info(u"updated {} PMH records for endpoint_id={}, took {} seconds".format(
            num_records_updated, self.id, elapsed(start_time, 2)))
def set_person_claimed_at(my_person):
    my_person.claimed_at = datetime.datetime.utcnow().isoformat()
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(my_person.orcid_id)
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue

        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))

        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None

        while total_results == None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for up to 6 minutes
            except Exception, e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [
                    id_tuple[0] for id_tuple in
                    db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()
                ]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)