def run(self):
    """ Generates data and writes it into the :py:meth:`~.Streams.output` target. """
    cur_year = self.yr
    cur_query = self.qr
    run_query = cur_query + ' AND ( PUBYEAR = ' + str(cur_year) + ') '
    size = ScopusSearch(run_query, refresh=True, download=False).get_results_size()
    if size > 10000:
        print('scopus query with over 10k records running, careful')
    df = pd.DataFrame(ScopusSearch(run_query, refresh=True).results)
    fav_fields = [
        'eid', 'creator', 'doi', 'title', 'afid', 'affilname',
        'author_count', 'author_names', 'author_afids', 'coverDate',
        'coverDisplayDate', 'publicationName', 'issn', 'source_id', 'eIssn',
        'citedby_count', 'fund_sponsor', 'aggregationType', 'openaccess',
        'description', 'authkeywords'
    ]
    df = df[fav_fields]  # cut fields
    # drop all empty eids to prevent issues later (to be safe)
    df = df.dropna(axis=0, subset=['eid'], inplace=False)
    # print(len(df))
    df.to_pickle(self.output().path)  # , encoding='utf-8')
def robust_query(q, refresh=False, fields=("eid", "coverDate")):
    """Wrapper function for individual ScopusSearch query."""
    try:
        s = ScopusSearch(q, refresh=refresh, integrity_fields=fields)
        res = s.results
    except (AttributeError, KeyError):
        res = ScopusSearch(q, refresh=True).results
    return res or []
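# Example (hedged, not part of the original snippet): how robust_query might be
# called once pybliometrics is configured; the author ID is illustrative only.
example_pubs = robust_query("AU-ID(7004212771)", refresh=False)
print(f"Retrieved {len(example_pubs)} records")
for pub in example_pubs[:5]:
    # each result is a ScopusSearch namedtuple with fields such as eid and coverDate
    print(pub.eid, pub.coverDate)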
def perform_query(auth_id, refresh=100, fields=["eid", "title"]):
    """Access ScopusSearch API to retrieve EIDs, sources and publication years."""
    q = f"AU-ID({auth_id})"
    try:
        res = ScopusSearch(q, refresh=refresh, integrity_fields=fields).results
        info = parse_publications(res, auth_id)
    except (AttributeError, KeyError, TypeError):
        res = ScopusSearch(q, refresh=True).results
        info = parse_publications(res, auth_id)
    if not info:
        return None, None, None, None, None
    return zip(*info)
def test_find_location():
    auth_id = 6701809842
    pubs = ScopusSearch("AU-ID({})".format(auth_id)).results
    ctry, aid, aff = find_location([str(auth_id)], pubs, 2000, refresh=False)
    assert_equal(ctry, "Germany")
    assert_equal(aid, "60028717")
    assert_equal(aff, "University of Munich")
def test_retrieve_author_info_authoryear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_year"
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    year = 2016
    df2 = pd.DataFrame(expected_auth + search_auth, columns=["auth_id"],
                       dtype="int64")
    df2["year"] = year
    # Insert data
    fill = robust_join(expected_auth, sep=') OR AU-ID(')
    q = f"(AU-ID({fill})) AND PUBYEAR BEF {year+1}"
    d = build_dict(ScopusSearch(q, refresh=refresh).results, expected_auth)
    expected = pd.DataFrame.from_dict(d, orient="index", dtype="int64")
    expected = expected.sort_index().rename_axis('auth_id').reset_index()
    expected["year"] = year
    expected = expected[['auth_id', 'year', 'first_year', 'n_pubs', 'n_coauth']]
    insert_data(expected, conn, table=table)
    # Retrieve data
    incache, missing = retrieve_author_info(df2, conn, table)
    assert_frame_equal(incache, expected)
    assert_equal(missing['auth_id'].tolist(), search_auth)
    assert_equal(missing['year'].tolist(), [year])
def create_obj(params):
    if q_type == "author":
        return AuthorSearch(**params)
    elif q_type == "docs":
        params["integrity_fields"] = fields
        params["view"] = view
        return ScopusSearch(**params)
def test_expand_affiliation():
    pubs = ScopusSearch("AU-ID(6701809842)", refresh=refresh).results
    res = pd.DataFrame(pubs)
    res = expand_affiliation(res)
    assert_true(len(res) >= 180)
    expect_columns = ['source_id', 'author_ids', 'afid']
    assert_equal(set(res.columns), set(expect_columns))
    assert_true(any(res['author_ids'].str.contains(";")))
    assert_true(all(isinstance(x, (int, float)) for x in res['afid'].unique()))
def test_expand_affiliation():
    auth_id = 6701809842
    pubs = ScopusSearch("AU-ID({})".format(auth_id)).results
    res = pd.DataFrame(pubs)
    res = expand_affiliation(res)
    assert_equal(len(res), 185)
    expect_columns = ['source_id', 'author_ids', 'afid']
    assert_equal(set(res.columns.tolist()), set(expect_columns))
    assert_true(any(res.author_ids.str.contains(";")))
    assert_false(any(res.afid.str.contains(";")))
def refine_scopus(docs: DocumentSet, *, search_title=True
                  ) -> Tuple[DocumentSet, DocumentSet]:
    """Attempt to fetch Scopus metadata for each document in the given set.

    Returns a tuple containing two sets: the documents available on Scopus
    and the remaining documents not found on Scopus.

    Documents are retrieved based on their identifier (DOI, Pubmed ID, or
    Scopus ID). Documents without a unique identifier are retrieved by
    performing a fuzzy search based on their title. This is not ideal and
    can lead to false positives (i.e., another document is found having the
    same title), thus it can be disabled if necessary.

    :param search_title: Flag to toggle searching by title.
    """
    from pybliometrics.scopus import ScopusSearch

    def callback(doc):
        id = doc.id
        if isinstance(doc, ScopusDocument):
            return doc

        if doi := id.doi:
            try:
                return ScopusDocument.from_doi(doi)
            except Exception as e:
                logging.warn(f'no document found for DOI {doi}: {e}')
                return None

        title = canonical(id.title)
        if len(title) > 10 and search_title:
            query = f'TITLE({title})'
            response = ScopusSearch(query, view='STANDARD', download=False)
            nresults = response.get_results_size()

            if nresults > 0 and nresults < 10:
                response = ScopusSearch(query, view='STANDARD')

                for record in response.results or []:
                    if canonical(record.title) == title:
                        return ScopusDocument.from_eid(record.eid)

        return None
def execute_search(dump_name, query):
    """Execute a search on Scopus using the Scopus Query Language and print
    brief results. Define the query in advance."""
    t.tic()
    res = ScopusSearch(query)
    query_res = pd.DataFrame(res.results)
    # Select name for pickle data
    query_res.to_pickle('./Scopus_dumps/' + dump_name + '.pkl')
    t.toc('Query and saving DataFrame took ')
def get_data_from_doi(self, doi, title):
    id = None
    affil = None
    pub_name = None
    pub_type = None
    try:
        doi_doc = ScopusSearch(doi, subscriber=False)
        if 'pubmed-id' in doi_doc._json[0].keys():
            id = doi_doc._json[0]["pubmed-id"]
        if 'affiliation' in doi_doc._json[0].keys():
            affil = doi_doc._json[0]['affiliation']
        pub_name = doi_doc._json[0]['prism:publicationName']
        pub_type = doi_doc._json[0]['subtypeDescription']
    except:
        print("failed with scopus")
    if id is None:
        doi_doc = FullDoc(doi=doi)
        if doi_doc.read(self.client):
            # print("doi_doc.title: ", doi_doc.title)
            doi_doc.write()
            pub_name = doi_doc.data['coredata']['prism:publicationName']
            if 'pubType' in doi_doc.data['coredata'].keys():
                pub_type = str(doi_doc.data['coredata']['pubType']).strip()
        else:
            print("Read document failed. no id for doi {}. trying with title"
                  .format(doi))
            doi_doc = None
            # return doi, affil
        id = None
        if doi_doc is None or ('pubmed-id' not in doi_doc._data.keys()):
            print("trying with title")
            # try with title
            Entrez.email = '*****@*****.**'
            if doi_doc is None:
                query = title
            else:
                query = doi_doc.title
            handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
            results = Entrez.read(handle)
            if int(results['Count']) > 0:
                id = results['IdList']
        else:
            id = doi_doc._data['pubmed-id']
    if id is not None:
        return self.fetch_data_from_pubmed(id), affil, pub_name, pub_type
    else:
        print("no pubmed id found for doi {}".format(doi))
        return doi, affil, pub_name, pub_type
def performSearch(self, searchWords):
    # Create Search-String
    # Searching in TITLE-ABStract-KEYwords is the default search mode on scopus
    searchString = 'TITLE-ABS-KEY('
    for i, word in enumerate(searchWords):
        searchString = searchString + word
        if (i != len(searchWords) - 1):
            searchString = searchString + ' AND '
        else:
            # Last item
            searchString = searchString + ')'
    self.searchResult = ScopusSearch(searchString)
    self.searchWords = searchWords
    self.storeResultsInDB()
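# Worked example (illustrative, not from the original snippet): for
# searchWords = ["graphene", "battery"], the loop above builds
# searchString == 'TITLE-ABS-KEY(graphene AND battery)'.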
def search_scopus(query: str, *, limit: int = None) -> DocumentSet:
    """Submit the given query to the Scopus API.

    :param limit: Restrict results to the first `limit` documents.
    """
    from pybliometrics.scopus import ScopusSearch

    search = ScopusSearch(query, view='STANDARD')
    eids = list(search.get_eids())

    docs = []
    if limit is not None and len(eids) > limit:
        random.seed(0)
        random.shuffle(eids)
        eids = eids[:limit]

    for eid in progress_bar(eids):
        doc = ScopusDocument.from_eid(eid)
        docs.append(doc)

    return DocumentSet(docs)
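# Example (hedged, not from the original snippet): a possible call to
# search_scopus; the query string and limit are illustrative, and it is
# assumed here that DocumentSet supports len().
docs = search_scopus('TITLE-ABS-KEY("bibliometrics") AND PUBYEAR IS 2019', limit=25)
print(f"Retrieved {len(docs)} documents")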
def main():
    # Read in
    journals = pd.read_csv(SOURCE_FILE, index_col=0, encoding="utf8")

    # Get article information
    print(">>> Querying publications for:")
    d = []
    for idx, row in journals.iterrows():
        print("...", idx)
        for year in range(YEARS[0], YEARS[1]+1):
            q = f'SOURCE-ID({row.source_id}) AND PUBYEAR IS {year}'
            s = ScopusSearch(q, refresh=30)
            for pub in s.results:
                if pub.subtype not in DOCTYPES:
                    continue
                s = parse_abstract(pub)
                s["journal"] = row.Abbreviation
                d.append(s)
    print(f">>> Found {len(d):,} publications")

    # Turn to DataFrame
    df = pd.DataFrame.from_records(d)
    print(">>> Correcting some titles")
    repl = {"&amp;": "&", "<sup>": "", "</sup>": "", "<inf>": "", "</inf>": ""}
    for old, new in repl.items():
        df['title'] = df['title'].str.replace(old, new)
    df['simple_title'] = df['title'].apply(standardize).str.upper()
    df['top'] = df['journal'].isin(TOP_JOURNALS)*1

    # Add citation counts of reprints to original paper
    print(">>> Dropping reprints and duplicates")
    df = df.sort_values(['simple_title', 'year'])
    grouped = df.groupby('simple_title')
    left = grouped[[c for c in df.columns if "cit" not in c]].first()
    right = grouped[[c for c in df.columns if "cit" in c]].sum(min_count=1)
    df = pd.concat([left, right], axis=1)

    # Write out
    print(f">>> Saving {df.shape[0]:,} observations")
    df.set_index('simple_title').to_csv(TARGET_FILE, encoding="utf8")
def test_author_year_in_cache():
    create_cache(drop=True, file=test_cache)
    # Variables
    expected_auth = ["53164702100", "57197093438"]
    search_auth = ["55317901900"]
    year = 2016
    # Test empty cache
    df1 = pd.DataFrame(expected_auth, columns=["auth_id"], dtype="int64")
    df1["year"] = year
    auth_y_incache, auth_y_search = author_year_in_cache(df1, file=test_cache)
    assert_frame_equal(auth_y_search, df1)
    assert_equal(len(auth_y_incache), 0)
    # Test partial retrieval
    fill = ') OR AU-ID('.join([str(a) for a in expected_auth])
    q = "(AU-ID({})) AND PUBYEAR BEF {}".format(fill, year+1)
    res = build_dict(ScopusSearch(q).results, expected_auth)
    res = pd.DataFrame.from_dict(res, orient="index", dtype="int64")
    res["year"] = year
    cols = ["year", "first_year", "n_pubs", "n_coauth"]
    res = res[cols].reset_index().rename(columns={"index": "auth_id"})
    cache_insert(res, table="author_year", file=test_cache)
    df2 = pd.DataFrame(expected_auth + search_auth, columns=["auth_id"],
                       dtype="int64")
    df2["year"] = year
    auth_y_incache, auth_y_search = author_year_in_cache(df2, file=test_cache)
    expected_auth = [int(au) for au in expected_auth]
    search_auth = [int(au) for au in search_auth]
    assert_equal(sorted(auth_y_incache.auth_id.tolist()), expected_auth)
    assert_equal(auth_y_incache.year.tolist(), [year, year])
    assert_equal(auth_y_search.auth_id.tolist(), search_auth)
    assert_equal(auth_y_search.year.tolist(), [year])
    # Test full retrieval
    auth_year_incache, auth_year_search = author_year_in_cache(df1, file=test_cache)
    assert_equal(sorted(auth_year_incache.auth_id.tolist()), expected_auth)
    assert_equal(auth_year_incache.year.tolist(), [year, year])
    assert_true(auth_year_search.empty)
def query_scopus_by_doi(doi, verbose=True):
    """
    Get Scopus records by paper DOI.

    :param doi: (str) DOI of a paper
    :param verbose: (bool) print diagnosis message or not
    :return: (dict) result from the Scopus API
    """
    # goal
    scopus_results = None
    # query scopus
    query_results = ScopusSearch('DOI({})'.format(doi), max_entries=None,
                                 cursor=True)
    # filter out empty query results
    if query_results.results is not None:
        scopus_results = query_results.results[0]._asdict()
    else:
        warnings.warn(
            'Empty result from scopus when searching doi: {}'.format(doi))
    return scopus_results
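# Example (hedged, not from the original snippet): a possible call to
# query_scopus_by_doi; the DOI below is only illustrative. The returned dict
# mirrors one ScopusSearch result row, so keys such as 'eid' and 'title' apply.
record = query_scopus_by_doi("10.1016/j.softx.2019.100263")
if record is not None:
    print(record["eid"], record["title"])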
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import AuthorRetrieval
from lxml import etree as et
import os
import requests
from config import dcmappings

saf_root_directory = 'saf'
science_direct_base_url = 'https://api.elsevier.com/content/article/doi/'
apiKey = os.environ['SCOPUS_API_KEY']
scopus_search_string = os.environ['SCOPUS_SEARCH_STRING']

s = ScopusSearch(scopus_search_string, refresh=True, view='COMPLETE')
print(s.get_results_size())
eids = s.get_eids()
counter = 0

orcid_mapping = {
    'schema': 'local',
    'attributes': {
        'element': 'contributor',
        'qualifier': 'author_orcid_id'
    }
}


def GetOrcidFromScopusID(scopus_id):
    try:
def main():
    s = ScopusSearch('ISSN ( 0022-3514 )')
    print(s.get_results_size())
#
# harvester
# warning: the code uses 'today' so you have to run it before midnight
#
# harvest from ScopusSearch everything from VU+VUMC of today
# because the API has issues with a direct 'today' command we instead take the
# entire month and isolate today
#
# prepare query
VU_with_VUMC_affid = "( AF-ID(60001997) OR AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
my_query = VU_with_VUMC_affid + ' AND ' + "PUBDATETXT( " + get_today_for_pubdatetxt() + " )"  # RECENT(1) is somehow very slow
print(my_query)
#
# call the scopussearch API
s = ScopusSearch(my_query, refresh=True, download=True)
df = pd.DataFrame(s.results)
#
# filter to records of today
today = get_today()
df = df[df.coverDate == today]
#
# here is the result (may be empty on some days)
### df

# pre-processing aspect
# we need to add extra sources, clean it, rename columns and make it ready for push
# this is a static copy, and you should migrate processing to pycharm and call that from here

# Add info on year and month
df = add_year_and_month(df, 'coverDate')  # add info columns
df_dois = df_dois.dropna()
bits = 10  # not dynamic yet
stepsize = int(np.ceil(len(df_dois) / bits) + 1)
df_total = pd.DataFrame()
for cur_bit in np.arange(0, bits):
    print('-------')
    print(cur_bit)
    df_dois_CUR = df_dois.iloc[stepsize * cur_bit:stepsize * (cur_bit + 1), :]
    doi_list_cur = df_dois_CUR['DOI'].to_list()
    cur_query = "DOI( " + " ) OR DOI( ".join(doi_list_cur) + " ) "
    if len(df_dois_CUR) > 0:
        t0 = time.time()
        fullres = pd.DataFrame(
            ScopusSearch(cur_query, download=True, refresh=True).results)
        t1 = time.time()
        print(t1 - t0)
        df_total = df_total.append(fullres)

# backmerge it first
df_total = df_total.drop_duplicates(subset='doi')
df_export = df_orig.merge(df_total, left_on='DOI', right_on='doi', how='left')
df_export.to_csv(PATH_START_PERSONAL + 'arjan.csv')
df_export.to_excel(PATH_START_PERSONAL + 'arjan.xlsx')
def get_scopus_arm(
        MY_YEARSET,
        start_path_with_slash,
        xpure_pack,
        df_in=None,  # there is no df_in (!)
        do_save=False):
    """
    get_scopus_arm is a refactor of the legacy scopus code from jupyter

    careful: we also have a luigi-variant of this!
    so you should use the luigi variant whenever possible
    I made this here because I needed an exact copy for a quick sprint

    so what does it do?
    it harvests scopus and then enriches it with unpaywall, deals, etc
    it also renames columns and deletes columns

    Use the assumption that MY_YEARSET is always 3 years
    Once we get this in Luigi it will work better than arbitrary length sets
    because this is an ATOMIC split of work and works well concurrently
    luigi will always skip parts if they already exist
    you do have to put it well in luigi: this function will be 2 pipe-types
    type-1 will do 1 year only
    type-2 will combine 3 years only
    and that is all you need, because the entire pure arm is for 1 chosen year
    but can be easily extended to do multiple chosen years efficiently
    """
    xpure_pack  # is not used right now, but OK

    dict_output = {}
    for MY_YEAR in MY_YEARSET:
        print(MY_YEAR)

        # settings
        # testing
        override_query_for_testing = False
        running_on_server = False

        # paths
        if running_on_server:
            path_deals = 'C:/Users/yasing/Desktop/oa oktober/apcdeals.csv'  # check
            path_isn = 'C:/Users/yasing/Desktop/oa oktober/ISN_ISSN.csv'  # check
            path_org = 'C:/Users/yasing/Desktop/oa oktober/vu_organogram_2.xlsx'  # check
            path_out = start_path_with_slash  # 'C:/Users/yasing/Desktop/oa oktober/'  # check
            path_vsnu_afids = 'C:/Users/yasing/Desktop/oa oktober/afids_vsnu_nonfin.csv'  # check
        else:
            path_deals = r'G:\UBVU\Data_RI\raw data algemeen\apcdeals.csv'
            path_isn = r'G:\UBVU\Data_RI\raw data algemeen\ISN_ISSN.csv'
            path_org = r'G:\UBVU\Data_RI\raw data algemeen\vu_organogram_2.xlsx'
            path_out = start_path_with_slash  # 'C:/Users/yasin/Desktop/oa new csv/'  # no r
            path_vsnu_afids = r'G:\UBVU\Data_RI\raw data algemeen\afids_vsnu_nonfin.csv'

        # scopus search and affiliation
        #
        # ! VUMC HAS BEEN ADDED !
        #
        chosen_affid = [
            "60008734", "60029124", "60012443", "60109852", "60026698",
            "60013779", "60032886", "60000614", "60030550", "60013243",
            "60026220", "60001997"
        ]  # I added 60001997 and thus I added VUMC
        #VU_noMC_affid = "(AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
        VU_with_VUMC_affid = "( AF-ID(60001997) OR AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"

        my_query = VU_with_VUMC_affid + ' AND ' + "( PUBYEAR = " + str(MY_YEAR) + " )"  ### "PUBDATETXT(February 2018)"  # TITLE(TENSOR) AND

        # corresponding author
        vu_afids = chosen_affid
        # this is vsnu w/o phtu and such (borrowed from VSNU-SDG-data), but should
        # approach the UKB list... good for now. update later.
        all_vsnu_sdg_afids = pd.read_csv(path_vsnu_afids).iloc[:, 1].astype(
            'str').to_list()

        # testing
        if override_query_for_testing:
            my_query = 'TITLE(TENSOR LPV)'
            print('overriding query for testing')

        # ETLMIG MIGRATION DONE

        # helper functions
        # ! CAREFUL! COPIED CODE
        def fn_cats(row):
            if row == 'closed':
                result = 1
            elif row == 'hybrid':
                result = 2
            elif row == 'bronze':
                result = 3
            elif row == 'green':
                result = 4
            elif row == 'gold':
                result = 5
            else:
                result = 0  # nans etc
            return result

        # entire pipeline
        # Perform ScopusSearch
        s = ScopusSearch(my_query, refresh=True)  # (VU_aff + " AND " + recent, refresh=True)
        df = pd.DataFrame(s.results)

        # Remove unnecessary columns
        fav_fields = [
            'eid', 'creator', 'doi', 'title', 'afid', 'affilname',
            'author_count', 'author_names', 'author_afids', 'coverDate',
            'coverDisplayDate', 'publicationName', 'issn', 'source_id',
            'eIssn', 'citedby_count', 'fund_sponsor', 'aggregationType',
            'openaccess'
        ]
        df = df[fav_fields]  # cut fields

        # Add info on year and month
        df = add_year_and_month(df, 'coverDate')  # add info columns

        # prepare the faculty_finder NLP tool
        org_info = pd.read_excel(path_org, skiprows=0)
        ff = faculty_finder(organizational_chart=org_info)

        # Per EID, get scopus abstract info, get first vu author and use NLP to find faculty
        # initialize
        df_ab = pd.DataFrame()
        df_au = pd.DataFrame()
        df_ff = pd.DataFrame()
        for counter, cur_eid in enumerate(df.eid.tolist()):
            print('getting abstract info for ' + str(counter + 1) +
                  ' out of ' + str(len(df.eid.tolist())))

            # get abstract
            dict_ab_info = get_scopus_abstract_info(cur_eid)  # !
            dict_ab_info['eid'] = cur_eid

            # get first chosen affiliation author
            dict_auth_info = get_first_chosen_affiliation_author(
                dict_ab_info['abstract_object'], chosen_affid)
            dict_auth_info['eid'] = cur_eid

            # get faculty
            if dict_auth_info['first_affil_author_has_error'] == True:
                print('no chosen affid author found at EID:' + str(cur_eid))
                dict_ff = ff.match_nan()
            else:
                # get faculty
                dict_ff = ff.match(dict_auth_info['first_affil_author_org'])
            dict_ff['eid'] = cur_eid

            df_ab = df_ab.append(dict_ab_info, ignore_index=True)
            df_au = df_au.append(dict_auth_info, ignore_index=True)
            df_ff = df_ff.append(dict_ff, ignore_index=True)

        df = df.merge(df_ab, on='eid', how='left')
        df = df.merge(df_au, on='eid', how='left')
        df = df.merge(df_ff, on='eid', how='left')
        print('df_ab,au,ff done')
        #df.to_csv(r'C:\Users\yasing\Desktop\oa oktober\oa' + my_timestamp() + '.csv')
        # df.to_pickle(path_out + 'oa_base_' + my_timestamp() + str(MY_YEAR) + '.pkl')

        # add unpaywall info
        df = add_unpaywall_columns(df, silent=False)  # !
        # add deal info
        df = add_deal_info(path_deals=path_deals, path_isn=path_isn, df_b=df)

        # add corresponding author info
        df = (corresponding_author_functions().add_corresponding_author_info(
            df=df, vu_afids=vu_afids, ukb_afids=all_vsnu_sdg_afids))

        # post-process
        df['upw_oa_color_category'] = df.upw_oa_color.apply(fn_cats)
        df['upw_oa_color_verbose'] = df['upw_oa_color'].apply(
            lambda x: 'unknown' if x is np.nan else x)

        # save it
        # save to pickle with abstract_object, for now
        # df.to_pickle(path_out + 'oa' + my_timestamp() + str(MY_YEAR) + '.pkl')
        # save to csv without abstract_object
        if do_save:
            df.drop(columns=['abstract_object']).to_csv(
                path_out + 'oa' + my_timestamp() + str(MY_YEAR) + '.csv')

        # diagnose
        # drop-out analysis ('verval-analyse')
        print('verval-analyse')
        print('aantal scopus publicaties: ' + str(len(df)))
        print('api error: abstract API: ' +
              str(len(df[df.abstract_error_message == 'abstract api error'])))
        print('api error: authgroup/afdelinginfo: ' +
              str(df.no_author_group_warning.sum()))  # ab.authgroup error
        print('api error: authgroup.x/afdelinginfo details: ' +
              str(len(df[df.first_affil_author_has_error == True])))  # ab.authgroup ok, error deeper in it
        print('api missing data: data afdelingsinfo ontbreekt no1: ' + str(
            len(df[(df.first_affil_author == None)
                   & (df.first_affil_author_has_error == False)])))
        print('api missing data: data afdelingsinfo ontbreekt no2: ' +
              str(len(df[df.first_affil_author_org == None])))
        # only from here on do you have data to work with
        print('no match: no faculty name match and bag of words only has trivial words (zoals lidwoorden en Amsterdam): ' +
              str(len(df[df.ff_message == 'no faculty name match and bag of words only has trivial words'])))
        print('no match: no faculty name match and no bag of words match despite non-trivial words (vaak VUMC, soms typo): ' +
              str(len(df[df.ff_message == 'no faculty name match and no bag of words match despite non-trivial words'])))
        print('aantal matches: ' + str(len(df[df.ff_score > 0])))
        # diagnostics can be improved further by capturing the last 6 fails too

        # print done
        print('done')

        # extra: post-process
        ##df = pd.read_csv(r'C:\Users\yasin\Desktop\oa new csv\OA_VU2018_met_corresponding_authors.csv')
        ##list(df)

        # this also drops abstract_object (!)
        df2 = df[[
            'eid', 'doi', 'title', 'year', 'publicationName', 'issn', 'eIssn',
            'fund_sponsor', 'aggregationType', 'first_affil_author',
            'first_affil_author_org', 'ff_match', 'ff_match_subgroup',
            'ff_message', 'ff_provided_organization_string', 'ff_score',
            'ff_terms', 'upw_free_fulltext_url', 'upw_is_boai_license',
            'upw_is_free_to_read', 'upw_is_subscription_journal',
            'upw_license', 'upw_oa_color_category', 'upw_oa_color_verbose',
            'upw_oa_color',  # internal
            'deal_name', 'deal_owner', 'deal_discount',
            'deal_discount_verbose', 'deal_owner_verbose',
            'corresponding_author_surname', 'match_affiliation_id',
            'match_surname', 'match_indexed_name', 'match_auid',
            'match_aut_score', 'is_corresponding_author_a_vu_author',
            'is_corresponding_author_a_ukb_author'
        ]]

        col_rename_dict = {
            'publicationName': 'journal_name',
            'first_affil_author': 'first_VU_author',
            'first_affil_author_org': 'first_VU_author_raw_organization_info',
            'ff_match': 'faculty_(matched)',
            'ff_match_subgroup': 'subgroup_(matched)',
            'ff_message': 'diagnostics: ff message',
            'ff_provided_organization_string': 'diagnostics: ff raw input ',
            'ff_score': 'diagnostics: ff score',
            'ff_terms': 'diagnostics: ff matching words',
            'upw_free_fulltext_url': 'fulltext_free_url',
            'upw_is_boai_license': 'is_a_boai_license',
            'upw_is_free_to_read': 'is_free_to_read',
            'upw_is_subscription_journal': 'is_a_subscription_journal',
            'upw_license': 'license',
            #'upw_oa_color_category': '',  # internal
            'upw_oa_color_verbose': 'open_access_color',
            #'deal_name',
            'deal_owner': 'deal_owner_raw',
            #'deal_discount_verbose',  # internal
            'deal_owner_verbose': 'deal_scope',
            #'corresponding_author_surname',
            'match_affiliation_id': 'corresponding_author_affiliation_id_(matched)',
            'match_surname': 'corresponding_author_surname_(matched)',
            'match_indexed_name': 'corresponding_author_indexed_name_(matched)',
            'match_auid': 'corresponding_author_author_id_(matched)',
            'match_aut_score': 'diagnostics:corresponding_author_match_score'
            # 'is_corresponding_author_a_vu_author',
            # 'is_corresponding_author_a_ukb_author'
        }
        df2 = df2.rename(columns=col_rename_dict)

        def get_contact_point(row):
            if row.is_corresponding_author_a_vu_author is True:
                res = row['corresponding_author_indexed_name_(matched)']
            else:
                res = row['first_VU_author']
                # in a workflow, PURE should be checked for the author's current
                # faculty/group (possibly manually/automatically)
            return res

        df2['vu_contact_person'] = df2.apply(get_contact_point, axis=1)

        if do_save:
            df2.to_csv(path_out + 'knip_OA_VU' + str(MY_YEAR) +
                       '_met_corresponding_authors.csv')
            df2.to_excel(path_out + 'knip_OA_VU' + str(MY_YEAR) +
                         '_met_corresponding_authors.xlsx')

        dict_output[MY_YEAR] = df2

    print('done with scopus arm')
    return dict_output
def query_scopus(query_str):
    print(query_str)
    s = ScopusSearch(query_str, refresh=1200)
    print(f"Obtained {s.get_results_size()} results")
    return s
# and use pip install XX, where XX is the package name and version
# the packages required by this toolbox are in requirements.txt
# and pip installing them one by one will finish this task
# Alternatively, once available, a PyPI package can be downloaded which
# will do all these installations automatically. This will be released in the future.
#
from pybliometrics.scopus import ScopusSearch
# knowledge of python packages is assumed
# if that command failed, install the package pybliometrics
# during your first run an api-key will be asked, get one from scopus
# hint: in production code always put all imports at the top
#
#
# now send out the query
# easy querying of Scopus
s = ScopusSearch(my_query)
# the variable s stores the results
#
# next we turn it into a pandas dataframe for easy handling
# we use s.results to make the wrapper return the results in suitable format
# and pandas for data handling
import pandas as pd
df = pd.DataFrame(s.results)
#
# now the data can be printed
print(df.head())
#
# this wraps up how to get scopus data through python automatically

# example 1: enriching your data with Altmetric
# scopus is not everything
def get_piis(self, term_list, year_list, pii_path,
             config_path='/Users/nisarg/.scopus/config.ini', keymaster=False):
    """
    This should be a standalone method that receives a list of journals (issns),
    a keyword search, an output path and a path to clear the cache. It should be
    mappable to multiple parallel processes.
    """
    fresh_keys = self.API_list

    journal_frame = self.make_jlist(
        jlist_url='https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls',
        journal_strings=['chemistry', 'energy', 'molecular', 'atomic',
                         'chemical', 'biochem', 'organic', 'polymer',
                         'chemical engineering', 'biotech', 'colloid'])

    if pii_path[-1] != '/':
        raise Exception('Output file path must end with /')

    if '.scopus/scopus_search' not in self.cache_path:
        raise Exception('Cache path is not a sub-directory of the scopus_search. '
                        'Make sure cache path is correct.')

    # Two lists whose values correspond to each other
    issn_list = journal_frame['ISSN'].values
    journal_list = journal_frame['Journal_Title'].values

    # Find and replace slashes and spaces in names for file storage purposes
    for j in range(len(journal_list)):
        if ':' in journal_list[j]:
            journal_list[j] = journal_list[j].replace(':', '')
        elif '/' in journal_list[j]:
            journal_list[j] = journal_list[j].replace('/', '_')
        elif ' ' in journal_list[j]:
            journal_list[j] = journal_list[j].replace(' ', '_')

    # Build the dictionary that can be used to sequentially query elsevier for
    # different journals and years
    query_dict = self.build_query_dict(term_list, issn_list, year_list)

    # Must write to memory, clear cache, and clear a dictionary upon starting
    # every new journal
    for i in range(len(issn_list)):
        # At the start of every year, clear the standard output screen
        os.system('cls' if os.name == 'nt' else 'clear')
        paper_counter = 0

        issn_dict = {}
        for j in range(len(year_list)):
            # for every year in every journal, query the keywords
            print(f'{journal_list[i]} in {year_list[j]}.')

            # Want the sole 'keymaster' process to handle 429 responses by
            # swapping the key.
            if keymaster:
                try:
                    query_results = ScopusSearch(
                        verbose=True, query=query_dict[issn_list[i]][year_list[j]])
                except Scopus429Error:
                    print('entered scopus 429 error loop... replacing key')
                    newkey = fresh_keys.pop(0)
                    config["Authentication"]["APIKey"] = newkey
                    time.sleep(5)
                    query_results = ScopusSearch(
                        verbose=True, query=query_dict[issn_list[i]][year_list[j]])
                    print('key swap worked!!')
            # If this process isn't the keymaster, try a query.
            # If it excepts, wait a few seconds for keymaster to replace key and try again.
            else:
                try:
                    query_results = ScopusSearch(
                        verbose=True, query=query_dict[issn_list[i]][year_list[j]])
                except Scopus429Error:
                    print('Non key master is sleeping for 15... ')
                    time.sleep(15)
                    query_results = ScopusSearch(
                        verbose=True, query=query_dict[issn_list[i]][year_list[j]])
                    # at this point, the scopus 429 error should be fixed...
                    print('Non key master slept, query has now worked.')

            # store relevant information from the results into a dictionary
            # pertaining to that query
            year_dict = {}
            if query_results.results is not None:
                # some of the query results might be of type None
                for k in range(len(query_results.results)):
                    paper_counter += 1

                    result_dict = {}
                    result = query_results.results[k]

                    result_dict['pii'] = result.pii
                    result_dict['doi'] = result.doi
                    result_dict['title'] = result.title
                    result_dict['num_authors'] = result.author_count
                    result_dict['authors'] = result.author_names
                    result_dict['description'] = result.description
                    result_dict['citation_count'] = result.citedby_count
                    result_dict['keywords'] = result.authkeywords

                    year_dict[k] = result_dict

                # Store all of the results for this year in the dictionary
                # pertaining to a certain journal
                issn_dict[year_list[j]] = year_dict
            else:
                # if it was a None type, we will just store the empty dictionary as json
                issn_dict[year_list[j]] = year_dict

        # Store all of the results for this journal in a folder as json file
        os.mkdir(f'{pii_path}{journal_list[i]}')
        with open(f'{pii_path}{journal_list[i]}/{journal_list[i]}.json', 'w') as file:
            json.dump(issn_dict, file)

        with open(f'{pii_path}{journal_list[i]}/{journal_list[i]}.txt', 'w') as file2:
            file2.write(f'This file contains {paper_counter} publications.')
nodes = pd.DataFrame()
nodes = nodes.append({"id": "",
                      "title": ab.title,
                      "sourcetitle": ab.sourcetitle_abbreviation,
                      "publicationyear": ab.coverDate[0:4],
                      "eid": ab.eid,
                      "gen": '0'}, ignore_index=True)

ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'
ref_df2 = pd.concat([ref_df['eid'], ref_df['id'], ref_df['publicationyear'],
                     ref_df['sourcetitle'], ref_df['title'], ref_df['gen']],
                    axis=1,
                    keys=['eid', 'id', 'publicationyear', 'sourcetitle', 'title', 'gen'],
                    sort=True)
#ref_df2 = ref_df2.drop(18)
nodes = nodes.append(ref_df2, ignore_index=True, sort=True)

for row in ref_df2.itertuples():
    edges.append((row.eid, ab.eid))

len(nodes)

s = ScopusSearch(ab.eid)
for x in s.results:
    if (x.eid not in list(nodes['eid'])):
        nodes = nodes.append({"id": "",
                              "title": x.title,
                              "sourcetitle": "",
                              "publicationyear": x.coverDate[0:4],
                              "eid": x.eid,
                              "gen": '1'}, ignore_index=True)
        print(x.title)
        edges.append((ab.eid, x.eid))

print(len(nodes))

for y in ab.references:
    try:
        refs = AbstractRetrieval(y.id, view="FULL")
        if (refs.references != None):
            ref_df = pd.DataFrame(refs.references)
def import_scopus(ctx, verbose, start):
    """
    Import scopus publication records for the authors of the pubtrack application.

    This command will first fetch all the information about the authors, which are
    defined within the pubtrack app. It uses the scopus author ID's of these authors
    to send requests to the scopus database. The publications of these replies are
    then evaluated and posted into the pubtrack app.
    """
    # SETTING UP PUBTRACK WRAPPER
    config = ctx.obj['config']
    pubtrack = Pubtrack(config)

    # SETTING UP SCOPUS WRAPPER
    try:
        pybliometrics.scopus.utils.create_config()
    except FileExistsError:
        pass
    finally:
        scopus_config['Authentication']['APIKey'] = config.get_scopus_key()

    # FETCHING META AUTHOR INFORMATION FROM PUBTRACK
    click.secho('Fetching author information from pubtrack.')
    author_id_name_map = {}
    meta_authors = pubtrack.meta_author.get()['results']
    for meta_author in meta_authors:
        for author in meta_author['authors']:
            # "author_name_kitopen" returns a string with the authors name. This
            # function essentially formats the name in a way so that it can be used
            # in a query string for the KITOpen database.
            full_name = '{} {}'.format(author['first_name'], author['last_name'])
            scopus_id = author['scopus_id']
            author_id_name_map[scopus_id] = full_name
            out(verbose, ' > Adding author "{} ({})" to be processed'.format(
                full_name, scopus_id))
    click.secho('==> Processing total of {} authors'.format(len(author_id_name_map)))

    # QUERY SCOPUS DATABASE
    click.secho('Querying scopus database for the publications of those authors.')
    date_limit = datetime.datetime(year=start, month=1, day=1)
    for author_id, author_name in author_id_name_map.items():
        publication_count = 0
        search = ScopusSearch(f'AU-ID ( {author_id} )')
        out(verbose, ' | Query "AU-ID ( {} )"'.format(author_id))

        for result in search.results:
            # We'll only take publications, which have a DOI
            if result.doi is None:
                continue

            # requesting the detailed information from the scopus database for the
            # current publication from the search results
            try:
                abstract_retrieval = AbstractRetrieval(result.doi)
            except Exception as e:
                out(verbose, ' # Could not retrieve publication "{}"'.format(result.doi),
                    fg='yellow')
                continue

            # If the publication is older than the date limit, it will be discarded
            publication_date = datetime.datetime.strptime(
                abstract_retrieval.coverDate, '%Y-%m-%d')
            if publication_date <= date_limit:
                out(verbose, ' # Publication too old "{}"({})'.format(
                    result.doi, publication_date), fg='yellow')
                continue
            else:
                out(verbose, ' > Fetched publication "{}"'.format(result.doi))

            adapter = ScopusPublicationAdapter(abstract_retrieval)
            publication = adapter.get_publication()

            # Filtering the authors according to the AUTHOR_LIMIT, which has been set.
            # We cannot just use the first few authors however, we need to make sure
            # that the author, from which we have this publication in the first place
            # is in there. The rest just gets filled up...
            authors = []
            for author in publication['authors']:
                if author['scopus_id'] in author_id_name_map.keys() \
                        or len(authors) < config.get_author_limit():
                    authors.append(author)
            publication['authors'] = authors

            # Now we try to actually POST the publication to the pubtrack REST API
            try:
                pubtrack.import_publication(publication)
                publication_count += 1
                out(verbose, ' * Added to pubtrack: "{}"'.format(publication['title']),
                    fg='green')
            except Exception as e:
                if str(e) == 'uuid':
                    out(verbose, ' ! Error while posting to pubtrack: Already exists!',
                        fg='red')
                else:
                    out(verbose, ' ! Error while posting to pubtrack: {}'.format(str(e)),
                        fg='red')
                continue

        out(True, ' --> Total of {} publications imported from author {}'.format(
            publication_count, author_id), fg='green', bold=True)
from collections import namedtuple

from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import ScopusSearch

order = 'eid doi pii pubmed_id title subtype creator afid affilname '\
        'affiliation_city affiliation_country author_count author_names '\
        'author_ids author_afids coverDate coverDisplayDate publicationName '\
        'issn source_id eIssn aggregationType volume issueIdentifier '\
        'article_number pageRange description authkeywords citedby_count '\
        'openaccess fund_acr fund_no fund_sponsor'
doc = namedtuple('Document', order)

# Set to refresh=False because of citation count
s_au = ScopusSearch('AU-ID(24320488600)', refresh=False)
s_j = ScopusSearch('SOURCE-ID(22900) AND PUBYEAR IS 2010', refresh=False)
q_empty = 'SOURCE-ID(19700188323) AND PUBYEAR IS 1900'
s_empty = ScopusSearch(q_empty, refresh=False)


def test_get_eids_author():
    assert_equal(s_au.get_eids(), ['2-s2.0-26444452434'])


def test_get_eids_journal():
    assert_equal(len(s_j.get_eids()), 118)


def test_get_results_size():
    assert_equal(s_au.get_results_size(), 1)
]  # <-- edit here, adding your access key
config["Authentication"]["APIKey"] = _keys.pop()
api_view = "META"

# Uncomment the next line to configure the access key during the program's first run.
# create_config()

# Set up the search criteria.
query = 'TITLE-ABS-KEY("protected area" OR "conservation" OR "ecology" OR "marine protected" OR "national forest")' \
        ' AND TITLE-ABS-KEY("remote sensing" OR "earth observation" OR "Landsat" OR "Lidar" OR "MODIS" OR "Radar")' \
        ' AND TITLE-ABS-KEY("Brazil" OR "Brasil")' \
        ' AND PUBYEAR BEF 2021 AND PUBYEAR AFT 1999' \
        ' AND LANGUAGE(english OR portuguese)'

# Create a ScopusSearch object holding the search parameters.
scopus = ScopusSearch(query, max_entries=None, subscriber=False, verbose=True)

# Report the number of records collected by the API.
print("Número total de publicações: {}.".format(scopus.get_results_size()))

# Get a list of all digital identifiers (EIDs) retrieved from the API during the search.
eids_documentos = scopus.get_eids()

# Collect the article information from the EIDs using the helper function.
df = coletar_artigos(eids_documentos, api_view)

# Store all entries in a .csv file for later inspection.
df.to_csv("data/resultado_pesquisa_scopus.csv", index=False, quoting=csv.QUOTE_ALL)

"""-------------------------------------------------------------
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus."""
    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    if author.auid in authors_cache:
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                    else:
                        authors_cache[author.auid] = AuthorRetrieval(author.auid).orcid
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                else:
                    authors.append(Author(name=author.indexed_name,
                                          orcid=None,
                                          affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            try:
                                affiliation = ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(
                                Affiliation(name=affiliation.affiliation_name,
                                            city=affiliation.city,
                                            country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None
        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
AUTHORS = {}
meta_authors = pubtrack.meta_author.get()['results']
for meta_author in meta_authors:
    for author in meta_author['authors']:
        if author['scopus_id']:
            full_name = '{} {}'.format(author['first_name'], author['last_name'])
            AUTHORS[author['scopus_id']] = full_name
            logger.info(' * Adding author {}({}) to be processed'.format(
                full_name, author['scopus_id']))

logger.info('==> Processing total of {} authors'.format(len(AUTHORS)))

DATE_LIMIT = datetime.datetime(year=SINCE, month=1, day=1)

for author_id, full_name in AUTHORS.items():
    publication_count = 0
    search = ScopusSearch(f'AU-ID ( {author_id} )')
    logger.info('STARTING SEARCH FOR AUTHOR {}({})'.format(full_name, author_id))

    for result in search.results:
        # We'll only take publications, which have a DOI
        if result.doi is None:
            continue

        # Requesting the detailed information from the scopus database for the
        # current publication from the search results
        try:
            abstract_retrieval = AbstractRetrieval(result.doi)
            logger.info(' * FETCHED publication {}'.format(result.doi))
        except Exception as e:
            logger.error(' ! Could not retrieve scopus abstract for DOI "{}". ERROR: {}'.format(
                result.doi, str(e)))