def search(author, years, results, affiliation=None):
    try:
        client = WosClient(user_id, password)
        client.connect()
    except suds.WebFault as e:
        print_('Username and/or password not valid, or requests limit exceeded')
        print_(e)
        exit(1)

    # Build the tree
    tree = query(client, author, years, results, affiliation)

    # Extract information from the tree
    results = tree_extractor(tree)

    # Draw the table
    _draw_table(results)

    return results
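# A minimal usage sketch for search() above. It assumes `user_id` and
# `password` exist at module level (as the function expects); the author
# string and the years format ('Surname, Initial' and a plain range string
# here) are guesses at whatever query() accepts.
if __name__ == '__main__':
    records = search('Smith, J', '2015-2020', 10)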
def main():
    global sResultFlag
    timestring = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    connIn = sqlite3.connect('paper2.db')
    dfSourceTitles = pandas.read_sql('select * from paper2', connIn)
    dfScopus = pandas.read_pickle('pickles/scopusData.pkl')
    connOut = sqlite3.connect('publications' + timestring + '.db')
    cursorOut = connOut.cursor()
    createTables(cursorOut)
    loopIdx = 0
    publicationCount = len(dfSourceTitles.index)
    while loopIdx < publicationCount:
        sid = input('Please put in SID of Web of Science session: ')
        with WosClient(SID=sid, lite=True, close_on_exit=False) as client:
            for i in range(0, 2400):
                if loopIdx >= publicationCount:
                    break
                sResultFlag = SearchResult.NOTFOUND
                publication = Publication(
                    title=dfSourceTitles.loc[loopIdx]['title'],
                    journal=dfSourceTitles.loc[loopIdx]['journal'],
                    abstract=dfSourceTitles.loc[loopIdx]['abstract'])
                publication = searchPublicationInfoWos(publication, client)
                if sResultFlag == SearchResult.NOTFOUND:
                    publication = searchPublicationInfoDblp(publication)
                if sResultFlag == SearchResult.NOTFOUND:
                    publication = searchPublicationInfoScopus(
                        publication, dfScopus,
                        dfSourceTitles.iloc[loopIdx]['index'])
                if sResultFlag == SearchResult.MULTI:
                    with open('logMulti' + timestring + '.txt', 'a',
                              encoding='utf-8') as f:
                        f.write('multiple records found: "' +
                                publication.title + '"\n')
                    sResultFlag = None
                if sResultFlag == SearchResult.NOTFOUND:
                    with open('logNotFound' + timestring + '.txt', 'a',
                              encoding='utf-8') as f:
                        f.write('Not found: "' + publication.title + '"\n')
                    sResultFlag = None
                cursorOut.execute(
                    "INSERT INTO publication VALUES(?,?,?,?,?,?)",
                    (int(dfSourceTitles.loc[loopIdx]['id']),
                     publication.title, publication.journal,
                     publication.abstract, str(publication.authors),
                     publication.year))
                connOut.commit()
                loopIdx += 1
                print('iteration: ' + str(loopIdx))
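# Instead of pasting a SID by hand, a session ID can be pulled from an
# authenticated client. A sketch, assuming username/password login is
# available; it relies on the private `_SID` attribute that the DAIS
# crawler snippet at the end of this file also reads.
def get_sid(username, password):
    bootstrap = WosClient(username, password)
    bootstrap.connect()
    return bootstrap._SID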
def fetchJson(self, user):
    with WosClient(user='******', password='******') as client:
        # wos.utils.query returns the record XML directly, so interpolate
        # the author name into the query and convert the response.
        contents = wos.utils.query(client, 'AU=%s' % user)
        # file = "../../../recordForAbrahms"
        Json = self.convertXMLToJson(contents)
        return json.loads(Json)
def queryWos(WOS, start_time):
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for title and publisher info")
        for i, id in enumerate(WOS):
            try:
                root = wos.utils.query_by_id(client, id)
                root_list.append(root)
            except Exception:
                print("Some error while adding roots. Waiting 15 sec")
                time.sleep(15)
            if (i + 1) % 50 == 0:
                print("Queries so far: {}. Time used: {:0.1f} min".format(
                    (i + 1), (time.time() - start_time) / 60.0))
            time.sleep(0.5)
    return root_list
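# The except clause above swallows every error. A small retry sketch as an
# alternative (not part of the original code): it assumes suds.WebFault is
# the throttling signal and gives up after a fixed number of attempts.
def query_by_id_with_retry(client, id, attempts=3, wait=15):
    for attempt in range(attempts):
        try:
            return wos.utils.query_by_id(client, id)
        except suds.WebFault:
            print("WebFault on attempt {}, waiting {} sec".format(attempt + 1, wait))
            time.sleep(wait)
    raise RuntimeError("query_by_id failed after {} attempts".format(attempts))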
def wos_main(querylist, d_path):
    '''Main function for calling Web of Science; returns the merged
    results as a DataFrame.'''
    # @TODO incorporate: article types, subject areas, countries.
    with WosClient() as client:
        for item in querylist:
            # Reformat the string from querylist into a WoS topic query,
            # turning braces into quotes.
            new_item = " AND ".join(['TS=' + x.replace('{', '"').replace('}', '"')
                                     for x in item.split(' AND ')])
            now = datetime.now()
            dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
            print(dt_string + ': searching WOS: ' + new_item)
            new_wos_query(new_item, client, d_path)
            time.sleep(60)
    df = merge_dfs(d_path)
    df.to_csv(os.path.join(d_path, 'wos', 'wos_search.csv'), index=False)
    return df
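# For reference, the brace-to-quote rewrite above turns a stored query like
# '{machine learning} AND {citizen science}' into the WoS topic-search form
# 'TS="machine learning" AND TS="citizen science"'. The same transformation
# as a standalone sketch:
def to_wos_topic_query(item):
    return " AND ".join('TS=' + x.replace('{', '"').replace('}', '"')
                        for x in item.split(' AND '))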
def queryRelatedWos(WOS, start_time):
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records_v2(client, id, count=2)
                roots = records.findall("REC")
                for root in roots:
                    root_list.append(root)
            except Exception:
                print("Some error while adding roots. Waiting 5 sec")
                time.sleep(5)
            if (i + 1) % 50 == 0:
                print("Queries so far: {}. Time used: {:0.1f} min".format(
                    (i + 1), (time.time() - start_time) / 60.0))
            time.sleep(0.5)
    return root_list
def queryWoS(titles, years):
    # Create an empty list which should contain info later
    info = []
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        # Loop through the titles (search parameter)
        for i, title in enumerate(titles):
            # Remove '|' from the title
            title = title.replace("|", "")
            # Get year published
            year = years[i]
            # Create year query with +/- 1 year
            query_string_year = ('PY=(' + str(year - 1) + ' OR ' + str(year) +
                                 ' OR ' + str(year + 1) + ')')
            # Check if the title contains any operators (like AND, OR, NOT)
            if isOperator(title):
                title = removeOperator(title)
            # Create title query
            query_string_title = 'TI=' + title
            # Create query AND operator string
            query_AND_operator = ' AND '
            # Create the query string
            query_string = query_string_title + query_AND_operator + query_string_year
            print(query_string)
            # Perform the query on the WoS engine
            xmlString = ""
            try:
                xmlString = wos.utils.query(client, query_string, count=1)
            except Exception:
                print("Some error occurred while querying the WoS database")
            print(xmlString)
            # Convert to XML object
            if xmlString != "":
                root = ElementTree.XML(xmlString)
                # Convert XML object to a dictionary
                xmlDict = XmlDictConfig(root)
                # Add the dictionary to the info list
                info.append(xmlDict)
            # Be 'nice' to WoS and don't bomb the server with requests
            time.sleep(1)
    return info
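# The title/year query construction above, pulled out as a hypothetical
# helper to show the string it produces:
def build_query(title, year):
    year_part = ('PY=(' + str(year - 1) + ' OR ' + str(year) +
                 ' OR ' + str(year + 1) + ')')
    return 'TI=' + title + ' AND ' + year_part

# build_query('Deep learning', 2017)
# -> 'TI=Deep learning AND PY=(2016 OR 2017 OR 2018)'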
""" Garbage and convoluted code attempting to parse the formless REF data using web of science. """ import glob import difflib import json from wos import WosClient import wos.utils from xmljson import badgerfish as bf refs = {} errors = 0 total_cit = 0 with WosClient() as client: for infile in glob.glob("stage1/*.json"): total_cit += 1 with open(infile, "r") as infile_data: data = json.load(infile_data) if data["valid"] is False: continue all_Z = set() all_found = True for cit in data["citations"]: if cit["valid"] is False: all_found = False break
        'pubyear': summary.find('pub_info')['pubyear']
    }


# In[ ]:


# total_batch, nth_batch = 8, 0
total_batch, nth_batch = int(sys.argv[1]), int(sys.argv[2])
print(f'Now batch querying {nth_batch} of total {total_batch} batches')
batch_size = len(all_uids) // total_batch
batch_uids = all_uids[batch_size * nth_batch:batch_size * (nth_batch + 1)]


# In[ ]:


summary_client = WosClient()
refs_client = WosClient()
inv_refs_client = WosClient()
summary_client.connect()
refs_client.connect()
inv_refs_client.connect()

start_time = time.time()
for idx, wos_code in enumerate(batch_uids, 1):
    if idx % 150 == 0:
        print(f'Downloaded info for {idx} articles, '
              f'time elapsed: {time.time() - start_time:.4f}')
        start_time = time.time()
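# Note that the floor division above drops the remainder: with 100 uids and
# total_batch = 8, batch_size is 12 and the last 4 uids never land in any
# batch. A sketch of an even split that covers every uid (an alternative,
# not what the code above does):
def batch_bounds(n_items, total_batch, nth_batch):
    base, extra = divmod(n_items, total_batch)
    start = nth_batch * base + min(nth_batch, extra)
    end = start + base + (1 if nth_batch < extra else 0)
    return start, end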
def queryWos(tcp_data, start, start_sample, end_sample):
    # Return a small list of titles, year and ids for testing
    # with open("data/wos_not_meta_data.json", "r") as f:
    #     small_list_ids = json.load(f)

    # Create empty lists which will contain info later
    info = []
    not_found = []
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")
        # Loop through the titles (search parameter)
        for i, id in enumerate(tcp_data.index.values.tolist()[start_sample:end_sample]):
            # Replace '|' with ',' and strip characters that break the query
            title = (tcp_data.loc[id].Title.replace("|", ",").replace("?", "")
                     .replace('"', '').replace("/", " ").replace("-", " ")
                     .replace(":", " "))
            title = re.sub(r'\([^)]*\)', '', title)
            title = title.replace("(", "").replace(")", "")
            # Get year published
            year = tcp_data.loc[id].Year
            # Create year query with +/- 1 year
            query_string_year = ('PY=(' + str(year - 1) + ' OR ' + str(year) +
                                 ' OR ' + str(year + 1) + ')')
            # Check if the title contains any operators (like AND, OR, NOT)
            if isOperator(title):
                title = removeOperator(title)
            # Create title query
            query_string_title = 'TI=' + title
            # Create query AND operator string
            query_AND_operator = ' AND '
            # Create the query string
            query_string = query_string_title + query_AND_operator + query_string_year
            # print(query_string)

            # Perform the query on the WoS engine
            root = None
            wait = 2
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as e:
                print("suds.WebFault: Waiting {} sec".format(wait))
                print(e)
                time.sleep(wait)
            except Exception:
                print("Some other error occurred, sleeping {} sec".format(wait))
                time.sleep(wait)

            if root is None:
                # Record the id of the publication that was not found
                not_found.append(id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Normalize both titles before comparing them
                tcp_data_title = tcp_data.loc[id].Title.replace("|", ",")
                tcp_data_title = re.sub(r'\([^)]*\)', '', tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)
                wos_title = getTitle(root)
                wos_title = re.sub(r'\([^)]*\)', '', wos_title)
                wos_title = removePunct(wos_title)
                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                print()
                if difflib.SequenceMatcher(None, tcp_data_title, wos_title).ratio() > 0.95:
                    if getAbstract(root) is not None:
                        # Keep the (root, id) pair for later processing
                        info.append((root, id))
                        print("Successfully retrieved is now {}".format(len(info)))
                    else:
                        print()
                        print("Abstract was none")
                        print()
                else:
                    print("titles not alike...")
                    not_found.append(id)
            print("Number of queries so far is {}. Time used is {:0.1f} minutes".format(
                (i + 1), (time.time() - start) / 60.0))
            time.sleep(0.5)
    return info, not_found
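# A quick illustration of the 0.95 similarity cutoff used above, with
# difflib from the standard library (the titles are made-up examples):
import difflib

a = "a survey of deep learning methods"
b = "a survey of deep learning method"
print(difflib.SequenceMatcher(None, a, b).ratio())  # ~0.98, would pass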
from wos import WosClient
import wos.utils

with WosClient('', '') as client:
    print(wos.utils.query(client, 'AU=Goncalves, Renata AND PY=2017'))
    # print(wos.utils.query(client, 'TI=(GRONS: a comprehensive genetic resource of nicotine and smoking)'))
    # print(wos.utils.query(client, 'DOI=BUSH KAREN AND PY=(2015 AND 2016)'))
def wos_fetch(output_file, username, password):
    with WosClient(username, password) as client:
        print(wos.utils.query(client, 'AU=Beckner Wesley'))
# In[11]:


import re

crawled_authors = glob.glob('data/author_to_articles/*.json')
crawled_authors = set(
    os.path.basename(filename).split('.')[0] for filename in crawled_authors)
print(f'There are {len(crawled_authors)} collected authors; '
      f'{len(authors) - len(crawled_authors)} authors remain.')
remained_authors = [
    author for author in authors if author[1] not in crawled_authors
][::-1]

wc = WosClient()
wc.connect()
for author_order, (author_name, author_id) in enumerate(remained_authors, 1):
    print(f'Finding articles of author {author_order}: '
          f'{author_name} ({author_id})')
    # Reconnect every 100 authors to refresh the session
    if author_order % 100 == 0:
        wc.close()
        wc = WosClient()
        wc.connect()
    query_url = (
        "https://apps.webofknowledge.com/"
        f"InboundService.do?product=WOS&daisIds={author_id}"
        "&Func=Frame&DestFail=http%3A%2F%2Fwww.webofknowledge.com"
        f"&SrcApp=RRC&locale=zh_TW&SrcAuth=RRCPubList&SID={wc._SID}"
        "&customersID=RRCPubList&mode=SourceByDais&IsProductCode=Yes&Init=Yes"
        "&viewType=summary&action=search"
        "&action=changePageSize&pageSize=50"
    )
    wos_codes = []
    while True: