def download(save_folder, issn, date_start, date_end, api_key=None, tdm_token=None, arbitrary_xml_dl=False, filewriter=None):
    """Download full-text XML/HTML for a journal's articles via the Crossref API.

    Pages through all journal-article works for ``issn`` published between
    ``date_start`` and ``date_end``, collects candidate full-text links per
    article, and saves the first link that downloads successfully under
    ``save_folder``.

    Args:
        save_folder: path prefix the sanitized title is appended to directly
            (include a trailing separator).
        issn: journal ISSN to query.
        date_start, date_end: publication-date window ('YYYY-MM-DD' strings).
        api_key: optional query-string fragment (e.g. '&apiKey=...') appended
            verbatim to each download URL — some publishers (e.g. LISR)
            require it for full text.
        tdm_token: optional Crossref Clickthrough (TDM) token, sent as the
            CR-Clickthrough-Client-Token request header.
        arbitrary_xml_dl: if True, skip content-type inspection and just try
            each article's first advertised link.
        filewriter: optional open, writable file-like object; URLs that fail
            to download are appended to it, one per line.

    Returns:
        Tuple ``(total_results, total_with_pdfs, total_with_xmls,
        total_downloaded, total_unspecified)``.
    """
    cr = Crossref()
    x = cr.journals(ids=issn,
                    works=True,
                    filter={'has_full_text': True,
                            'type': 'journal-article',
                            'from-pub-date': date_start,
                            'until-pub-date': date_end},
                    sort='issued',
                    cursor='*',
                    # mailto='*****@*****.**',
                    limit=20)
    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_unspecified = 0
    total_downloaded = 0
    print('total results >>>>>>>>> ', total_results)
    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print('\tArticle {0} out of {1}'.format(articles_count, total_results))
            # Title is sometimes too long for a filename: strip special
            # characters and cut it to max 85 chars.
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            filename_xml = save_folder + title
            response_data = i['link']
            has_xml = False
            xml_links = set()
            if arbitrary_xml_dl:
                # Blind mode: just try the first advertised link.
                xml_links.add(response_data[0]['URL'])
            else:
                for d in response_data:
                    content_type = d['content-type']
                    intended = d['intended-application']
                    link = d['URL']
                    if ('xml' in content_type or 'html' in content_type
                            or 'xml' in link or 'html' in link):
                        total_with_xmls += 1
                        xml_links.add(link)  # add download link to the list
                        has_xml = True
                    if 'pdf' in str(content_type):
                        total_with_pdfs += 1
                    if ('unspecified' in str(content_type) and not has_xml
                            and intended == 'text-mining'):
                        # Possibly HTML hidden behind an unspecified content-type.
                        total_unspecified += 1
                if not has_xml:
                    print("\t\twarning, no xml data for {}, trying to save it as html".format(xml_links))
            # Download the first link from the list made above that responds OK.
            if len(xml_links) > 0:
                downloaded = False
                for xml in xml_links:
                    if downloaded:
                        break
                    if api_key is not None:
                        # e.g. LISR: the api key is needed to get the full text.
                        response_xml = requests.get(xml + api_key)
                    elif tdm_token is not None:
                        # BUGFIX: the Clickthrough token must be sent as a
                        # request header, not as URL query parameters
                        # (requests.get(xml, params) put it in the query string).
                        response_xml = requests.get(
                            xml,
                            headers={'CR-Clickthrough-Client-Token': tdm_token})
                    else:
                        # e.g. jdoc works without the key.
                        response_xml = requests.get(xml)
                    if response_xml.ok:
                        # Add the extension to the file name (location+filename+ext).
                        filename_xml += '.xml' if has_xml else '.html'
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        # `response.content` is always bytes on a completed
                        # response, so count the save unconditionally.
                        total_downloaded += 1
                    else:
                        # Save the failing link for later inspection.
                        print(response_xml.status_code)
                        if filewriter is not None:  # BUGFIX: filewriter defaults to None
                            filewriter.write(xml + "\n")
        # cr.journals limit is 20, so the API returns results in batches of 20;
        # advance to the next batch in x[count]['message']['items'].
        count += 1
    print("Extraction Completed, total={}".format(total_results))
    print(
        "Total with pdf={}, xml={}, unspecified={}, downloaded={}".
        format(total_with_pdfs, total_with_xmls, total_unspecified, total_downloaded))
    return total_results, total_with_pdfs, total_with_xmls, total_downloaded, total_unspecified
from habanero import Crossref
from pprint import pprint
from graph_tool.all import *

cr = Crossref(mailto="*****@*****.**")
n = 50
journal_ISSN = "1471-2105"
# Fetch the n most recent works for the journal.
journal_papers = cr.journals(ids=journal_ISSN, works=True, limit=n)


def print_paper_info(journal_papers):
    """Print title, DOI and first author for each paper in a Crossref journal response."""
    for paper in journal_papers["message"]["items"]:
        print("Title: " + paper["title"][0])
        print("DOI: " + paper["DOI"])
        first_author = None
        # BUGFIX: papers can lack an "author" list entirely, and not every
        # author list contains a sequence == "first" entry; the original
        # raised KeyError / TypeError ("str" + None) in those cases.
        for author in paper.get("author", []):
            if author["sequence"] == "first":
                first_author = author["given"] + " " + author["family"]
                break
        if first_author is not None:
            print("First Author: " + first_author)
        else:
            print("First Author: unknown")


# Build a star graph: one root vertex for the journal (labelled by ISSN)
# with one vertex per paper (labelled by DOI) pointing at it.
journal_graph = Graph()
vertex_id = journal_graph.new_vertex_property("string")
journal_root = journal_graph.add_vertex()
vertex_id[journal_root] = journal_ISSN
for paper in journal_papers["message"]["items"]:
    paper_vertex = journal_graph.add_vertex()
    vertex_id[paper_vertex] = paper["DOI"]
    journal_graph.add_edge(paper_vertex, journal_root)
def download(save_folder, issn, date_start, date_end):
    """Download full-text bodies for a journal's articles via the Crossref API.

    Pages through all journal-article works for ``issn`` published between
    ``date_start`` and ``date_end`` and saves the first XML/HTML link per
    article that downloads successfully under ``save_folder``.

    Args:
        save_folder: path prefix the sanitized title is appended to directly.
        issn: journal ISSN to query.
        date_start, date_end: publication-date window ('YYYY-MM-DD' strings).

    Returns:
        Tuple ``(total_results, total_with_pdfs, total_with_xmls,
        total_xml_accessible)``.
    """
    cr = Crossref()
    x = cr.journals(
        ids=issn,
        works=True,
        filter={
            'has_full_text': True,
            'type': 'journal-article',
            'from-pub-date': date_start,
            'until-pub-date': date_end
        },
        sort='issued',
        cursor='*',
        #mailto='*****@*****.**',
        limit=20)
    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_xml_accessible = 0
    print('total results >>>>>>>>> ', total_results)
    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print("Article ", articles_count, " out of ", total_results)
            # Strip special characters (keep spaces) and cut to max 85 chars
            # so the title is a usable filename.
            # (Removed dead locals `my_list` and `total_articles` — the latter
            # was just len() of the title string and was never read.)
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            # NOTE: no extension is appended in this variant of the function.
            filename_xml = save_folder + title
            response_data = i['link']
            pdf_counted = False
            xml_counted = False
            xml_links = set()
            for d in response_data:
                content_type = d['content-type']
                # Count pdf/xml availability at most once per article.
                if 'pdf' in content_type and not pdf_counted:
                    total_with_pdfs += 1
                    pdf_counted = True
                if 'xml' in content_type or 'html' in content_type:  # or 'unspecified' in content_type:
                    response_xml = d['URL']
                    xml_links.add(response_xml)
                    if not xml_counted:
                        total_with_xmls += 1
                        xml_counted = True
            # Save the content from the first working xml/html link.
            if len(xml_links) > 0:
                downloaded = False
                for xml in xml_links:
                    if downloaded:
                        break
                    response_xml = requests.get(xml)
                    if response_xml.ok:
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        total_xml_accessible += 1
        # cr.journals limit is 20; advance to the next batch of results.
        count += 1
    print("Extraction Completed, total={}".format(total_results))
    print("Total with pdf={}, xml={}, downloadable xml={}".format(
        total_with_pdfs, total_with_xmls, total_xml_accessible))
    return total_results, total_with_pdfs, total_with_xmls, total_xml_accessible
# Resolve journal metadata for every DOI logged in the ezproxy database.
# NOTE(review): relies on `sqlite3`, `Journals`, `Works`, `cr` (presumably a
# habanero Crossref() client) and `HTTPError` being defined/imported elsewhere
# in this file — confirm before running this fragment standalone.
conn = sqlite3.connect("../ezproxy-DOI.db")
sqlite_cursor = conn.cursor()
journals = Journals()
works = Works()
# Fetch every non-NULL DOI recorded in the ezproxy log table.
sqlite_cursor.execute("SELECT doi FROM ezproxy_doi WHERE doi IS NOT NULL")
DOIs = sqlite_cursor.fetchall()
#print(DOIs)
# a = cr.journals()
for item in DOIs:
    DOI = item[0]  # fetchall() yields 1-tuples
    print("running..." + DOI)
    try:
        # Look up the work for this DOI, then the journal(s) for its ISSN(s).
        journals = cr.journals(ids=cr.works(ids=DOI)["message"]["ISSN"])
        if "message" in journals:
            # Single journal record — use as-is.
            pass
        elif type(journals) == type(list()):
            # Multiple ISSNs yield a list of records; keep the first.
            journals = journals[0]
        else:
            # Unexpected response shape — skip this DOI.
            continue
        #print(journals)
        print("Journal Title: " + journals["message"]["title"])
        print("Subjects: " + str(journals["message"]["subjects"]))
    except HTTPError:
        # Crossref lookup failed for this DOI; move on to the next one.
        print("HTTPError")
        continue
def download(save_folder, issn, date_start, date_end):
    """Download full-text XML for a journal's articles via the Crossref API.

    Pages through all journal-article works for ``issn`` published between
    ``date_start`` and ``date_end``, logging which content types each article
    advertises, and saves the first XML link per article that downloads
    successfully under ``save_folder``.

    Args:
        save_folder: path prefix the sanitized title is appended to directly.
        issn: journal ISSN to query.
        date_start, date_end: publication-date window ('YYYY-MM-DD' strings).

    Returns:
        Tuple ``(total_results, total_with_pdfs, total_with_xmls,
        total_xml_accessible)``.  (``total_unspecified`` is tracked but,
        as in the original, not returned.)
    """
    cr = Crossref()
    x = cr.journals(
        ids=issn,
        works=True,
        filter={
            'has_full_text': True,
            'type': 'journal-article',
            'from-pub-date': date_start,
            'until-pub-date': date_end
        },
        sort='issued',
        cursor='*',
        #mailto='*****@*****.**',
        limit=20)
    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_unspecified = 0
    total_xml_accessible = 0
    print('total results >>>>>>>>> ', total_results)
    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print('\nArticle {0} out of {1}'.format(articles_count, total_results))
            # Title is sometimes too long, so cut it to max 85 chars.
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            filename_xml = save_folder + title
            response_data = i['link']
            # (Removed unused `pdf_counted`/`xml_counted` flags and the no-op
            # `set(xml_links)` statement from the original.)
            xml_links = set()
            for d in response_data:
                content_type = d['content-type']
                # NOTE: these counters increment once per link, not per
                # article, so an article with several links counts repeatedly.
                if 'xml' in content_type:
                    print('found xml file <<<<<<<<< {}'.format(d))
                    total_with_xmls += 1
                    xml_links.add(d['URL'])  # add download link to the list
                if 'pdf' in str(content_type):
                    print('found pdf file <<<<<<<<< {}'.format(d))
                    total_with_pdfs += 1
                if 'unspecified' in str(content_type):
                    print(
                        'found unspecified type | POSSIBLE HTML <<<<<<<<< {}'.
                        format(d))
                    total_unspecified += 1
            # download files from the list made above
            if len(xml_links) > 0:
                downloaded = False
                for xml in xml_links:
                    print('----------processing xml link: {}'.format(xml))
                    if downloaded:
                        break
                    # for LISR use the api key to get the full text
                    # response_xml = requests.get(xml+'&apiKey=e873ab508be6a1e93c4ba6217c155ad4')
                    # for jdoc it works without the key
                    response_xml = requests.get(xml)
                    if response_xml.ok:
                        filename_xml += '.xml'  # add termination to the file name (location+filename+ext)
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        # `response.content` is always bytes on a completed
                        # response; the original's `is not None` check was a
                        # no-op, so count unconditionally.
                        total_xml_accessible += 1
        # cr.journals limit is 20, so the API returns results in batches of 20;
        # increase count to get the next batch in x[count]['message']['items'].
        count += 1
    print("Extraction Completed, total={}".format(total_results))
    print("Total with pdf={}, xml={}, downloadable xml={}".format(
        total_with_pdfs, total_with_xmls, total_xml_accessible))
    return total_results, total_with_pdfs, total_with_xmls, total_xml_accessible
def get_journal_papers(ISSN):
    # NOTE(review): the `ISSN` parameter is unused and the result of the bare
    # cr.journals() call below is discarded — presumably leftovers; the real
    # query (hard-coded to ISSN "2330-1643") follows further down. Confirm
    # intent before relying on this function.
    cr = Crossref(mailto = "*****@*****.**")
    cr.journals()
    # x = cr.journals(ids = "0740-8188", works =True,
    #                 filter = {'has_full_text':True, 'type': 'journal-article', 'from-pub-date':'2008-01-01', 'until-pub-date':'2018-01-01'},
    #                 sort='issued',
    #                 cursor='*',
    #                 cursor_max=1000,
    #                 limit='200')
    # details of a doi
    # y = cn.content_negotiation(ids="10.1002/asi.24193")
    # DONE SO FAR: 2008-01-01 -> 2010-01-01#, '2010-01-01 -> 2012-01-01, 2012-01-01 -> 2014-01-01, 2014-01-01 -> 2016-01-01, 2016-01-01 -> 2018-01-01
    # Query all full-text journal articles for the hard-coded ISSN in the
    # 2016-2019 window, 20 results per batch (cursor-based deep paging).
    x = cr.journals(ids="2330-1643",
                    works=True,
                    filter={
                        'has_full_text': True,
                        'type': 'journal-article',
                        'from-pub-date': '2016-01-01',
                        'until-pub-date': '2019-01-01'
                    },
                    sort='issued',
                    cursor='*',
                    limit=20)
    count = 0
    articles_count = 0
    my_list = []
    total_results = x[0]['message'].get('total-results')
    print('total results >>>>>>>>> ', total_results)
    # NOTE(review): this loop body appears truncated in this copy of the file —
    # the `for` statement below has no executable body. Recover the missing
    # lines (they likely mirror the other download() variants) before use.
    while (articles_count < total_results):
        for i in x[count]['message']['items']:
            # get the title and remove any special characters, keep spaces
import os.path
import re

# NOTE(review): `Works`, `Journals` and `Crossref` are presumably imported
# from habanero elsewhere in this file — confirm before running standalone.
works = Works()
journals = Journals()
cr = Crossref()
# ISSN for JASIS -> issn=2330-1643
# ISSN for JDOC -> issn=0022-0418
# ISSN for LISR -> issn=0740-8188
# Query full-text journal articles for LISR in a two-month window,
# 20 results per batch (cursor-based deep paging).
x = cr.journals(ids="0740-8188",
                works=True,
                filter={
                    'has_full_text': True,
                    'type': 'journal-article',
                    'from-pub-date': '2008-01-01',
                    'until-pub-date': '2008-03-03'
                },
                sort='issued',
                cursor='*',
                limit=20)
# details of a doi
# y = cn.content_negotiation(ids="10.1002/asi.24193")
count = 0
# NOTE(review): `count` is initialized but never advanced in this fragment,
# and only the first result page is iterated — the fragment looks truncated
# relative to the paged download() variants above. TODO confirm.
for i in x['message']['items']:
    # get the title and remove any special characters, keep spaces
    # title = re.sub('[^A-Za-z0-9]+', '', str(x['message']['items'][count]['title'][0])[:85])
    title = re.sub('[^A-Za-z0-9]+', ' ', (i.get('title')[0]))
    filename_xml = "E:exported_data\\" + title + ".xml"