Example #1
import re

import requests
from habanero import Crossref


def download(save_folder, issn, date_start, date_end,
             api_key=None, tdm_token=None,
             arbitrary_xml_dl=False, filewriter=None):

    cr = Crossref()

    x = cr.journals(ids=issn, works=True,
                    filter={'has_full_text': True, 'type': 'journal-article',
                            'from-pub-date': date_start, 'until-pub-date': date_end},
                    sort='issued',
                    cursor='*',
                    # mailto='*****@*****.**',
                    limit=20)

    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_unspecified = 0
    total_downloaded = 0
    print('total results >>>>>>>>> ', total_results)

    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print('\tArticle {0} out of {1}'.format(articles_count, total_results))

            # titles are sometimes too long, so strip special characters and cut to 85 chars
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            filename_xml = save_folder + title
            response_data = i['link']
            has_xml = False
            xml_links = set()

            if arbitrary_xml_dl:
                # blindly take the first advertised link, whatever its content type
                xml_links.add(response_data[0]['URL'])

            else:
                for d in response_data:
                    content_type = d['content-type']
                    intended = d['intended-application']
                    link = d['URL']
                    if 'xml' in content_type or 'html' in content_type or 'xml' in link or 'html' in link:
                        total_with_xmls += 1
                        xml_links.add(link)  # add download link to the set
                        has_xml = True
                    if 'pdf' in str(content_type):
                        total_with_pdfs += 1
                    if 'unspecified' in str(content_type) and not has_xml and intended == 'text-mining':
                        # an unspecified content type is often HTML behind a text-mining link
                        total_unspecified += 1
            if not has_xml:
                print("\t\twarning, no xml data for '{}', trying to save it as html".format(title))

            # download files from the set built above; stop after the first success
            if len(xml_links) > 0:
                downloaded = False

                for xml in xml_links:
                    if downloaded:
                        break

                    # for LISR an API key is required to get the full text;
                    # api_key is expected to contain the query fragment, e.g. '&apiKey=...'
                    if api_key is not None:
                        response_xml = requests.get(xml + api_key)
                    elif tdm_token is not None:
                        # the Crossref TDM click-through token is sent as a request header
                        headers = {'CR-Clickthrough-Client-Token': tdm_token}
                        response_xml = requests.get(xml, headers=headers)
                    else:
                        # for jdoc the full text is available without a key
                        response_xml = requests.get(xml)

                    if response_xml.ok:
                        # add the extension to the file name (location + filename + ext)
                        if has_xml:
                            filename_xml += '.xml'
                        else:
                            filename_xml += '.html'
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        total_downloaded += 1
                    else:
                        # log the failing link for later inspection
                        print(response_xml.status_code)
                        if filewriter is not None:
                            filewriter.write(xml + "\n")
        count += 1
        # cr.journals was called with limit=20, so the api returns results in
        # batches of 20; increase count to reach the next batch in x[count]['message']['items']

    print("Extraction Completed, total={}".format(total_results))
    print("Total with pdf={}, xml={}, unspecified={}, downloaded={}".format(
        total_with_pdfs, total_with_xmls, total_unspecified, total_downloaded))

    return total_results, total_with_pdfs, total_with_xmls, total_downloaded, total_unspecified
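
# a minimal usage sketch (hypothetical save folder, date range, and log file;
# JDOC's ISSN 0022-0418 is taken from Example #8 below):
if __name__ == '__main__':
    with open('failed_links.txt', 'w') as links_log:
        totals = download('E:exported_data\\', '0022-0418',
                          '2008-01-01', '2018-01-01',
                          filewriter=links_log)
        print(totals)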
Example #2
from habanero import Crossref
from graph_tool.all import *

cr = Crossref(mailto = "*****@*****.**")

n = 50
journal_ISSN = "1471-2105"
journal_papers = cr.journals(ids = journal_ISSN, works = True, limit = n)

def print_paper_info(journal_papers):
    for paper in journal_papers["message"]["items"]:
        print("Title: " + paper["title"][0])
        print("DOI: " + paper["DOI"])
        first_author = None
        for author in paper.get("author", []):
            if author["sequence"] == "first":
                first_author = author["given"] + " " + author["family"]
                break
        # some records have no author marked 'first' (or no author field at all)
        print("First Author: " + (first_author or "unknown"))

# build a star graph: one root vertex for the journal, one vertex per paper
journal_graph = Graph()
vertex_id = journal_graph.new_vertex_property("string")

journal_root = journal_graph.add_vertex()
vertex_id[journal_root] = journal_ISSN

for paper in journal_papers["message"]["items"]:
    paper_vertex = journal_graph.add_vertex()
    vertex_id[paper_vertex] = paper["DOI"]
    journal_graph.add_edge(paper_vertex, journal_root)
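
# usage: print the metadata and render the star graph;
# 'journal_graph.png' is an arbitrary output name
print_paper_info(journal_papers)
graph_draw(journal_graph, vertex_text=vertex_id, output="journal_graph.png")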
Example #3
import re

import requests
from habanero import Crossref


def download(save_folder, issn, date_start, date_end):
    cr = Crossref()

    x = cr.journals(
        ids=issn,
        works=True,
        filter={
            'has_full_text': True,
            'type': 'journal-article',
            'from-pub-date': date_start,
            'until-pub-date': date_end
        },
        sort='issued',
        cursor='*',
        #mailto='*****@*****.**',
        limit=20)

    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_xml_accessible = 0
    print('total results >>>>>>>>> ', total_results)

    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print("Article ", articles_count, " out of ", total_results)

            # get the title, strip special characters (keep spaces), and cut to 85 chars
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            filename_xml = save_folder + title + '.xml'
            response_data = i['link']
            pdf_counted = False
            xml_counted = False
            xml_links = set()
            for d in response_data:
                content_type = d['content-type']
                if 'pdf' in content_type and not pdf_counted:
                    total_with_pdfs += 1
                    pdf_counted = True
                if 'xml' in content_type or 'html' in content_type:  # or 'unspecified' in content_type:
                    response_xml = d['URL']
                    xml_links.add(response_xml)
                    if not xml_counted:
                        total_with_xmls += 1
                        xml_counted = True

            # save the content from the first working xml link into a .xml file
            if len(xml_links) > 0:
                downloaded = False

                for xml in xml_links:
                    if downloaded:
                        break

                    response_xml = requests.get(xml)
                    if response_xml.ok:
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        total_xml_accessible += 1
        count += 1
    print("Extraction Completed, total={}".format(total_results))
    print("Total with pdf={}, xml={}, downloadable xml={}".format(
        total_with_pdfs, total_with_xmls, total_xml_accessible))

    return total_results, total_with_pdfs, total_with_xmls, total_xml_accessible
Example #4
import sqlite3

from habanero import Crossref
from requests.exceptions import HTTPError

conn = sqlite3.connect("../ezproxy-DOI.db")
sqlite_cursor = conn.cursor()

cr = Crossref()

sqlite_cursor.execute("SELECT doi FROM ezproxy_doi WHERE doi IS NOT NULL")
DOIs = sqlite_cursor.fetchall()

for item in DOIs:
	DOI = item[0]
	print("running..." + DOI)
	try:
		# look up the journal record via the work's ISSN
		journals = cr.journals(ids=cr.works(ids=DOI)["message"]["ISSN"])
		if "message" in journals:
			pass
		elif isinstance(journals, list):
			# a work with several ISSNs yields a list of journal records; keep the first
			journals = journals[0]
		else:
			continue
		print("Journal Title: " + journals["message"]["title"])
		print("Subjects: " + str(journals["message"]["subjects"]))
	except HTTPError:
		print("HTTPError")
		continue
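
# a minimal batched sketch, assuming habanero's documented behavior that
# cr.works(ids=[...]) returns one response dict per DOI in a list;
# this halves the per-DOI round-trips of the loop above
batch = [item[0] for item in DOIs[:20]]
try:
	for work in cr.works(ids=batch):
		print(work["message"]["DOI"], "ISSN:", work["message"].get("ISSN"))
except HTTPError:
	print("HTTPError on batch lookup")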
Example #5
import re

import requests
from habanero import Crossref


def download(save_folder, issn, date_start, date_end):
    cr = Crossref()

    x = cr.journals(
        ids=issn,
        works=True,
        filter={
            'has_full_text': True,
            'type': 'journal-article',
            'from-pub-date': date_start,
            'until-pub-date': date_end
        },
        sort='issued',
        cursor='*',
        #mailto='*****@*****.**',
        limit=20)

    count = 0
    articles_count = 0
    total_results = x[0]['message'].get('total-results')
    total_with_pdfs = 0
    total_with_xmls = 0
    total_unspecified = 0
    total_xml_accessible = 0
    print('total results >>>>>>>>> ', total_results)

    while articles_count < total_results:
        for i in x[count]['message']['items']:
            articles_count += 1
            print('\nArticle {0} out of {1}'.format(articles_count,
                                                    total_results))

            # titles are sometimes too long, so strip special characters and cut to 85 chars
            title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])[:85]
            filename_xml = save_folder + title
            response_data = i['link']
            xml_links = set()

            for d in response_data:
                content_type = d['content-type']
                if 'xml' in content_type:
                    print('found xml file <<<<<<<<< {}'.format(d))
                    total_with_xmls += 1
                    xml_links.add(d['URL'])  # add download link to the set
                if 'pdf' in str(content_type):
                    print('found pdf file <<<<<<<<< {}'.format(d))
                    total_with_pdfs += 1
                if 'unspecified' in str(content_type):
                    print('found unspecified type | POSSIBLE HTML <<<<<<<<< {}'.format(d))
                    total_unspecified += 1

            # download files from the set built above; stop after the first success
            if len(xml_links) > 0:
                downloaded = False

                for xml in xml_links:
                    print('----------processing xml link: {}'.format(xml))
                    if downloaded:
                        break

                    # for LISR an API key is required to get the full text,
                    # e.g. requests.get(xml + '&apiKey=...'); for jdoc it works without the key
                    response_xml = requests.get(xml)

                    if response_xml.ok:
                        filename_xml += '.xml'  # add the extension (location + filename + ext)
                        with open(filename_xml, 'wb+') as file:
                            file.write(response_xml.content)
                        downloaded = True
                        total_xml_accessible += 1
        count += 1
        # cr.journals was called with limit=20, so the api returns results in
        # batches of 20; increase count to reach the next batch in x[count]['message']['items']

    print("Extraction Completed, total={}".format(total_results))
    print("Total with pdf={}, xml={}, downloadable xml={}".format(
        total_with_pdfs, total_with_xmls, total_xml_accessible))

    return total_results, total_with_pdfs, total_with_xmls, total_xml_accessible
Example #6
from habanero import Crossref


def get_journal_papers(ISSN):
	cr = Crossref(mailto = "*****@*****.**")
	# return the journal's works for the given ISSN
	return cr.journals(ids=ISSN, works=True)
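
# quick usage check (ISSN taken from Example #2):
papers = get_journal_papers("1471-2105")
print(papers["message"]["total-results"])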
Example #7
from habanero import Crossref

cr = Crossref()

# previous run over LISR (ISSN 0740-8188):
# x = cr.journals(ids="0740-8188", works=True,
#                 filter={'has_full_text': True, 'type': 'journal-article',
#                         'from-pub-date': '2008-01-01', 'until-pub-date': '2018-01-01'},
#                 sort='issued',
#                 cursor='*',
#                 cursor_max=1000,
#                 limit=200)
# details of a doi
# y = cn.content_negotiation(ids="10.1002/asi.24193")
# DONE SO FAR: 2008-01-01 -> 2010-01-01, 2010-01-01 -> 2012-01-01,
#              2012-01-01 -> 2014-01-01, 2014-01-01 -> 2016-01-01, 2016-01-01 -> 2018-01-01

x = cr.journals(ids="2330-1643",
                works=True,
                filter={
                    'has_full_text': True,
                    'type': 'journal-article',
                    'from-pub-date': '2016-01-01',
                    'until-pub-date': '2019-01-01'
                },
                sort='issued',
                cursor='*',
                limit=20)

count = 0
articles_count = 0
total_results = x[0]['message'].get('total-results')
print('total results >>>>>>>>> ', total_results)

while articles_count < total_results:
    for i in x[count]['message']['items']:
        # get the title and remove any special characters, keep spaces
        # (per-article processing continues as in the download() examples above)
        articles_count += 1
    count += 1
Example #8
import os.path
import re

import requests
from habanero import Crossref

cr = Crossref()

# ISSN for JASIS -> issn=2330-1643
# ISSN for JDOC -> issn=0022-0418
# ISSN for LISR -> issn=0740-8188
x = cr.journals(ids="0740-8188",
                works=True,
                filter={
                    'has_full_text': True,
                    'type': 'journal-article',
                    'from-pub-date': '2008-01-01',
                    'until-pub-date': '2008-03-03'
                },
                sort='issued',
                cursor='*',
                limit=20)
# details of a doi
# y = cn.content_negotiation(ids="10.1002/asi.24193")
count = 0

for i in x[0]['message']['items']:

    # get the title and remove any special characters, keep spaces
    title = re.sub('[^A-Za-z0-9]+', ' ', i.get('title')[0])
    filename_xml = "E:exported_data\\" + title + ".xml"
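
    # a hedged continuation, mirroring the download() examples above
    # (assumes the article advertises at least one full-text link):
    response_xml = requests.get(i['link'][0]['URL'])
    if response_xml.ok:
        with open(filename_xml, 'wb+') as file:
            file.write(response_xml.content)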