예제 #1
0
def crossref_api(login_data, sql_dataframe, e_mail, chunk_size):
    """Look up every unmatched reference row on Crossref and persist results.

    Rows whose match_id is "not_match" or "error" are queried against the
    Crossref works API in chunks of `chunk_size`; each chunk is handed to
    crossref_to_db for storage.

    :param login_data: DB credentials forwarded to crossref_to_db
    :param sql_dataframe: pandas DataFrame with "match_id" and "ref_text" columns
    :param e_mail: contact address (puts us in Crossref's polite pool)
    :param chunk_size: number of rows to query per DB write
    """
    pd.options.mode.chained_assignment = None
    match_info = sql_dataframe
    sampledata = match_info[(match_info["match_id"] == "not_match") |
                            (match_info["match_id"] == "error")]
    sampledata["crossref"] = np.nan
    cr = Crossref(mailto=e_mail)
    # De-duplicated field list (the original repeated DOI, title and issued).
    select_fields = ("DOI,title,issued,short-container-title,ISSN,score,URL,"
                     "page,publisher,container-title,author,volume")

    i = 0
    while i < len(sampledata):
        dict_cross = []

        for index, row in sampledata[i:i + chunk_size].iterrows():
            tempdata = []
            # .ix was removed in pandas 1.0; .loc is the supported accessor.
            reftext = sampledata.loc[index]["ref_text"]
            try:
                x = cr.works(query=reftext, limit=1, select=select_fields)
                tempdata.append(row[0])
                tempdata.append(x["message"]["items"])
            except Exception:
                # Best-effort: record NaN for a failed lookup and keep going
                # (narrowed from a bare except, which also caught Ctrl-C).
                tempdata.append(row[0])
                tempdata.append(np.nan)
                print("error" + str(index))
            dict_cross.append(tempdata)
        crossref_to_db(login_data, dict_cross)
        i += chunk_size
예제 #2
0
def fetch_doi_from_crossref(item):
    """Find the DOI (and matched title) on Crossref for a paper dict.

    :param item: dict with a "title" key and, optionally, "authors"
    :return: (doi, title) of the best Crossref match
    """
    cr = Crossref()

    # Build the query from the title, adding authors when available.
    try:
        query = '"' + item["title"] + '"'\
        + " " + flatten(item["authors"])
    except TypeError:
        # No author information available (probably)
        query = '"' + item["title"] + '"'

    print(STD_INFO + query)
    server_reached = False
    while not server_reached:
        try:
            query_result = cr.works(query=query, limit=3)
            server_reached = True
        except Exception:
            # Typically HTTPError (Service Unavailable). Narrowed from a
            # bare except, which also swallowed KeyboardInterrupt and
            # made the retry loop impossible to interrupt.
            print(STD_WARNING +
                  "CrossRef server unavailable. Retry in 5 seconds")
            time.sleep(5)

    try:
        title = query_result['message']['items'][0]['title'][0]
    except KeyError:
        title = 'None'

    # NOTE(review): raises IndexError when Crossref returns zero items —
    # confirm callers expect that (kept to preserve the original contract).
    doi = query_result['message']['items'][0]['DOI']
    return doi, title
예제 #3
0
def names_from_xref(doi):
    """
    Get the first names of the first and last authors for a given DOI.

    Inputs
    ------
    doi : string
        The DOI of the paper whose first and last author names you want to know. Here, it's usually a citing paper.

    Outputs
    -------
    first_author : string
        The first name of the first author of the given paper, or "" if unknown.

    last_author : string
        The first name of the last author of the given paper, or "" if unknown.
    """
    cr = Crossref()
    title = ""
    # Bug fix: default both names up front so a DOI with zero Crossref
    # results no longer raises UnboundLocalError at the return statement.
    first_author = ""
    last_author = ""
    works = cr.works(
        query=title, select=["DOI", "author"], limit=1, filter={"doi": doi}
    )
    if works["message"]["total-results"] > 0:
        item = works["message"]["items"][0]
        if "author" in item:
            first_author = get_name_from_author_dict(item["author"][0])
            last_author = get_name_from_author_dict(item["author"][-1])
    return first_author, last_author
예제 #4
0
def get_metadata_from_title(title):
    """
    Fetch the metadata for a document given its title and return a crossref item dict.
    :param title: title of document
    :return: crossref item dictionary
    """
    cr = Crossref()
    response = cr.works(query=title)
    candidates = response['message']['items'][:N_TOP_ITEMS]

    # Rank the candidates by how closely their title matches the query;
    # sorted()[-1] keeps the original tie-breaking behaviour (last of maxima).
    def similarity(candidate):
        return SequenceMatcher(None, candidate['title'][0], title).ratio()

    best = sorted(candidates, key=similarity)[-1]

    # Attempt to find an abstract for the winning item.
    best['abstract'] = get_abstract(best)

    return best
예제 #5
0
def CrossRefAPIfunc(keyPhrase):
    """Query Crossref for a key phrase and return the top hit's details.

    :param keyPhrase: free-text search string
    :return: (url, title, authors, date) tuple; all None when no results
    """
    cr = Crossref()
    x = cr.works(query=keyPhrase)

    if x["message"]["total-results"] > 0:
        x = x['message']["items"][0]
        date = x["indexed"]["date-parts"][0]
        #referenceCount = x["is-referenced-by-count"]
        title = x["title"][0]

        if "author" in x:
            # NOTE(review): given + family are concatenated without a
            # separator ("JaneDoe") — confirm whether a space is wanted.
            authors = x["author"][0]["given"] + x["author"][0]["family"]
            if len(x["author"]) > 1:
                authors += " et al."
        else:
            authors = None
        url = x["URL"]
        #score = x["score"]
    else:
        date = None
        #referenceCount = None
        title = None
        # Bug fix: was `author = None`, leaving `authors` unbound and
        # raising NameError at the return below on zero results.
        authors = None
        url = None
        #score = -1
    return (url, title, authors, date)
예제 #6
0
def lookup_data(
    doi: str = None,
    in_wikipedia: bool = False,
):  # -> Dict[str, str]:
    """Lookup data and return Dict"""
    # https://www.crossref.org/education/retrieve-metadata/rest-api/
    # async client here https://github.com/izihawa/aiocrossref but only 1 contributor
    # https://github.com/sckott/habanero >6 contributors not async
    if doi is None:
        print("Error. Got None instead of DOI. Report this error please.")
        return

    print("Looking up from Crossref")
    client = Crossref()
    response = client.works(ids=doi)
    message = response["message"]
    # Books are excluded from processing for now.
    if message["type"] == "book":
        print("Book detected, we exclude those for now.")
        return None
    data = extract_data(message, in_wikipedia)
    print(data)
    if data.get("publisher") and data.get("publisher_location"):
        # TODO look up publisher via sparqldataframe
        print("Found both publisher and location")
예제 #7
0
    def __init__(self, doi):
        """Fetch Crossref metadata for `doi` and assemble a BibTeX-oriented dict.

        NOTE(review): when the lookup fails, self.metadata is never set, so
        later attribute access raises AttributeError — confirm callers
        handle that.
        """
        cr = Crossref()
        try:
            message = cr.works(doi)['message']
        except Exception:
            # Narrowed from a bare except: still best-effort, but no longer
            # swallows KeyboardInterrupt/SystemExit.
            message = None

        if message:
            metadata = {}

            metadata['doi'] = doi

            # Flatten the journal's name/value assertion pairs into the dict.
            journal_info = {
                x['name']: x['value']
                for x in message['assertion']
            }
            metadata.update(journal_info)

            # BibTeX convention: '--' for page ranges.
            metadata['page'] = message['page'].replace('-', '--')
            metadata['volume'] = message['volume']
            metadata['author'] = ' and '.join(
                [f"{x['given']} {x['family']}" for x in message['author']])

            # The license start date stands in for the publication date.
            ts = message['license'][0]['start']['date-time']
            metadata['date'] = datetime.strptime(ts,
                                                 '%Y-%m-%dT%H:%M:%SZ').date()

            # Citation key: first author's family name + year.
            metadata[
                'article_name'] = f"{message['author'][0]['family']}_{metadata['date'].year}"

            self.metadata = metadata
예제 #8
0
def pre_save_article(sender, instance, **kwargs):
    """pre_save signal handler: populate article fields from Crossref.

    Bug fix: the DOI is now stripped of surrounding whitespace *before* it
    is used for the lookup; the original stripped it only after the query
    had already been issued with the raw value.
    """
    instance.DOI = instance.DOI.strip()
    cr = Crossref()
    article_meta = cr.works(ids=instance.DOI)
    instance.title = get_title(article_meta)
    instance.description = get_description(article_meta)
    instance.keywords = get_keywords(article_meta)
    instance.article_url = get_url(article_meta)
예제 #9
0
def get_papers(issn=ISSN,
               offset=0,
               per_page=PER_PAGE,
               username=HABANERO_USERNAME):
    """Fetch one page of works for the given journal ISSN from Crossref."""
    client = Crossref(mailto=username)
    page = client.works(filter={"issn": issn},
                        offset=offset,
                        limit=per_page)
    return page
def doi_valid(value):
    """Validator: raise ValidationError unless `value` resolves on Crossref."""
    try:
        # mailto is necessary to end up in Crossref's polite pool.
        client = Crossref(mailto="*****@*****.**")
        record = client.works(ids=value)
        record['message']['title']  # probe: a bad DOI/record raises here
    except Exception as e:
        raise ValidationError(f"Invalid DOI: {e}")
예제 #11
0
    def __init__(self):
        """Create the SciHub and Crossref clients and the retry budget."""
        # Attempt budget — semantics defined by the methods that read it.
        self.TRIALS = 3

        # Client handles used for downloads and metadata lookups.
        self.sh = SciHub()
        self.cr = Crossref()
예제 #12
0
 def __init__(self, doi=None, parms=None):
     """Load Crossref data for a single DOI or for a bulk filter query."""
     client = Crossref()
     if doi:
         # Single-record mode: 'message' holds the work itself.
         self.raw_data = client.works(ids=doi, format="json")
         self.record = self.raw_data['message']
     elif parms:
         # Bulk mode: deep-page with a cursor; records are filled in later.
         self.raw_data = client.works(filter=parms, cursor="*", limit=500, format="json")
         self.record = None
         self.record_list = []
예제 #13
0
파일: litdb.py 프로젝트: scotthartley/litdb
def get_doi(dois, config):
    """Retrieve an article by doi.
    """
    client = Crossref(mailto=config['settings']['email'])
    result = client.works(ids=dois)
    if len(dois) == 1:
        # habanero returns a single response dict for one id...
        return DB_dict.parse_cr([result['message']])
    # ...and a list of response dicts for several.
    return DB_dict.parse_cr([entry['message'] for entry in result])
예제 #14
0
def title2doi(title):
    """Return the DOI whose Crossref title equals `title` after stripping
    case and non-alphanumeric characters, or None when nothing matches."""
    lowered = title.lower()
    wanted = ''.join(ch for ch in lowered if ch.isalnum())
    client = Crossref()
    # Query with the lowercased title, exactly as before.
    hits = client.works(query_title=lowered, select="title,DOI", limit=5)
    for hit in hits['message']['items']:
        candidate = ''.join(ch for ch in hit['title'][0].lower() if ch.isalnum())
        if candidate == wanted:
            return hit['DOI']
예제 #15
0
def crossref(doi):
    """Fetch and parse Crossref metadata for one DOI or a collection of DOIs."""
    client = Crossref(mailto="*****@*****.**")
    response = client.works(ids=doi)
    # habanero returns a list when given a list of len > 1, otherwise one dict.
    multiple = isinstance(doi, (list, tuple, set)) and len(doi) > 1
    if not multiple:
        return parse_crossref(response)
    parsed = (parse_crossref(item) for item in response)
    return {entry.pop("doi"): entry for entry in parsed}
예제 #16
0
def get_crossref_metadata(title, path):
    """
    Gets Crossref metadata, given an article's title. Then puts the metadata on the clipboard
    :param title: Title to search for
    :param path: PDF-Path, not necessary

    NOTE(review): Python 2 code (print statements, unicode()); uses win32
    clipboard bindings (clp) and module-level format ids citation_format /
    src_format defined elsewhere in the file.
    """

    print "getting crossref"

    # Searches the Crossref API for the given title, gets best result
    cr = Crossref()
    query = cr.works(query=title, limit=1)

    doi = ''

    # Extract DOI out of Crossref answer
    # (limit=1, so the loop simply takes the single item's DOI)
    for item in query['message']['items']:
        doi = item['DOI']

    # Not used, but useful. Gets metadata from isbnlib, given DOI
    # print isbnlib.doi2tex(doi)

    # Gets APA citation, given DOI
    apa_citation = cn.content_negotiation(ids=doi, format="text", style="apa")

    # We could get more formats this way, but this is not used at the moment, better performance without getting these formats
    # rdf_citation = cn.content_negotiation(ids=doi, format="rdf-xml")
    # json_citation = cn.content_negotiation(ids=doi, format="citeproc-json")
    # bib_entry = cn.content_negotiation(ids=doi, format="bibentry")

    # Prettify APA citation
    apa_citation = prettify_UTF8_Strings(apa_citation).strip('\n')
    print apa_citation

    clp.OpenClipboard(None)
    citations = {}
    citations['APA'] = apa_citation
    # Preserve whatever text is currently on the clipboard alongside the
    # citation; falls back to a placeholder when no text is available.
    try:
        citations['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT),
                                       errors='replace')
    except:
        citations['content'] = 'no text content available'
    # Puts the citations on the clipboard
    clp.SetClipboardData(citation_format, json.dumps(citations))

    sources = {}
    sources['source'] = path
    try:
        sources['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT),
                                     errors='replace')
    except:
        sources['content'] = 'no text content available'
    # Puts the sources on the clipboard
    clp.SetClipboardData(src_format, json.dumps(sources))
    clp.CloseClipboard()
예제 #17
0
def title_from_DOI(DOI):
    """Return the first title string Crossref records for `DOI`, or None."""
    cr = Crossref(mailto=crossref_email)
    try:
        response = cr.works(ids=DOI)
    except HTTPError:
        return None
    # Collapse the original nested ifs into one short-circuit condition.
    if ("message" in response and "title" in response["message"]
            and response["message"]["title"]):
        return response["message"]["title"][0]
    return None
예제 #18
0
파일: litdb.py 프로젝트: scotthartley/litdb
def update_from_cr(config):
    """Retrieve records from Crossref.
    """
    settings = config['settings']
    client = Crossref(mailto=settings['email'])
    query_filter = {
        'orcid': list(config['authors']),
        'type': [settings['article_type']],
    }
    response = client.works(filter=query_filter,
                            sort=settings['sort_field'],
                            order=settings['order'],
                            limit=settings['num_records'])
    return DB_dict.parse_cr(response['message']['items'])
예제 #19
0
def enrich_from_crossref(bib, email):
    """Enrich bibliography entries with data from exact-title Crossref matches.

    :param bib: bibliography object exposing an `entries` list of dicts
    :param email: contact address for Crossref's polite pool (may be falsy)
    """
    # Bug fix: the original constructed `Crossref(mailto=email)` but threw
    # the instance away, so the polite-pool mailto was never actually used.
    cr = Crossref(mailto=email) if email else Crossref()
    okcount = 0
    print("entries where no exact matching entry could be found on Crossref:")
    for entry in bib.entries:
        res = cr.works(query_bibliographic=entry["title"])
        item = get_matching_item(entry, res['message']['items'])
        if item:
            okcount += 1
            enrich_entry(entry, item)
    print(okcount, "of", len(bib.entries), "had matching titles")
예제 #20
0
def make_references(publications, output_dir):
    """
    Create reference bib file
    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []

    for i, publication in enumerate(publications):
        log.notice(
            f"Querying and formatting {i + 1} out of {len(publications)} publications"
        )
        link = publication[LINK]
        title = publication[TITLE]

        # Check if it is a DOI url
        if link and "doi.org" in link:
            doi = urlparse(link).path.strip("/")

        # Extract the DOI using the title
        else:
            # Accept the top hit only when its title matches exactly
            # (case-insensitive); otherwise skip this publication.
            # NOTE(review): items[0] without a "title" key would raise
            # KeyError here — confirm upstream data always has one.
            results = cr.works(query_bibliographic=title, limit=1)
            if (results["message"]["total-results"] == 0
                    or results["message"]["items"][0]["title"][0].lower() !=
                    title.lower()):
                log.warn(f'Could not find the doi for "{title}"')

                continue

            doi = results["message"]["items"][0]["DOI"]

        try:
            # content_negotiation yields a citation string; the regex strips
            # the leading "@type{" to recover the bare reference identifier.
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(
                re.sub("^@.*{", "",
                       reference.split("\n")[0]).strip(","))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')

    with open(os.path.join(output_dir, "references.bib"), "w") as f:
        f.write("\n\n".join(lines))

    return references
예제 #21
0
파일: metadata.py 프로젝트: srbhp/piepdf
 def __init__(self, email=""):
     self.mailto = email
     self.arXivApi = "http://export.arxiv.org/api/query?id_list="
     self.crossrefApi = Crossref(mailto=self.mailto)
     self.regString = r"\b(10\.[0-9]{4,}(?:\.[0-9]+)*\/(?:(?![\"&\'])\S)+)\b"
     self.metadata = {
         "doi": "",
         "url": "",
         "year": "",
         "journal": "",
         "author": "",
         "title": "",
         "abstract": "",
     }
     self.page0_text = ""
예제 #22
0
def get_crossref_results(query, index=10):
    """Return up to `index` journal-article results for a bibliographic query.

    :param query: free-text bibliographic search string
    :param index: maximum number of items to return (the API request itself
        is capped at 10 by `limit`)
    """
    cr = Crossref()
    filters = {"type": "journal-article"}
    limit = 10
    sort = "score"
    order = "desc"
    # Bug fix: habanero's works() parameter is `filter`; the original passed
    # `filters=`, which fell into **kwargs and was ignored by the API, so
    # the journal-article restriction never applied.
    results = cr.works(
        query_bibliographic=query,
        filter=filters,
        limit=limit,
        sort=sort,
        order=order,
    )
    results = results["message"]["items"]
    return results[:index]
예제 #23
0
def build_user_graph(graph, users, spinner, cursor):
    """Insert every user into the graph, tracking progress on the spinner."""
    global crossref_email
    global vertex_dict
    global sqlite_cursor
    sqlite_cursor = cursor

    # Fresh vertex caches, one per vertex category.
    vertex_dict = {kind: {} for kind in
                   ("paper", "journal", "subject", "author", "user")}

    total = len(users)

    spinner.start()
    cr = Crossref(mailto=crossref_email)

    # enumerate(start=1) replaces the original manual counter.
    for counter, uni in enumerate(users, start=1):
        process_user(graph, uni, cr, counter, total, spinner)

    spinner.succeed("All users inserted")
예제 #24
0
def query(request, query=None):
	"""Django view: search Crossref and render up to 10 titled results.

	Requires an authenticated user; otherwise returns a JSON error.
	"""
	if request.user.is_authenticated() == True:
		cr = Crossref()
		result = cr.works(query=query)['message']['items']
		response = []
		count = 0
		for r in result:
			if count == 10:
				break
			try: 
				title = r['title'][0]
			except:
				title = None
			if title:#If the document has a title it is added to the response; otherwise it is skipped.
				# Crossref's 'created' timestamp is in milliseconds.
				timestamp = time.gmtime(int(r['created']['timestamp'])/1000)
				r['date'] = time.strftime('%d-%m-%Y', timestamp)
				r['year'] = timestamp.tm_year
				count  = count + 1
				authors = getAuthors(r)
				if(authors):
					r['author'] = authors
				response.append(r)
				#if 'author' in r:
				#	given = r['author'][0]['given'] if 'given' in r['author'][0] else ''
				#	family = r['author'][0]['family'] if 'family' in r['author'][0] else ''
				#if 'page' in r:
				#	page = r['page']
				#if 'created' in r:
				#	timestamp = time.gmtime(int(r['created']['timestamp'])/1000)
				#	date = time.strftime('%Y-%m-%d', timestamp)
				#row = '<li class="crossref-row"'
				#print title
				#if 'author' in r:
				#	row = row + 'author="' + given + ' ' + family + '"'
				#if 'page' in r:
				#	row = row + 'pages="' + page + '"'
				#if 'created' in r:
				#	row = row + 'date="' + date + '"'
				#	print '- year: ', timestamp.tm_year
				#row = row + 'issn="' + r['issn'][0] + '"'
				#row = row + 'url="' + r['url'] + '"'
				#row = row + 'doi="' + r['DOI'] + '"'
				#row = row + '>\n'
				#response = reponse + row
		return render(request, 'crossref/template.html', {'documents': response})
	else:
		return JsonResponse({'error': True, 'message':  _(u'Debe iniciar sesión.')})
예제 #25
0
def make_references(publications, output_dir):
    """
    Create reference bib file
    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []

    for i, publication in enumerate(publications):
        log.notice(f'Querying and formatting {i + 1} out of {len(publications)} publications')
        link = publication[LINK]
        title = publication[TITLE]

        # Check if it is a DOI url
        if link and 'doi.org' in link:
            doi = urlparse(link).path.strip('/')

        # Extract the DOI using the title
        else:
            # Accept the top hit only when its title matches exactly
            # (case-insensitive); otherwise skip this publication.
            # NOTE(review): items[0] without a "title" key would raise
            # KeyError here — confirm upstream data always has one.
            results = cr.works(query_title=title, limit=1)
            if results['message']['total-results'] == 0 or \
                    results['message']['items'][0]['title'][0].lower() != title.lower():
                log.warn(f'Could not find the doi for "{title}"')

                continue

            doi = results['message']['items'][0]['DOI']

        try:
            # content_negotiation yields a citation string; the regex strips
            # the leading "@type{" to recover the bare reference identifier.
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(re.sub('^@.*{', '', reference.split('\n')[0]).strip(','))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')

    with open(os.path.join(output_dir, 'references.bib'), 'w') as f:
        f.write('\n\n'.join(lines))

    return references
예제 #26
0
def async_post_save_article_info(self, doi):
    """Task entry point: backfill abstract and image URL for an article
    that has neither yet."""
    from .models import Article
    from .signals import get_abstract, get_image_url

    article = Article.objects.get(DOI=doi)

    # Nothing to do when either field is already populated.
    if article.image_url or article.abstract:
        return

    meta = Crossref().works(ids=article.DOI)
    article.abstract = get_abstract(meta)
    article.image_url = get_image_url(meta)

    # Discard anything that is not a valid URL.
    if not validators.url(article.image_url):
        article.image_url = str()

    article.save()
예제 #27
0
def my_view(request):
    """View: resolve the references of the DOI given in the query string
    into [SciHub URL, title/author text, year] triples for the template."""
    references = []
    if 'DOI' in request.GET:
        doi = request.GET['DOI']
        cr = Crossref()
        logging.info(request.GET)
        # try:
        x = cr.works(doi)
        if 'message' in x and 'reference' in x['message']:
            if 'title' in x['message']:
                logging.info(u"Evaluating references for {}".format(
                    x['message']['title']))
            # Look each cited DOI up to get a readable title/author line.
            for r in x['message']['reference']:
                if 'DOI' not in r:
                    continue

                title = ""
                year = ""

                try:
                    y = cr.works(u'{}'.format(r['DOI']))
                    logging.info(r['DOI'])

                    if 'message' in y and 'title' in y['message']:
                        title = u"{}".format(u''.join(y['message']['title']))
                    if 'message' in y and 'author' in y['message']:
                        title += u"\n" + u", ".join(
                            [a['family'] for a in y['message']['author']])
                    if 'message' in y and 'issued' in y['message']:
                        year = y['message']['issued']['date-parts'][0][0]
                except:
                    # Best-effort: fall back to the unstructured citation
                    # text when the per-reference lookup fails.
                    if 'unstructured' in r:
                        title = u"{}".format(r['unstructured'])

                logging.info(title)
                references.append([
                    u"{}/{}".format(SCIHUB_URL, r['DOI']),
                    title,
                    ##authors,
                    year
                ])
        # except:
        #     pass
    logging.info(references)
    return {'refs': references}
예제 #28
0
 def __init__(self, rsrcmgr, pageno=1, laparams: LAParams = None):
     """PDF page aggregator that additionally tracks extracted rows, the
     current page number, a DOI slot and a Crossref client."""
     PDFPageAggregator.__init__(self,
                                rsrcmgr,
                                pageno=pageno,
                                laparams=laparams)
     self.doi = None
     self.page_number = 0
     self.rows = []
     self.cr = Crossref(mailto='*****@*****.**')
예제 #29
0
def titletodoi(keyword):
    """Return the DOI of the top Crossref hit when its title equals `keyword`
    (ignoring case, whitespace and non-word characters), else None."""
    cr = Crossref()
    result = cr.works(query=keyword)
    items = result['message']['items']
    fetched = ''.join(items[0]['title'])
    # Normalise both strings identically before comparing.
    wanted = re.sub(r'\W', '', keyword.replace(' ', '').lower())
    fetched = re.sub(r'\W', '', fetched.replace(' ', '').lower())
    if wanted == fetched:
        return items[0]['DOI']
    return None
def build_network_graph(graph, DOIs):
	"""Insert each DOI's paper into the citation network graph."""
	global spinner

	spinner.start()
	client = Crossref(mailto = "*****@*****.**")
	for identifier in DOIs:
		process_paper(graph, identifier, client)
	spinner.stop()

	print("Network built.")
예제 #31
0
    def get_crossref(doc_id='', query=''):
        """Return parsed metadata from the Crossref API, or None.

        Tries a DOI lookup first (doc_id), then falls back to a free-text
        query; any failure yields None (deliberate best-effort helper).
        """
        try:
            parser = CrossRefPaperParser()
            cr = Crossref()
            doi = doc_id
            if doi:
                try:
                    entry = cr.works(ids=[doi]).get('message')
                    return parser.parse(entry)
                except HTTPError:
                    pass
            # Bug fix: the items live under response['message']['items'];
            # the original read response.get('items'), which is always None,
            # so the query fallback never returned anything.
            message = cr.works(query=query, limit=1).get('message') or {}
            entries = message.get('items')
            if entries:
                return parser.parse(entries[0])
        except Exception:
            # Narrowed from a bare except; still swallows lookup/parse
            # errors on purpose.
            pass

        return None
def build_author_graph(graph, DOIs):
	"""Insert each DOI's paper into the co-author network graph."""
	global vertex_dict
	global spinner

	spinner.start()

	client = Crossref(mailto = "*****@*****.**")
	for identifier in DOIs:
		process_author_paper(graph, identifier, client)

	spinner.succeed("Author network built.")
예제 #33
0
def query_crossref(title, author):
    """Query Crossref for extracted data
    
    Args:
        title (str): The title of the paper
        author (List(str)): A list of the authors of the paper
    
    Raises:
        ExtractionError: No suitable search criteria extracted
        ExtractionError: No suitable Crossref candidates
        ExtractionError: Crossref returned an error
    
    Returns:
        str: A BibTeX entry for the queried data
    """
    # Search for the paper on Crossref
    cr = Crossref(mailto="*****@*****.**")
    if author and title:
        r = cr.works(query=title + " " + author[0])
    elif title:
        r = cr.works(query=title)
    else:
        raise ExtractionError("No suitable search criteria extracted")
    # Dump the raw response for offline inspection. Bug fix: the original
    # passed file=open(...) inline and leaked the open handle; 'with'
    # guarantees it is closed.
    with open("cn.json", "w") as dump_file:
        print(json.dumps(r), file=dump_file)
    if r["status"] != "ok":
        raise ExtractionError("Crossref returned an error")
    for result in r["message"]["items"]:
        if "title" not in result:
            continue
        # If the titles are similar enough, perform content negotiation.
        if SequenceMatcher(None, result["title"][0].upper(),
                           title.upper()).ratio() > 0.9:
            return cn.content_negotiation(ids=result["DOI"],
                                          format="bibentry")
    # Loop exhausted without a close-enough match (replaces the original
    # for/else, which behaved identically since the loop never breaks).
    raise ExtractionError("No suitable Crossref candidates")
예제 #34
0
파일: search.py 프로젝트: sckott/pyminer
def search(ids=None, member=None, filter=None, limit=500, **kwargs):
    '''
    Search Crossref to get text mining links

    :param ids: [Array] DOIs (digital object identifier) or other identifiers
    :param member: [String] member ids (accepted but not forwarded to the
        Crossref client by this wrapper)
    :param filter: [Hash] Filter options. See ...
    :param limit: [Fixnum] Number of results to return. Not relevant when
        searching with specific dois. Default: 500. Max: 1000
    :param kwargs: any additional arguments will be passed on to
        ``requests.get``

    :return: A dictionary, of results

    Usage::

        from pyminer import miner
        miner.search(filter = {'has_full_text': True}, limit = 5)
        miner.search(filter = {'full_text_type': 'text/plain', 'license_url': "http://creativecommons.org/licenses/by-nc-nd/3.0"})
        miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
    '''
    cr = Crossref()
    return Response(cr.works(ids = ids, limit = limit, filter = filter, **kwargs))
예제 #35
0
파일: search.py 프로젝트: sckott/pyminer
def search(ids=None, member=None, filter=None, limit=500, **kwargs):
    '''
    Search Crossref

    :param ids: [Array] DOIs (digital object identifier) or other identifiers
    :param member: [String] member ids (accepted but not forwarded to the
        Crossref client by this wrapper)
    :param filter: [Hash] Filter options. See ...
    :param limit: [Fixnum] Number of results to return. Not relevant when
        searching with specific dois. Default: 500. Max: 1000
    :param kwargs: any additional arguments will be passed on to
        `requests.get`

    :return: A dictionary, of results

    Usage::

        from pyminer import search
        search.search(ids = "10.1371/journal.pone.0000308")
        search.search(filter = {'has_full_text': True})
        search.search(filter = {'full_text_type': 'text/plain'})
    '''
    cr = Crossref()
    return Response(cr.works(ids = ids, limit = limit, filter = filter, **kwargs))
예제 #36
0
def main():
    """
    NAME
      magic_geomagia.py

    DESCRIPTION
       Takes a MagIC file and outputs data for easier input into Max Brown's GEOMAGIA database

    SYNTAX
       magic_geomagia.py [command line options]

    OPTIONS
        -h: prints the help message and quits.
        -f FILE: the MagIC data file name that will be converted to GEOMAGIA files
    
    OUTPUT:
       print to stdout the GEOMAGIA insert command for the reference and all of the site level data 

    EXAMPLE:
        magic_geomagia.py -f magic_contribution_16578.txt

        Nick Jarboe
    """
    if '-h' in sys.argv: # check if help is needed
        print(main.__doc__)
        sys.exit() # graceful quit
    if '-f' in sys.argv:
        ind=sys.argv.index('-f')
        file_name=sys.argv[ind+1]
    else:
        print("MagIC file name needed. Please add the file name after the -f option.")
        # FIX: previously execution continued with file_name undefined and
        # crashed with a NameError below; exit cleanly instead.
        sys.exit()


#   Create all the table files from the magic.txt file so they can be imported by the cb
    command = "download_magic.py -f " + file_name
    os.system(command)

    md = cb.Contribution()  #md stands for magic file data
    md.propagate_location_to_measurements()
    md.propagate_location_to_specimens()
    md.propagate_location_to_samples()
    if not md.tables:
        print('-E- No MagIC tables could be found in this directory')
        error_log("No MagIC tables found")
        return

    # Reference metadata from the contribution table.
    doi=md.tables['contribution'].df.iloc[0]['reference']
    id=md.tables['contribution'].df.iloc[0]['id']
    timestamp=md.tables['contribution'].df.iloc[0]['timestamp']
    contributor=md.tables['contribution'].df.iloc[0]['contributor']
    print("c=",contributor)
    contributor=contributor.replace('@','')
    print("c=",contributor)
   
    # Resolve the DOI via Crossref to build the bibliographic reference.
    cr = Crossref()
    ref=cr.works(doi)
    
#    authors = "Doe J.X., Alexander,T.G."
    status= ref["status"]
    message= ref["message"]
#    print("message=",message)
    authors= message["author"]
#    print("authors=",authors)
    # Build "Family I.J., ..." style author list from Crossref author records.
    authorList=""
    for author in authors:
#        print ("Name:",author['given'], author['family']) 
        author_given=""
        names=author['given'].split(' ')
        for name in names:
            author_given +=name[0]+"."
        authorList += author['family'] + " " + author_given + ", " 
#    print(authorList)
    authorList=authorList[:-2]
#    print(authorList)

    title = message['title'][0]
    year = message['created']['date-parts'][0][0]
#    print(year)
    journal = message['short-container-title'][0]
    volume = message['volume']
#    print(volume)
    pages='0'
    if "page" in message.keys():
        pages = message['page']
#    print(pages)
    url = "https://earthref.org/MagIC/doi/" + doi

    print("REFS") 
    print("Insert into REFS values(NULL,'", authorList, "','", title, "', ", year, ", '", journal, "', ", volume, ", '", pages, "', '", doi, "', '", url, "');", sep='')
    
    print()
    print("ARCHEODIJ") 
    
    sites=md.tables['sites'].df
    locations=md.tables['locations'].df

    print("UID,NUM_SAMPLES,NUM_ACC_SPEC,NUM_MEAS_SPEC,BA,SIGMA_BA,AGE, AGE_MIN,AGE_MAX,NUM_SIGMAS,AGE_ERROR_TYPE_ID,SITE_LAT, SITE_LON,VADM,SIGMA_VADM,SITE_ID,PI_METHODS_ID,AC_ID,MD_CK_ ID,AN_CORR_ID,CR_CORR_ID,DM_METHOD_ID,AF_STEP,T_STEP,DM_ ANALYSIS_ID,SPECIMEN_TYPE_ID,MATERIAL_ID,REFERENCE_ID,NUM_ C14_SAMPLES,C14_ID,CALIB_C14_AGE,CALIB_C14_AGE_SIGMA_MIN, CALIB_C14_AGE_SIGMA_MAX,NUM_C14_SIGMAS,CALC_CALIB_C14_AGE, CALC_CALIB_C14_AGE_SIGMA_MIN,CALC_CALIB_C14_AGE_SIGMA_MAX, C14_CALIB_SOFTWARE_ID,CALC_C14_CALIB_SOFTWARE_ID,C14_CALIB_DATASET_ID,CALC_C14_ CALIB_DATASET_ID,DENDRO_ID,TOT_NUM_DENDRO,NUM_DENDRO_ USED,DATING_METHOD_ID,NUM_DIR_SAMPLES,NUM_DIR_SPECIMENS,NUM_ DIR_SPEC_COLLECTED,DECL,INCL,ALPHA_95,K,VDM,SIGMA_VDM,SAMPLE_ID,c_csv,SITE_NAME, SITE_HORIZON,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014, SUPERSEEDED,UPLOAD_YEAR,UPLOAD_MONTH,UPLOADER,EDITOR,EDIT_DATE,NOTES")

    for index, row in sites.iterrows():
        # -1 is the GEOMAGIA sentinel for "not available".
        int_n_samples,int_n_specimens,int_n_total_specimens,int_abs,int_abs_sigma=-1,-1,-1,-1,-1
        if 'int_n_samples' in sites.columns.values: 
            int_n_samples=row['int_n_samples']
        if 'int_n_specimens' in sites.columns.values: 
            int_n_specimens=row['int_n_specimens']
        if 'int_n_total_specimens' in sites.columns.values: 
            int_n_total_specimens=row['int_n_total_specimens']

        if int_n_specimens == -1 and int_n_samples >0:
            # FIX: was assigned to the misspelled name 'int_n_spcimens',
            # so this fallback never took effect.
            int_n_specimens = int_n_samples

        if 'int_abs' in sites.columns.values: 
            int_abs=row['int_abs']
            if int_abs is not None:
                int_abs=round(int_abs*1e6,1)  # T -> microtesla
        if 'int_abs_sigma' in sites.columns.values: 
            int_abs_sigma=row['int_abs_sigma']
            if int_abs_sigma is not None:
                int_abs_sigma=round(row['int_abs_sigma']*1e6,1)

        age,age_high,age_low=-1e9,-1e9,-1e9
        age_error_type='0'  #  
        
        if 'age_unit' not in sites.columns.values: 
            print("Malformed Magic sites data table. Required column row 'age_unit' is missing")
            sys.exit()
        age_unit=row['age_unit']
        if 'age' in sites.columns.values: 
            age=row['age'] 
            age=pmag.age_to_BP(age,age_unit)
        if 'age_high' in sites.columns.values: 
            age_high=row['age_high'] 
            age_high=pmag.age_to_BP(age_high,age_unit)
        if 'age_low' in sites.columns.values: 
            age_low=row['age_low'] 
            age_low=pmag.age_to_BP(age_low,age_unit)
        if 'age_sigma' in sites.columns.values: 
            age_sigma=row['age_sigma'] 
            age_sigma=pmag.age_to_BP(age_sigma,age_unit)
            age_high=age+age_sigma
            age_low=age-age_sigma
            age_error_type='5'  # MagIC sigma is one-sigma; GEOMAGIA code 5

        if age_low > age_high: # MagIC lets age_high and age_low be in any order. Fix that for GEOMAGIA 
            temp=age_high
            age_high=age_low
            age_low=temp
        if age == -1e9:               # If only age_low and age_high are in the MagIC file then calculate the age.
            age=(age_high+age_low)/2
            age_error_type='8'  #If MagIC age only high and low then error type is "range"

        age_min=age-age_low  # GEOMAGIA has the max and min as differences from the age, not absolute. 
        age_max=age_high-age
        age_BP=age
        age=1950-age  #GEOMAGIA want +-AD/BC so convert BP to AD/-BC

        lat=row['lat']
        lon=row['lon']

        vadm,vadm_sigma=-1,-1
             
        if 'vadm' in sites.columns.values: 
            vadm=row['vadm'] 
            vadm=vadm/1e22  # A m^2 -> 10^22 A m^2
        if 'vadm_sigma' in sites.columns.values: 
            # FIX: this branch previously re-read row['vadm'] into vadm,
            # leaving vadm_sigma stuck at -1.
            vadm_sigma=row['vadm_sigma'] 
            vadm_sigma=vadm_sigma/1e22

        site_name=row['site'] 
        
#       For paleointensity codes just give the method code list and Max will decide on the right 
#       GEOMAGIA code.
        method_codes="No MagIC method codes available"
        if 'method_codes' in sites.columns.values: 
            method_codes=row['method_codes']

#       Just give Max all the method codes for him to decide for now
        paleointensity_procedure=method_codes
        
        alteration_monitor="0"
        alteration_monitor=method_codes_to_geomagia(method_codes,'ALTERATION_MONIT_CORR')
        multidomain_check="0" 
        multidomain_check=method_codes_to_geomagia(method_codes,'MD_CHECKS')
        anisotropy_correction="0"
        anisotropy_correction=method_codes_to_geomagia(method_codes,'ANISOTROPY_CORRECTION')
        cooling_rate="0"
        cooling_rate=method_codes_to_geomagia(method_codes,'COOLING_RATE')
        demag_method="0"
        demag_method=method_codes_to_geomagia(method_codes,'DM_METHODS')
        demag_analysis="0"
        demag_analysis=method_codes_to_geomagia(method_codes,'DM_ANALYSIS')
        specimen_shape="0"
        specimen_shape=method_codes_to_geomagia(method_codes,'SPECIMEN_TYPE_ID')
        
        materials="" 
        geologic_types="" 
        if 'geologic_types' in sites.columns.values: 
            geologic_types=row['geologic_types'] 
        if ":" in geologic_types:
            gtypes=geologic_types.split(":")
            for gtype in gtypes:
                materials=materials+pmag.vocab_convert(gtype,"geomagia")+":"
            materials=materials[:-1]
        else:
            materials=pmag.vocab_convert(geologic_types,"geomagia")
       
        geochron_codes="" 
        if ":" in method_codes:
            gcodes=method_codes.split(":")
            for gcode in gcodes:
                if "GM-" == gcode[:3]:
                    geochron_codes=geochron_codes+pmag.vocab_convert(gcode,"geomagia")+":"
            geochron_codes=geochron_codes[:-1]
        else:
            # NOTE(review): this converts the still-empty geochron_codes
            # string; it probably should convert method_codes when it is a
            # single GM- code -- confirm intended behavior before changing.
            geochron_codes=pmag.vocab_convert(geochron_codes,"geomagia")
        if geochron_codes == "": 
            geochron_codes="0"

        dir_n_samples="-1"
        if 'dir_n_samples' in sites.columns.values: 
            dir_n_samples=row['dir_n_samples']
        # FIX: removed an exact duplicate of the dir_n_samples lookup that
        # appeared here.

#       Not in MagIC
        dir_n_specimens="-1"

#       using total number of samples for total specimen number
        dir_n_total_samples="-1"
        if 'dir_n_total_samples' in sites.columns.values: 
            dir_n_total_samples=row['dir_n_total_samples']

        dir_dec="999"
        if 'dir_dec' in sites.columns.values: 
            dir_dec=row['dir_dec']

        dir_inc="999"
        if 'dir_inc' in sites.columns.values: 
            dir_inc=row['dir_inc']

        dir_alpha95="-1"
        if 'dir_alpha95' in sites.columns.values: 
            dir_alpha95=row['dir_alpha95']

        dir_k="-1"
        if 'dir_k' in sites.columns.values: 
            dir_k=row['dir_k']

        vdm=-1
        if 'vdm' in sites.columns.values: 
            vdm=float(row['vdm'])
            vdm=vdm/1e22

        vdm_sigma=-1
        if 'vdm_sigma' in sites.columns.values: 
            vdm_sigma=float(row['vdm_sigma'])
            vdm_sigma=vdm_sigma/1e22

# Could try and get sample names from samples table (using Contribution object) but just taking the list 
# if it exists for now.
        sample_list="-1"
        if 'samples' in sites.columns.values: 
            sample_list=row['samples']

# c_csv is in GEOMAGIA insert. What it is I don't know. Max said set to 0
        c_csv='0'

# This place_id is SITE_ID in GEOMAGIA
# Try state/province first, then country, then continent/ocean.
        place_id="0"
        location=row['location']
        if 'state_province' in locations.columns.values: 
            place=locations.loc[location,'state_province']
            if place != "":
                place_id=pmag.vocab_convert(place,'GEOMAGIA')
        if place_id == "0":
            if 'country' in locations.columns.values: 
                place=locations.loc[location,'country']
                if place != "":
                    place_id=pmag.vocab_convert(place,'GEOMAGIA')
        if place_id == "0":
            if 'continent_ocean' in locations.columns.values: 
                # FIX: previously assigned to place_id and then tested the
                # stale 'place' value from the earlier branches.
                place=locations.loc[location,'continent_ocean']
                if place != "":
                    place_id=pmag.vocab_convert(place,'GEOMAGIA')

        site=row['site']
        dt=dateutil.parser.parse(timestamp)

        description="-1" 
        if 'description' in sites.columns.values: 
            description=row['description'] 

        # GEOMAGIA only takes records younger than 50 kyr BP.
        if age_BP <= 50000:
            print("0",int_n_samples,int_n_specimens,int_n_total_specimens,int_abs,int_abs_sigma,age,age_min,age_max,"1",age_error_type,lat,lon,vadm,vadm_sigma,place_id,paleointensity_procedure,alteration_monitor,multidomain_check,anisotropy_correction,cooling_rate,demag_method,"0","0",demag_analysis,specimen_shape,materials,doi,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1",geochron_codes,dir_n_samples,dir_n_samples,dir_n_total_samples,dir_dec,dir_inc,dir_alpha95,dir_k,vdm,vdm_sigma,sample_list,c_csv,location,site,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1","-1",dt.year,dt.month,contributor,"-1,-1",description,sep=',')
예제 #37
0
파일: providers.py 프로젝트: jdumas/autobib
def crossref_query(authors, title):
    """
    Query the Crossref database for the best bibliographic match.

    Args:
        authors (list): a list of strings for up the first authors last names.
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of the
        match given by Crossref. On failure the first element is None and the
        score is 0.
    """
    cr = Crossref()
    # works?query.title=An+Improved+Adaptive+Constraint+Aggregation+for+Integrated+Layout+and+Topology+Optimization&query.author=Gao+Zhu+Zhang+Zhou&sort=score&rows=1
    # query = ['+' + name + '' for name in authors]
    # query = 'query.title=' + urllib.parse.quote_plus(title) + '&query.author=' + urllib.parse.quote_plus(' '.join(authors)) + '&sort=score&rows=1'
    # print(query)
    # Use a title+author field query when author names are available,
    # otherwise fall back to a plain free-form query on the title.
    if ''.join(authors):
        args = dict(
            query_title=urllib.parse.quote_plus(title),
            query_author=urllib.parse.quote_plus(' '.join(authors))
        )
    else:
        args = dict(
            query=urllib.parse.quote_plus(title),
        )
    x = cr.works(sort='score', limit=1, **args)
    # x = cr.works(query=query)
    assert x['status'] == "ok"

    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)

    # Among equally-scored top items, let pick_best() break the tie against
    # the requested title.
    best_item = x['message']['items'][0]
    # print(json.dumps(best_item, indent=4))
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)

    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item

    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)

    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    # Repair common mojibake sequences (mis-decoded UTF-8) returned by the
    # content-negotiation service before parsing the bibtex.
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ö', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('̈o', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]

    # If article has subtitle(s), fix bibtex entry
    subtitles = None
    if 'subtitle' in res_json:
        # Discard subtitles that are all uppercase (usually layout artifacts).
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]

    if subtitles:
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat title if the subtitle is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title

    # Post-process title: strip trailing asterisks, leading "N. " numbering,
    # and trailing dots.
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])

    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best

    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0

    # Fix incorrect year in crossref entry: prefer the published-print date
    # over whatever content negotiation produced.
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str

    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)

    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)

    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)

    # Return database entry
    return (res_bib, res_json, score)