def get_aliases_from_product_id_strings(product_id_strings):
    """Collect aliases for every identifier string in *product_id_strings*.

    Each string is passed through ``remove_nonprinting_characters`` and
    stripped of surrounding whitespace.  The cleaned value is then tested
    with ``is_doi``, ``is_pmid``, ``is_arxiv`` and ``is_url`` in that order;
    the first check that succeeds picks the provider (Crossref, Pubmed,
    Arxiv or Webpage) whose ``member_items`` result is added to the output.
    Strings that pass none of the checks contribute nothing.

    Returns the accumulated list of aliases.
    """
    collected = []

    logger.debug(u"in get_aliases_from_product_id_strings with product_id_strings {product_id_strings}".format(
        product_id_strings=product_id_strings))

    for raw_id in product_id_strings:
        logger.debug(u"in get_aliases_from_product_id_strings nid 1 {nid}".format(
            nid=raw_id))

        printable_id = remove_nonprinting_characters(raw_id)
        logger.debug(u"in get_aliases_from_product_id_strings nid 2 {nid}".format(
            nid=printable_id))

        # Surrounding whitespace is dropped as well.
        cleaned_id = printable_id.strip()
        logger.debug(u"in get_aliases_from_product_id_strings with cleaned nid {nid}".format(
            nid=cleaned_id))

        # The provider is only instantiated once its check has passed.
        provider = None
        if is_doi(cleaned_id):
            provider = crossref.Crossref()
        elif is_pmid(cleaned_id):
            provider = pubmed.Pubmed()
        elif is_arxiv(cleaned_id):
            provider = arxiv.Arxiv()
        elif is_url(cleaned_id):
            provider = webpage.Webpage()

        if provider is not None:
            collected += provider.member_items(cleaned_id)

        logger.debug(u"in get_aliases_from_product_id_strings with cleaned aliases {aliases}".format(
            aliases=collected))

    return collected
def clean_doi(input_doi):
    """Extract a bare, lowercased DOI from one of several notations.

    The input is first passed through ``remove_nonprinting_characters`` and
    lowercased.  The following forms are then recognised, in this order:

    * a URL starting with ``http`` that points at ``doi.org`` or
      ``dx.doi.org`` followed by ``/10.<rest>``
    * the same host without a scheme (``doi.org/10.<rest>``)
    * ``doi:10.<rest>``
    * a string that already starts with ``10.`` (returned as is)
    * any other text containing ``10.``; the DOI is taken from the
      right-most position where ``10.<digits><more>`` matches

    Returns the DOI string, or None when the cleaned input has no
    ``lower`` method (e.g. it is None) or no DOI can be extracted.
    """
    input_doi = remove_nonprinting_characters(input_doi)
    try:
        input_doi = input_doi.lower()
    except AttributeError:
        # Not string-like, so there is nothing to parse.
        return None

    # Patterns are raw strings so the regex escapes are not interpreted as
    # (invalid) string escapes, and the dot in "doi.org" is matched literally.
    if input_doi.startswith("http"):
        match = re.match(r"^https*://(dx\.)*doi\.org/(10\..+)", input_doi)
        doi = match.group(2) if match else None
    elif "doi.org" in input_doi:
        match = re.match(r"^(dx\.)*doi\.org/(10\..+)", input_doi)
        doi = match.group(2) if match else None
    elif input_doi.startswith("doi:"):
        match = re.match(r"^doi:(10\..+)", input_doi)
        doi = match.group(1) if match else None
    elif input_doi.startswith("10."):
        doi = input_doi
    elif "10." in input_doi:
        # Greedy ".*" makes this pick the last candidate in the text;
        # DOTALL lets the search cross line breaks.
        match = re.match(r".*(10\.\d+.+)", input_doi, re.DOTALL)
        doi = match.group(1) if match else None
    else:
        doi = None
        try:
            logger.debug(u"MALFORMED DOI {input_doi}".format(
                input_doi=input_doi))
        except Exception:
            # Formatting the value itself can fail (e.g. undecodable bytes);
            # still record that a malformed DOI was seen.
            logger.debug(u"MALFORMED DOI, can't print doi")

    return doi
def clean_id(nid):
    """Tidy up an identifier string; pass anything else through unchanged.

    For strings, leading/trailing spaces and double quotes are stripped,
    then any remaining surrounding whitespace, and finally the value is run
    through ``unicode_helpers.remove_nonprinting_characters``.  A value that
    raises TypeError or AttributeError along the way (it isn't a string --
    that's ok, it might be biblio) is returned as it stood at that point.
    """
    try:
        nid = nid.strip(' "').strip()
    except (TypeError, AttributeError):
        # Stripping failed: hand the value back untouched.
        return nid

    try:
        return unicode_helpers.remove_nonprinting_characters(nid)
    except (TypeError, AttributeError):
        # Keep the stripped value if the helper rejects it.
        return nid
示例#4
0
def clean_pmid(input_pmid):
    """Normalize a PubMed ID to its bare digit string.

    The input is passed through ``remove_nonprinting_characters``,
    lowercased, has any ``pmid:`` marker removed and is stripped of
    surrounding whitespace (so ``"PMID: 12345"`` is accepted).  What remains
    must consist of 3 to 15 digits.

    Returns the digit string, or None when the input does not have that
    shape or raises AttributeError while being processed (e.g. it is None).
    Every rejected input is reported at debug level.
    """
    try:
        pmid = remove_nonprinting_characters(input_pmid)
        # Strip after removing the prefix so "pmid: 123" is handled too.
        pmid = pmid.lower().replace("pmid:", "").strip()
        # Raw string: "\d" must reach the regex engine unchanged.
        match = re.match(r"^(\d{3,15})$", pmid)
        pmid = match.group(1) if match else None
    except AttributeError:
        pmid = None

    if not pmid:
        logger.debug(u"MALFORMED PMID {input_pmid}".format(
            input_pmid=input_pmid))

    return pmid
示例#5
0
def clean_arxiv_id(arxiv_id):
    """Reduce an arXiv reference to its lowercased identifier.

    The input is passed through ``remove_nonprinting_characters`` and
    lowercased; an ``arxiv:`` marker and the ``http://arxiv.org/abs/`` or
    ``https://arxiv.org/abs/`` URL prefix are then removed.

    Returns the remaining string.
    """
    arxiv_id = remove_nonprinting_characters(arxiv_id)
    arxiv_id = arxiv_id.lower().replace("arxiv:", "")
    # Abstract pages are reachable over both schemes; drop either prefix.
    for url_prefix in ("http://arxiv.org/abs/", "https://arxiv.org/abs/"):
        arxiv_id = arxiv_id.replace(url_prefix, "")
    return arxiv_id
def clean_url(input_url):
    """Return *input_url* after running it through ``remove_nonprinting_characters``."""
    return remove_nonprinting_characters(input_url)