def get_aliases_from_product_id_strings(product_id_strings):
    """Collect aliases for every recognisable id in *product_id_strings*.

    Each id has its non-printing characters removed and surrounding
    whitespace stripped.  It is then tested with ``is_doi``, ``is_pmid``,
    ``is_arxiv`` and ``is_url`` (in that order); the first test that passes
    selects the provider whose ``member_items`` result is appended to the
    output.  Ids that pass none of the tests contribute nothing.

    Returns the accumulated list of aliases.
    """
    collected = []
    logger.debug(u"in get_aliases_from_product_id_strings with product_id_strings {product_id_strings}".format(
        product_id_strings=product_id_strings))

    for raw_id in product_id_strings:
        logger.debug(u"in get_aliases_from_product_id_strings nid 1 {nid}".format(
            nid=raw_id))

        printable_id = remove_nonprinting_characters(raw_id)
        logger.debug(u"in get_aliases_from_product_id_strings nid 2 {nid}".format(
            nid=printable_id))

        cleaned_id = printable_id.strip()  # also remove spaces
        logger.debug(u"in get_aliases_from_product_id_strings with cleaned nid {nid}".format(
            nid=cleaned_id))

        # Pick the provider first; the lookup itself is shared below.
        if is_doi(cleaned_id):
            provider = crossref.Crossref()
        elif is_pmid(cleaned_id):
            provider = pubmed.Pubmed()
        elif is_arxiv(cleaned_id):
            provider = arxiv.Arxiv()
        elif is_url(cleaned_id):
            provider = webpage.Webpage()
        else:
            # Unrecognised id type: nothing to look up.
            continue

        collected.extend(provider.member_items(cleaned_id))

    logger.debug(u"in get_aliases_from_product_id_strings with cleaned aliases {aliases}".format(
        aliases=collected))
    return collected
def clean_doi(input_doi):
    """Extract a bare, lower-cased DOI from one of several spellings.

    Non-printing characters are removed and the value is lower-cased, then
    the DOI (the part beginning with ``10.``) is taken from, in order:

    * a URL starting with ``http`` that points at ``doi.org`` or
      ``dx.doi.org``,
    * the same host/path written without a scheme,
    * a ``doi:10...`` string,
    * a string that already starts with ``10.`` (returned whole),
    * any other string containing ``10.<digits>`` followed by at least one
      more character.

    Returns the DOI string, or ``None`` when nothing can be extracted.  A
    value without string methods also yields ``None``, because the resulting
    ``AttributeError`` is caught.
    """
    input_doi = remove_nonprinting_characters(input_doi)
    try:
        input_doi = input_doi.lower()
        # In every regex branch a failed match leaves ``match`` as None, so
        # ``match.group`` raises AttributeError, which is turned into a None
        # result by the handler at the bottom.
        if input_doi.startswith("http"):
            match = re.match(r"^https*://(dx\.)*doi\.org/(10\..+)", input_doi)
            doi = match.group(2)
        elif "doi.org" in input_doi:
            match = re.match(r"^(dx\.)*doi\.org/(10\..+)", input_doi)
            doi = match.group(2)
        elif input_doi.startswith("doi:"):
            match = re.match(r"^doi:(10\..+)", input_doi)
            doi = match.group(1)
        elif input_doi.startswith("10."):
            doi = input_doi
        elif "10." in input_doi:
            # The leading ``.*`` is greedy and DOTALL lets it cross newlines,
            # so the capture starts at the last position where the DOI-like
            # pattern can still match.
            match = re.match(r".*(10\.\d+.+)", input_doi, re.DOTALL)
            doi = match.group(1)
        else:
            doi = None
            try:
                logger.debug(u"MALFORMED DOI {input_doi}".format(
                    input_doi=input_doi))
            except Exception:
                # Best-effort logging: fall back to a fixed message if the
                # value itself cannot be formatted into the log line.
                logger.debug(u"MALFORMED DOI, can't print doi")
    except AttributeError:
        doi = None

    return doi
def clean_id(nid):
    """Trim quotes/whitespace and non-printing characters from an id.

    Leading and trailing spaces and double quotes are stripped, followed by
    any remaining surrounding whitespace, and the result is passed through
    ``unicode_helpers.remove_nonprinting_characters``.

    A value that raises ``TypeError`` or ``AttributeError`` along the way is
    returned as far as it got cleaned; a non-string therefore comes back
    unchanged.
    """
    cleaned = nid
    try:
        cleaned = cleaned.strip(' "').strip()
        cleaned = unicode_helpers.remove_nonprinting_characters(cleaned)
    except (TypeError, AttributeError):
        # isn't a string. That's ok, might be biblio
        pass
    return cleaned
def clean_pmid(input_pmid):
    """Normalize a PubMed id to its bare digit string.

    Non-printing characters are removed, the value is lower-cased and any
    ``pmid:`` marker is dropped.  What remains must consist solely of 3 to
    15 digits to be accepted.

    Returns the digit string, or ``None`` when the value does not have that
    form or an ``AttributeError`` is raised while cleaning it.  A debug
    message is logged whenever ``None`` is returned.
    """
    try:
        pmid = remove_nonprinting_characters(input_pmid)
        pmid = pmid.lower().replace("pmid:", "")
        # Raw string: ``\d`` is a regex escape, not a string escape.
        match = re.match(r"^(\d{3,15})$", pmid)
        pmid = match.group(1) if match else None
    except AttributeError:
        pmid = None

    if not pmid:
        logger.debug(u"MALFORMED PMID {input_pmid}".format(
            input_pmid=input_pmid))

    return pmid
def clean_arxiv_id(arxiv_id):
    """Reduce an arXiv reference to its bare, lower-cased identifier.

    Non-printing characters are removed and the value is lower-cased, then
    every occurrence of the ``arxiv:`` marker and of the
    ``http://arxiv.org/abs/`` / ``https://arxiv.org/abs/`` URL prefixes is
    removed.

    Returns the remaining string.
    """
    cleaned = remove_nonprinting_characters(arxiv_id).lower()
    # The https form is stripped as well as the http one, so that secure
    # abstract URLs are reduced to an id rather than passed through whole.
    for marker in ("arxiv:", "http://arxiv.org/abs/", "https://arxiv.org/abs/"):
        cleaned = cleaned.replace(marker, "")
    return cleaned
def clean_url(input_url):
    """Return *input_url* with its non-printing characters removed."""
    return remove_nonprinting_characters(input_url)