# Example no. 1
def convertHtmlToDicts(url, content):
    r"""Parse an html page and build the article and file dictionaries for it.

    Args:
        url: address of the page; stored as the article's fulltextUrl and as
            the url of the file dictionary.
        content: the html source. It has to include normal newlines, no \a or
            #N# replacers.

    Returns:
        A tuple (artDict, fileDict). Returns None, None on error, i.e. when
        the content has no "<html" tag or lxml raises XMLSyntaxError.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))

    # bail out before building anything for content that is not an html page
    if "<html" not in content:
        return None, None

    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        # an empty <title/> element has text None; keep the title a string
        title = titleEl.text or ""
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleanTree = cleaner.clean_html(tree)

    logging.debug("Cleaning done, now converting to ASCII")
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    # the <title> element is only a fallback for a title found in the meta tags
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # without an abstract in the meta data, use the start of the page text
    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
# Example no. 3
def convertOneChunk(inIndexFile, outFile):
    """Convert the Elsevier XML files listed in inIndexFile to ASCII.

    Get files from inIndexFile, parse the XML and write everything to outFile
    in ASCII format.

    Args:
        inIndexFile: index file read with maxCommon.iterTsvRows; every row
            supplies articleId, baseDir, zipFilename and filename.
        outFile: passed to pubStore.PubWriterFile, which receives the
            converted articles and files.
    """
    store = pubStore.PubWriterFile(outFile)

    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    rowCount = len(inRows)
    doi2pmid = None
    logging.info("Converting %d files" % rowCount)
    convCount = 0
    for i, row in enumerate(inRows, 1):
        # read line
        baseDir = row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        # the file id below is computed arithmetically, so the id has to be a number
        articleId = int(row.articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, rowCount - i))
        # the doi -> pmid table is loaded once, from the directory of the first row
        if doi2pmid is None:
            doi2pmid = parseDoi2Pmid(baseDir)
        with zipfile.ZipFile(fullZipPath) as zipFile:
            xmlString = zipFile.read(filename)
        xmlTree = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warning("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doi2pmid:
            articleData["pmid"] = doi2pmid[articleData["doi"]]

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warning("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    # NOTE(review): store is never closed here - confirm whether PubWriterFile needs an explicit close
# Example no. 4
def minimalHtmlToDicts(url, content):
    """A minimalistic article dict filler, does not try to parse the html.

    Args:
        url: address of the page; used as fulltextUrl and as externalId of
            the article and as the url of the file dictionary.
        content: the page source.

    Returns:
        A tuple (artDict, fileDict), or None, None when
        pubGeneric.toAsciiEscape returns None or a dictionary without a
        "content" key.
    """
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict is None or "content" not in fileDict:
        return None, None

    # title and abstract are simply cut out of the start of the raw content
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url,
        title=title, abstract=abstract, externalId=url)
    return artDict, fileDict
# Example no. 6
def parseMedline(xmlParser):
    """Fill an article data dict with pubmed xml data.

    Args:
        xmlParser: parser object for one Medline citation; it is queried
            through getTextFirst, getTextAll, getElFirst, getXmlFirst and
            getXmlAll (the example below passes a maxXml.XmlParser).

    Returns:
        A plain dict with the article fields, in which the unicode line
        separator U+2028 is replaced by a space in every value.

    NOTE(review): the expected output below lists none of the keys this
    function adds (pmidVersion, lang, the medline dates) and shows an
    OrderedDict, so it probably no longer matches verbatim - confirm.

    >>> xml = PubmedTestDoc()
    >>> data = parseMedline(maxXml.XmlParser(string=xml))
    >>> del data["time"]
    >>> repr(data)
    "OrderedDict([('articleId', ''), ('externalId', 'PMID20430833'), ('source', ''), ('origFile', ''), ('journal', 'Brain : a journal of neurology'), ('printIssn', '0006-8950'), ('eIssn', '0006-8950'), ('journalUniqueId', '0372537'), ('year', '2010'), ('articleType', 'research-article'), ('articleSection', ''), ('authors', u'Willemsen, Mich\\\\xe9l A; Verbeek, Marcel M'), ('authorEmails', ''), ('authorAffiliations', 'Radboud University Nijmegen Medical Centre, Donders Institute for Brain, Cognition and Behaviour, Department of Paediatric Neurology (820 IKNC), PO Box 9101, 6500 HB Nijmegen, The Netherlands. [email protected]'), ('keywords', 'Age of Onset/Useless Research'), ('title', 'Tyrosine hydroxylase deficiency: a treatable disorder of brain catecholamine biosynthesis.'), ('abstract', 'An infantile onset, progressive, hypokinetic-rigid syndrome with dystonia (type A), and a complex encephalopathy with neonatal onset (type B). Decreased cerebrospinal fluid concentrations of homovanillic acid and c.698G>A and c.707T>C mutations. Carriership of at least one promotor mutation, however, apparently predicts type A tyrosine hydroxylase deficiency. Most patients with tyrosine hydroxylase deficiency can be successfully treated with l-dopa.'), ('vol', '133'), ('issue', 'Pt 6'), ('page', '1810-22'), ('pmid', '20430833'), ('pmcId', ''), ('doi', ''), ('fulltextUrl', 'https://www.ncbi.nlm.nih.gov/pubmed/20430833')])"

    """
    data = pubStore.createEmptyArticleDict()
    # the parser is used as it is, without descending into MedlineCitation
    medlineData = xmlParser

    # identifiers and record dates
    data["pmid"] = medlineData.getTextFirst("PMID")
    el = medlineData.getElFirst("PMID", None)
    data["pmidVersion"] = el.attrib.get("Version", "") if el is not None else ""
    data["externalId"] = "PMID" + data["pmid"]
    data["fulltextUrl"] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % data["pmid"]
    logging.log(5, "PMID %s" % data["pmid"])
    data["medlineCreatedDate"] = getMedlineDate(medlineData, "DateCreated")
    data["medlineCompletedDate"] = getMedlineDate(medlineData, "DateCompleted")
    data["medlineRevisedDate"] = getMedlineDate(medlineData, "DateRevised")
    otherIds = medlineData.getTextAll("OtherID", reqAttrDict={"Source": "NLM"})
    pmcIds = [otherId for otherId in otherIds if otherId.startswith("PMC")]
    if pmcIds:
        # keep only the first token of the first PMC id, without its prefix
        data["pmcId"] = pmcIds[0].split()[0].replace("PMC", "")

    artTree = medlineData.getXmlFirst("Article")

    data["title"] = getMedlineText(artTree.getXmlAll("ArticleTitle"))

    # handle structured abstracts
    data["abstract"] = getMedlineText(artTree.getXmlAll("Abstract/AbstractText"))
    if data["abstract"] == "":
        data["abstract"] = getMedlineText(artTree.getXmlAll("OtherAbstract/AbstractText"))

    data["authorAffiliations"] = artTree.getTextFirst("Affiliation", default="")
    data["doi"] = artTree.getTextFirst("ELocationID", default="", reqAttrDict={"EIdType": "doi"})
    data["lang"] = artTree.getTextFirst("Language", default="")

    data["journalUniqueId"] = medlineData.getTextFirst("MedlineJournalInfo/NlmUniqueID")
    linkingIssn = medlineData.getTextFirst("MedlineJournalInfo/ISSNLinking", default="")

    journalTree = artTree.getXmlFirst("Journal")
    data["eIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Electronic'}, default="")
    data["printIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Print'}, default="")
    # keep the link ISSN when we have space, e.g. PNAS is not storing the print ISSN anymore, only as link Issn
    if data["printIssn"] == "" and linkingIssn != "":
        data["printIssn"] = linkingIssn
    if data["eIssn"] == "" and linkingIssn != "":
        data["eIssn"] = linkingIssn

    data["vol"] = journalTree.getTextFirst("JournalIssue/Volume", default="")
    data["issue"] = journalTree.getTextFirst("JournalIssue/Issue", default="")
    data["year"] = journalTree.getTextFirst("JournalIssue/PubDate/Year", default="")
    if data["year"] == "":
        # fall back to the first token of MedlineDate, if it is all digits;
        # a missing MedlineDate gives no tokens at all and must not raise
        medlineDate = journalTree.getTextFirst("JournalIssue/PubDate/MedlineDate", default="")
        dateParts = medlineDate.split()
        year = dateParts[0] if dateParts else ""
        if not year.isdigit():
            year = ""
        data["year"] = year
    data["journal"] = journalTree.getTextFirst("Title", default="")
    data["page"] = artTree.getTextFirst("Pagination/MedlinePgn", default="")

    # authors are stored as "LastName, ForeName" pairs, separated by "; "
    authorList = artTree.getXmlFirst("AuthorList")
    lastNames = []
    initialList = []
    if authorList is not None:
        for authorTree in authorList.getXmlAll("Author"):
            lastName = authorTree.getTextFirst("LastName", default="")
            if lastName == "":
                lastName = authorTree.getTextFirst("CollectiveName", default="")
            lastNames.append(lastName)

            initials = authorTree.getTextFirst("ForeName", default="")
            if initials == "":
                initials = authorTree.getTextFirst("Initials", default="")
            initialList.append(initials)

    authors = [lastName + ", " + initials for lastName, initials in zip(lastNames, initialList)]
    data["authors"] = "; ".join(authors)

    articleTypeList = set(artTree.getTextAll("PublicationTypeList/PublicationType"))
    articleTypesString = ",".join(articleTypeList)

    noResearchArticleTags = ["Bibliography", "Biography",
        "Case Reports", "Webcasts",
        "Dictionary", "Directory",
        "Editorial", "Festschrift",
        "Patient Education Handout", "Periodical Index",
        "Portraits", "Published Erratum", "Scientific Integrity Review"]

    # "Review" wins over everything, "Letter" counts as research; anything else
    # is research unless it carries one of the non-research publication types
    if "Review" in articleTypeList:
        articleType = "review"
    elif "Letter" in articleTypeList:
        articleType = "research-article"
    elif any(tag in articleTypeList for tag in noResearchArticleTags):
        articleType = "other"
    else:
        articleType = "research-article"

    data["articleType"] = articleType

    logging.log(5, "pubmedArticleTypes %s, articleType %s" % (articleTypesString, articleType))

    # the MeSH descriptor names become the keywords, separated by "/"
    meshDescriptors = []
    meshHeadingList = medlineData.getXmlFirst("MeshHeadingList", default="")
    if meshHeadingList:
        for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName"):
            meshDescriptors.append(meshHeadingDescriptor)

    data["keywords"] = "/".join(meshDescriptors)

    # remove these annoying linebreaks!
    filtData = {}
    for key, val in data.items():
        filtData[key] = val.replace(u'\u2028', ' ')
    return filtData
# Example no. 7
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """Convert the Elsevier XML files listed in inIndexFile to ASCII.

    Get files from inIndexFile, parse the XML and write everything to outFile
    in ASCII format. The identifiers of every converted article are also
    written to an ".ids.tab" file next to outFile.

    Args:
        zipDir: directory that contains the zip files named in the index.
        inIndexFile: index file read with maxCommon.iterTsvRows; every row
            supplies articleId, zipFilename and filename.
        inIdFile: passed to pubGeneric.parseDoneIds to get the PIIs that were
            already converted; these are skipped.
        outFile: passed to pubStore.PubWriterFile, which receives the
            converted articles and files.
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)

    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    rowCount = len(inRows)
    convCount = 0
    skipCount = 0
    # NOTE(review): not used below, the pmid lookup is currently commented out
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % rowCount)
    with open(idFname, "w") as idFh:
        for i, row in enumerate(inRows, 1):
            # read line
            zipFilename, filename = row.zipFilename, row.filename
            # the file id below is computed arithmetically, so the id has to be a number
            articleId = int(row.articleId)

            pii = splitext(basename(filename))[0]
            if pii in donePiis:
                logging.debug("PII %s has already been converted, skipping" % pii)
                skipCount += 1
                continue

            # open file from zipfile
            fullZipPath = join(zipDir, zipFilename)
            logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, rowCount - i))
            with zipfile.ZipFile(fullZipPath) as zipFile:
                xmlString = zipFile.read(filename)
            try:
                xmlTree = pubXml.etreeFromXml(xmlString)
            except lxml.etree.XMLSyntaxError:
                logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
                continue

            # parse xml
            articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
            articleData = parseElsevier(xmlTree, articleData)
            if articleData is None:
                logging.warning("Parser got no data for %s" % filename)
                continue

            #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

            # convert to ascii
            asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
            if asciiString is None:
                logging.warning("No ASCII for %s / %s" % (zipFilename, filename))
                continue
            store.writeArticle(articleId, articleData)

            # write IDs to separate file
            # NOTE(review): rows are written tab-separated without a header line;
            # confirm the format that pubGeneric.parseDoneIds expects
            idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
            idFh.write("\t".join(idRow) + "\n")

            # write to output
            fileData = createFileData(articleData, mimeType, asciiString)
            store.writeFile(articleId, (1000 * articleId) + 1, fileData, externalId=articleData["externalId"])
            convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
# Example no. 8
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """Convert the Elsevier XML files listed in inIndexFile to ASCII.

    Get files from inIndexFile, parse the XML and write everything to outFile
    in ASCII format. The identifiers of every converted article are also
    written to an ".ids.tab" file next to outFile.

    Args:
        zipDir: directory that contains the zip files named in the index.
        inIndexFile: index file read with maxCommon.iterTsvRows; every row
            supplies articleId, zipFilename and filename.
        inIdFile: passed to pubGeneric.parseDoneIds to get the PIIs that were
            already converted; these are skipped.
        outFile: passed to pubStore.PubWriterFile, which receives the
            converted articles and files.
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile),
                   basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)

    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    rowCount = len(inRows)
    convCount = 0
    skipCount = 0
    # NOTE(review): not used below, the pmid lookup is currently commented out
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % rowCount)
    with open(idFname, "w") as idFh:
        for i, row in enumerate(inRows, 1):
            # read line
            zipFilename, filename = row.zipFilename, row.filename
            articleId = int(row.articleId)

            pii = splitext(basename(filename))[0]
            if pii in donePiis:
                logging.debug("PII %s has already been converted, skipping" % pii)
                skipCount += 1
                continue

            # open file from zipfile
            fullZipPath = join(zipDir, zipFilename)
            logging.debug("Parsing %s, file %s, %d files left" %
                          (fullZipPath, filename, rowCount - i))
            with zipfile.ZipFile(fullZipPath) as zipFile:
                xmlString = zipFile.read(filename)
            try:
                xmlTree = pubXml.etreeFromXml(xmlString)
            except lxml.etree.XMLSyntaxError:
                logging.error("XML parse error, skipping file %s, %s" %
                              (zipFilename, filename))
                continue

            # parse xml
            articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
            articleData = parseElsevier(xmlTree, articleData)
            if articleData is None:
                logging.warning("Parser got no data for %s" % filename)
                continue
            articleData["origFile"] = zipFilename + ":" + filename

            # the PII, taken from the file name, identifies the article
            articleData["externalId"] = pii
            articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
            #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

            # convert to ascii
            asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
            if asciiString is None:
                logging.warning("No ASCII for %s / %s" % (zipFilename, filename))
                continue
            store.writeArticle(articleId, articleData)

            # write IDs to separate file
            # NOTE(review): rows are written tab-separated without a header line;
            # confirm the format that pubGeneric.parseDoneIds expects
            idRow = [
                str(articleData["articleId"]), articleData["doi"],
                articleData["externalId"], str(articleData["pmid"])
            ]
            idFh.write("\t".join(idRow) + "\n")

            # write to output
            fileData = createFileData(articleData, mimeType, asciiString)
            store.writeFile(articleId, (1000 * articleId) + 1,
                            fileData, externalId=articleData["externalId"])
            convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
# Example no. 10
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """Convert the Springer articles listed in inIndexFile to ASCII.

    Get files from inIndexFile, parse the XML, convert the PDF to ASCII and
    write everything to outFile. The identifiers of every converted article
    are also written to an ".ids.tab" file next to outFile.

    Args:
        zipDir: directory with the zip files; rows without a zip file name
            are read from the sibling directory "../disk" instead.
        inIndexFile: index file read with maxCommon.iterTsvRows; every row
            supplies articleId, zipFilename and filename.
        inIdFile: passed to parseDoneIds to get the DOIs that were already
            converted; these are skipped.
        outFile: passed to pubStore.PubWriterFile, which receives the
            converted articles and files.
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())

    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    with open(idFname, "w") as idFh:
        for row in inRows:
            # read line
            articleId = row.articleId
            zipFilename, filename = row.zipFilename, row.filename

            if u"\xbf" in filename:
                logging.info("Found weird character, skipping file")
                continue

            # an empty zip file name means the file comes from the disk directory
            articleData = pubStore.createEmptyArticleDict(publisher="springer")
            if zipFilename == "":
                xmlString, pdfString = getDiskData(diskDir, filename)
                articleData["origFile"] = filename
            else:
                xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
                articleData["origFile"] = zipFilename + ":" + filename

            if pdfString is None:
                pdfNotFound += 1
                logging.error("Could not open pdf or xml file")
                continue

            articleId = int(articleId)

            # parse xml
            logging.debug("Parsing XML")
            try:
                xmlTree = pubXml.etreeFromXml(xmlString)
            except lxml.etree.XMLSyntaxError:
                logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
                continue

            articleData = parseXml(xmlTree, articleData)

            if articleData is None:
                logging.warning("Parser got no data for %s" % filename)
                continue
            if articleData["doi"] in doneIds:
                logging.error("article %s has already been converted, skipping" % articleData["doi"])
                continue

            articleData["pmid"] = pmidFinder.lookupPmid(articleData)
            # NOTE(review): this replaces the origFile value that was set before parsing
            articleData["origFile"] = zipFilename + "/" + filename
            articleData["externalId"] = articleData["doi"]

            # convert pdf to ascii
            fileData = createFileData(articleData, "application/pdf", pdfString)
            logging.debug("converting pdf to ascii")
            pubGeneric.toAscii(fileData, "application/pdf")

            # write to output
            store.writeArticle(articleId, articleData)
            store.writeFile(articleId, (1000 * articleId) + 1, fileData, externalId=articleData["externalId"])

            # write IDs to separate file
            # NOTE(review): rows are written tab-separated without a header line;
            # confirm the format that parseDoneIds expects
            idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
            idFh.write("\t".join(idRow) + "\n")

            convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
