Example #1
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile is None or lastTsvFname != row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" %
                          (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
        lastTsvFname = row.tsvFile

        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue
        #fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert (url == row.url)
        assert (len(content) != 0)
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict is None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict is None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)
        # write file
        articleId = int(row.articleId)
        fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
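
A note on imports: the snippet above is an excerpt and omits the imports it relies on. A minimal sketch of what it appears to assume is shown below; the pub* and maxCommon modules are taken to come from the surrounding codebase rather than the standard library, and convertMicrosoft, convertHtmlToDicts and minimalHtmlToDicts are assumed to be helpers defined elsewhere in the same module.

# Imports this excerpt appears to rely on (sketch, not part of the original listing)
import gzip
import logging
from os.path import join, dirname, basename

import maxCommon      # iterTsvRows
import pubCompare     # PmidFinder
import pubGeneric     # parseDoneIds, setupLogging
import pubStore       # PubWriterFile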
Example #2
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile),
                   basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid==None:
        #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" %
                          (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
        #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [
            str(articleData["articleId"]), articleData["doi"],
            articleData["externalId"],
            str(articleData["pmid"])
        ]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1,
                        fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
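
A hedged usage sketch for this Elsevier variant follows; all paths and file names are purely illustrative and not taken from the original code. The index file is expected to provide articleId, zipFilename and filename columns, as read in the loop above.

# Hypothetical chunk conversion call (illustrative values only)
convertOneChunk("/data/elsevier/zips",   # zipDir: directory holding the publisher ZIP files
                "chunk0.index.tab",      # inIndexFile: TSV of articleId/zipFilename/filename rows
                "done.ids.tab",          # inIdFile: PIIs already converted in earlier runs
                "chunk0.articles")       # outFile: destination handed to pubStore.PubWriterFile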
Example #3
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue
        
        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString is None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData,
                        externalId=articleData["externalId"])

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
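
All three variants convert a single chunk at a time. A minimal driver loop is sketched below; the directory layout, glob pattern and file names are assumptions for illustration only, not part of the original code.

import glob
from os.path import basename, join, splitext

# Illustrative only: call one of the converters above once per chunk index file.
for inIndexFile in sorted(glob.glob(join("chunks", "*.index.tab"))):
    chunkName = splitext(basename(inIndexFile))[0]
    outFile = join("out", chunkName + ".articles")
    convertOneChunk("zips", inIndexFile, "done.ids.tab", outFile)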