Python MapReduce示例，utils.parallel.MapReduce Python示例

示例#1

0

显示文件

文件： medlineXMLtoDate.py 项目： mjoppich/miRExplore

                            pmid2types[entry.pmid].add(dtype)


                    except:

                        traceback.print_exc()

                        eprint("Exception", datefile)
                        try:

                            pmid = elem.find('MedlineCitation/PMID').text
                            eprint(pmid)

                        except:
                            pass

                        continue

                for x in pmid2date:
                    print(x, "\t".join([str(x) for x in pmid2date[x]]), sep="\t", file=outdate)

                for x in pmid2types:
                    for doctype in pmid2types[x]:
                        print(x, doctype, sep="\t", file=outtype)


    # Hand every entry of allfiles to senteniceFile through a MapReduce helper
    # constructed with 6 (presumably the number of workers -- TODO confirm).
    ll = MapReduce(6)
    # NOTE(review): the meaning of the trailing (None, 1, None) arguments is
    # not visible from here; check MapReduce.exec before changing them.
    result = ll.exec( allfiles, senteniceFile, None, 1, None)

    # Printed once exec() has returned.
    print("Done")

示例#2

0

显示文件

文件： createFMAAssoc.py 项目： mjoppich/miRExplore

        return None

    # Default thread count, used when the interpreter is started with -O.
    threads = 4

    # __debug__ is True unless Python runs with -O, so a normal start falls
    # back to a single thread and reports that on stderr.
    if __debug__:
        threads = 1
        sys.stderr.write("Running on threads:" + str(threads) + "\n")

    # Always report the debug flag and the thread count that will be used.
    sys.stderr.write("Debug Mode? " + str(__debug__) + " and threads " +
                     str(threads) + "\n")

    def printStuff(old, fileCoocs, env):
        """Write every co-occurrence to stdout and report how many were written.

        Each entry of *fileCoocs* is indexed at positions 0..3 and emitted as
        one tab-separated line (``pmid``, ``cl``, ``name``, ``pos``); stdout
        is flushed after every line.

        Args:
            old: not used by this function.
            fileCoocs: iterable of indexable records with at least 4 items.
            env: not used by this function.

        Returns:
            The number of lines printed.
        """
        rowTemplate = "{pmid}\t{cl}\t{name}\t{pos}\n"
        lineCount = 0

        for record in fileCoocs:
            line = rowTemplate.format(pmid=record[0],
                                      cl=record[1],
                                      name=record[2],
                                      pos=str(record[3]))

            # The template already ends in a newline, so suppress print's own.
            print(line, end='', flush=True)
            lineCount += 1

        return lineCount

    # Run analyseFile over allfileIDs using the thread count chosen above.
    # NOTE(review): the meaning of the trailing (None, 1, None) arguments is
    # not visible from here; check MapReduce.exec before changing them.
    ll = MapReduce(threads)
    result = ll.exec(allfileIDs, analyseFile, None, 1, None)

示例#3

0

显示文件

文件： createMIRTarBase.py 项目： mjoppich/miRExplore

            if not mirnaSpeciesID in dbcreatedMIRT2TAX[mirtarID]:
                dbcreatedMIRT2TAX[mirtarID].add(mirnaSpeciesID)
                procDB.createRelationship('mtb', ['MIRTARBASE'], {'id': mirtarID}, 'taxid', ['TAX'], {'id': mirnaSpeciesID}, ['ORGANISM_SUPPORT'], {})


        if not mirtarGENE in dbcreatedMIRT2GENE[mirtarID]:
            dbcreatedMIRT2GENE[mirtarID].add(mirtarGENE)
            procDB.createRelationship('gene', ['GENE'], {'id': mirtarGENE}, 'mtb', ['MIRTARBASE'], {'id': mirtarID}, ['GENE_MENTION'], {'tax': geneSpeciesID})

        if not mirtarMIRNA in dbcreatedMIRT2MIRNA[mirtarID]:
            dbcreatedMIRT2MIRNA[mirtarID].add(mirtarMIRNA)
            procDB.createRelationship('mtb', ['MIRTARBASE'], {'id': mirtarID}, 'mirna', ['MIRNA'],
                                             {'name': mirtarMIRNA}, ['MIRNA_MENTION'], {'tax': mirnaSpeciesID})
    procDB.close()

ll = MapReduce(6)

# Split the ids of allMIRTs into chunks (chunk size argument: 1000).
allMIRTIds = list(allMIRTs)
allChunks = ll.chunkIterable(allMIRTIds, 1000)

# Each work item is the concatenation of the evidences of all ids in a chunk.
workChunks = []
for chunk in allChunks:
    toAdd = []

    for MIRTid in chunk:
        evidences = allMIRTs[MIRTid]
        toAdd.extend(evidences)

    workChunks.append(toAdd)

# One addMIRTs call input per flattened chunk.
result = ll.exec(workChunks, addMIRTs, None, 1, None)

示例#4

0

显示文件

文件： downloadPMCFulltexts.py 项目： mjoppich/miRExplore

            if not os.path.exists(saveFolderPath):
                os.makedirs(saveFolderPath)

            #print(ftp.pwd())
            ftp.cwd(folderPath)

            allFiles = ftp.nlst()

            print(folderPath, "Files", len(allFiles))

            for downloadFile in allFiles:
                fileURL = "ftp://" + ftpBase + "/" + pdfBase + "/" + commonFilePath + "/" + downloadFile
                filePATH = saveFolderPath + "/" + downloadFile
                toDownloadFiles.append((fileURL, filePATH))

                print("Adding", fileURL, filePATH)

            ftp.cwd("..")

            if len(toDownloadFiles) > 50:
                break
            #print(ftp.pwd())

        print("Downloading", len(toDownloadFiles), "Files")
        ftp.close()

        ll = MapReduce(2)
        result = ll.exec(toDownloadFiles, downloadFiles, None, 1, None)

    ftp.quit()

示例#5

0

显示文件

if __name__ == '__main__':

    # Remote server, the folder on it that is listed, and the local target.
    ftpBase = 'ftp.ncbi.nlm.nih.gov'
    pdfBase = 'pub/pmc/oa_pdf'
    saveDirectory = "/mnt/raidtmpbio2/joppich/pmc_apr2020/"

    # login() is called without credentials, then we change into pdfBase.
    ftp = FTP(ftpBase)
    ftp.login()
    ftp.cwd(pdfBase)

    # Collect the last whitespace-separated token of every listing line
    # returned for the '-d */' arguments (presumably the sub-folder names --
    # TODO confirm against the server's listing format).
    allFolders = []
    ftp.dir('-d', '*/', lambda L: allFolders.append(L.split()[-1]))

    # The connection is only needed for the listing.
    ftp.quit()

    def downloadFiles(allPaths, env):
        """Mirror each remote folder in *allPaths* into the local save directory.

        For every folder name the directory ``saveDirectory/<folder>`` is
        created, and ``lftp`` is started inside it to run
        ``mirror --only-newer`` on ``pdfBase/<folder>`` of ``ftpBase``.

        The command is passed to ``subprocess.run`` as an argument list rather
        than as a shell string: the folder names come from a remote directory
        listing and must not be interpreted by a shell.

        Args:
            allPaths: iterable of folder names relative to ``pdfBase``.
            env: not used by this function.

        Returns:
            None. The exit status of ``lftp`` is not checked (best effort).
        """
        # Local imports: the file-level import block is not visible from here.
        import shlex
        import subprocess

        for folder in allPaths:

            localFolder = saveDirectory + "/" + folder
            ftpBaseFolder = pdfBase + "/" + folder

            # Replaces the former `mkdir -p`; a folder that cannot be created
            # is reported and skipped instead of mirroring into the wrong place.
            try:
                os.makedirs(localFolder, exist_ok=True)
            except OSError as err:
                print("Cannot create", localFolder, err)
                continue

            # NOTE(review): the folder name is still interpolated unquoted into
            # the lftp command script; names containing ';' or whitespace would
            # be misparsed by lftp itself.
            lftpScript = "cd {ftpBaseFolder}; mirror --only-newer . {localFolder}; exit".format(
                ftpBaseFolder=ftpBaseFolder,
                localFolder=localFolder)

            cmd = ["lftp", ftpBase, "-e", lftpScript]

            # Log a copy-pasteable form of the command.
            print(" ".join(shlex.quote(part) for part in cmd))

            # cwd= replaces the former `cd {localFolder}`.
            subprocess.run(cmd, cwd=localFolder)

    # Distribute the collected folder names over a MapReduce helper created
    # with procs=16; downloadFiles receives the folders to mirror.
    # NOTE(review): the meaning of the trailing (None, 1, None) arguments is
    # not visible from here; check MapReduce.exec before changing them.
    ll = MapReduce(procs=16)
    result = ll.exec(allFolders, downloadFiles, None, 1, None)

示例#6

0

显示文件

文件： overviewMentioningPubMeds.py 项目： mjoppich/miRExplore

            if testMentioned(docGOHits, goSynIDs):
                subject2pmids['INFLAMM'].add(int(docID))

    return subject2pmids


# Module-level mapping whose missing keys start out as an empty set.
# NOTE(review): not referenced anywhere in the visible part of this file.
finalResults = defaultdict(set)


def reduceSets(old, new, env):
    """Merge the key -> set mapping *new* into the accumulator *old*.

    Args:
        old: accumulated mapping so far, or ``None`` when there is no
            accumulator yet.
        new: mapping of keys to sets to merge in.
        env: not used by this function.

    Returns:
        *new* itself when *old* is ``None``; otherwise *old*, updated in place
        so that every key of *new* maps to the union of both sets.
    """
    # Identity test: `== None` would dispatch to the accumulator's __eq__,
    # which may be overridden and report equality for a real accumulator.
    if old is None:
        return new

    for key in new:
        if key in old:
            # Build a fresh set so the sets previously stored are not mutated.
            old[key] = old[key] | new[key]
        else:
            old[key] = new[key]

    return old


# Run analyseFile over allfileIDs and combine the per-call results with
# reduceSets.
ll = MapReduce(4)
result = ll.exec(allfileIDs, analyseFile, None, 1, reduceSets)

# Dump the combined mapping: one "<key>\t<value>" line per key.
with open("/tmp/tm_soehnlein", 'w') as fout:
    for x in result:
        fout.write("\t".join((str(x), str(result[x]))) + "\n")

示例#7

0

显示文件

        fileID = "{:>4}".format(i).replace(" ", "0")
        downloadFile = medlineBase + fileID + ".xml.gz"
        print(downloadFile)

        if onlyNew:
            if os.path.exists(downloadLocation + "/" + downloadFile):
                continue

        request.urlretrieve(
            "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/" + downloadFile,
            downloadLocation + "/" + downloadFile)


# Indices 1 .. updateStart - 1 are handed to downloadDataBase.
if downloadBase:
    ll = MapReduce(8)
    ll.exec(list(range(1, updateStart)), downloadDataBase, None)

# Indices updateStart .. updateEnd - 1 are handed to downloadDataUpdate.
if downloadUpdatesParallel:
    ll = MapReduce(8)
    ll.exec(list(range(updateStart, updateEnd)), downloadDataUpdate, None)

if downloadUpdates:

    for i in range(updateStart, 10000):
        fileID = "{:>4}".format(i).replace(" ", "0")
        downloadFile = medlineBase + fileID + ".xml.gz"
        print(downloadFile)

示例#8

0

显示文件

                        continue

                    for author in authors:

                        first = author[0] if author[0] != None else ''
                        initials = author[1] if author[1] != None else ''
                        last = author[2] if author[2] != None else ''

                        outfile.write(
                            str(pmid) + "\t" +
                            "\t".join([first, initials, last]) + "\n")

            with open(storagePath + citationfile, 'w') as outfile:

                print(citationfile)

                for pmid in pmid2citations:
                    citations = pmid2citations[pmid]

                    if citations == None or len(citations) == 0:
                        continue

                    for quote in citations:

                        outfile.write(str(pmid) + "\t" + str(quote) + "\n")

    # Hand every entry of allXMLFiles to senteniceFile through a MapReduce
    # helper constructed with 8 (presumably the number of workers -- TODO
    # confirm).
    ll = MapReduce(8)
    # NOTE(review): the meaning of the trailing (None, 1, None) arguments is
    # not visible from here; check MapReduce.exec before changing them.
    result = ll.exec(allXMLFiles, senteniceFile, None, 1, None)

    # Printed once exec() has returned.
    print("Done")