# NOTE(review): whitespace-mangled chunk — the original line breaks and indentation
# were lost, so this whole fragment sits on one physical line and its head (the
# enclosing function and its "try:") is cut off. From the tokens it appears to be
# the tail of a Medline-XML parsing worker: it records document types per PMID,
# logs parse failures (bare "except:" clauses — narrow these to specific exception
# types once the layout is restored), then writes a pmid→dates table (outdate) and
# a pmid→doctype table (outtype), and finally fans all input files out over a
# 6-process MapReduce with senteniceFile as the mapper. Restore the true layout
# from version control before editing the logic.
pmid2types[entry.pmid].add(dtype) except: traceback.print_exc() eprint("Exception", datefile) try: pmid = elem.find('MedlineCitation/PMID').text eprint(pmid) except: pass continue for x in pmid2date: print(x, "\t".join([str(x) for x in pmid2date[x]]), sep="\t", file=outdate) for x in pmid2types: for doctype in pmid2types[x]: print(x, doctype, sep="\t", file=outtype) ll = MapReduce(6) result = ll.exec( allfiles, senteniceFile, None, 1, None) print("Done")
# NOTE(review): whitespace-mangled chunk — collapsed onto one physical line; the
# leading "return None" is the cut-off tail of an earlier function. What follows
# selects the worker count (threads=1 under __debug__, i.e. when Python is NOT run
# with -O; otherwise 4), logs the choice to stderr, defines printStuff — a reducer
# that prints each co-occurrence tuple as "pmid\tcl\tname\tpos" and returns how
# many it printed — and then runs analyseFile over allfileIDs via MapReduce.
# Restore the original layout from version control before editing.
return None threads = 4 if __debug__: threads = 1 sys.stderr.write("Running on threads:" + str(threads) + "\n") sys.stderr.write("Debug Mode? " + str(__debug__) + " and threads " + str(threads) + "\n") def printStuff(old, fileCoocs, env): printed = 0 for cooc in fileCoocs: print("{pmid}\t{cl}\t{name}\t{pos}\n".format(pmid=cooc[0], cl=cooc[1], name=cooc[2], pos=str(cooc[3])), end='', flush=True) printed += 1 return printed ll = MapReduce(threads) result = ll.exec(allfileIDs, analyseFile, None, 1, None)
# NOTE(review): whitespace-mangled chunk — one physical line; the head of the
# addMIRTs worker is cut off. The visible tail creates graph relationships via
# procDB.createRelationship (MIRTARBASE↔TAX organism support, GENE→MIRTARBASE
# gene mentions, MIRTARBASE→MIRNA mirna mentions), using the dbcreatedMIRT2*
# sets to avoid inserting duplicate edges per MIRT id, then closes the DB
# connection. The driver part chunks all MIRT ids into groups of 1000, flattens
# each chunk's evidence lists into workChunks, and processes them with a
# 6-process MapReduce running addMIRTs. Restore the original layout from
# version control before editing; "not x in y" would read better as
# "x not in y" once reformatted.
if not mirnaSpeciesID in dbcreatedMIRT2TAX[mirtarID]: dbcreatedMIRT2TAX[mirtarID].add(mirnaSpeciesID) procDB.createRelationship('mtb', ['MIRTARBASE'], {'id': mirtarID}, 'taxid', ['TAX'], {'id': mirnaSpeciesID}, ['ORGANISM_SUPPORT'], {}) if not mirtarGENE in dbcreatedMIRT2GENE[mirtarID]: dbcreatedMIRT2GENE[mirtarID].add(mirtarGENE) procDB.createRelationship('gene', ['GENE'], {'id': mirtarGENE}, 'mtb', ['MIRTARBASE'], {'id': mirtarID}, ['GENE_MENTION'], {'tax': geneSpeciesID}) if not mirtarMIRNA in dbcreatedMIRT2MIRNA[mirtarID]: dbcreatedMIRT2MIRNA[mirtarID].add(mirtarMIRNA) procDB.createRelationship('mtb', ['MIRTARBASE'], {'id': mirtarID}, 'mirna', ['MIRNA'], {'name': mirtarMIRNA}, ['MIRNA_MENTION'], {'tax': mirnaSpeciesID}) procDB.close() ll = MapReduce(6) allMIRTIds = [x for x in allMIRTs] allChunks = ll.chunkIterable(allMIRTIds, 1000) workChunks = [] for chunk in allChunks: toAdd = [] for MIRTid in chunk: evidences = allMIRTs[MIRTid] toAdd += evidences workChunks.append(toAdd) result = ll.exec(workChunks , addMIRTs, None, 1, None)
# NOTE(review): whitespace-mangled chunk — one physical line; the enclosing loop
# over FTP folders is cut off. It walks into each remote folder, lists its files
# (ftp.nlst), and queues (fileURL, filePATH) pairs into toDownloadFiles, stopping
# once more than 50 files are queued, then downloads them with a 2-process
# MapReduce running downloadFiles. Two things to confirm once reformatted:
# (1) "#print(ftp.pwd())" appears mid-line, so on this mangled line everything
# after it is commented out — the real layout must come from version control;
# (2) ftp.close() is called and later ftp.quit() — quit() on an already-closed
# connection will raise; one of the two should go.
if not os.path.exists(saveFolderPath): os.makedirs(saveFolderPath) #print(ftp.pwd()) ftp.cwd(folderPath) allFiles = ftp.nlst() print(folderPath, "Files", len(allFiles)) for downloadFile in allFiles: fileURL = "ftp://" + ftpBase + "/" + pdfBase + "/" + commonFilePath + "/" + downloadFile filePATH = saveFolderPath + "/" + downloadFile toDownloadFiles.append((fileURL, filePATH)) print("Adding", fileURL, filePATH) ftp.cwd("..") if len(toDownloadFiles) > 50: break #print(ftp.pwd()) print("Downloading", len(toDownloadFiles), "Files") ftp.close() ll = MapReduce(2) result = ll.exec(toDownloadFiles, downloadFiles, None, 1, None) ftp.quit()
# Mirror the PMC open-access PDF tree from the NCBI FTP server to local disk.
# The top-level folder names are listed once via ftplib; each folder is then
# mirrored by parallel worker processes that shell out to lftp.
if __name__ == '__main__':

    ftpBase = 'ftp.ncbi.nlm.nih.gov'
    pdfBase = 'pub/pmc/oa_pdf'
    saveDirectory = "/mnt/raidtmpbio2/joppich/pmc_apr2020/"

    # Enumerate all top-level folders below pub/pmc/oa_pdf. The dir callback
    # receives one listing line per entry; the last whitespace-separated token
    # is the folder name.
    ftp = FTP(ftpBase)
    ftp.login()
    ftp.cwd(pdfBase)

    allFolders = []
    ftp.dir('-d', '*/', lambda L: allFolders.append(L.split()[-1]))
    ftp.quit()

    def downloadFiles(allPaths, env):
        # Worker: mirror every assigned remote folder into saveDirectory using
        # lftp; "--only-newer" skips files that are already up to date locally.
        for folder in allPaths:
            cmd = "bash -c \"mkdir -p {localFolder}; cd {localFolder}; lftp {ftpServer} -e 'cd {ftpBaseFolder}; mirror --only-newer . {localFolder}; exit'\"".format(
                localFolder=saveDirectory + "/" + folder,
                ftpServer=ftpBase,
                ftpBaseFolder=pdfBase + "/" + folder)
            print(cmd)
            # NOTE(review): shell command built by string interpolation — assumes
            # NCBI folder names never contain shell metacharacters.
            os.system(cmd)

    # Fan the folder list out over 16 worker processes.
    ll = MapReduce(procs=16)
    result = ll.exec(allFolders, downloadFiles, None, 1, None)
# NOTE(review): whitespace-mangled chunk — one physical line; the head of
# analyseFile is cut off (its visible tail tags documents mentioning GO synonyms
# as 'INFLAMM' and returns a subject→pmid-set dict). reduceSets is the MapReduce
# reducer: it merges dict-of-set results by set union per key ("old == None"
# should become "old is None", and "old[x] = old[x] | new[x]" could be
# "old[x] |= new[x]" once reformatted — behavior unchanged). The merged result is
# dumped tab-separated to /tmp/tm_soehnlein. Restore the original layout from
# version control before editing.
if testMentioned(docGOHits, goSynIDs): subject2pmids['INFLAMM'].add(int(docID)) return subject2pmids finalResults = defaultdict(set) def reduceSets(old, new, env): if old == None: return new for x in new: if x in old: old[x] = old[x] | new[x] else: old[x] = new[x] return old ll = MapReduce(4) result = ll.exec(allfileIDs, analyseFile, None, 1, reduceSets) with open("/tmp/tm_soehnlein", 'w') as fout: for x in result: fout.write(str(x) + "\t" + str(result[x]) + "\n")
# NOTE(review): whitespace-mangled chunk — one physical line; the enclosing loop
# of the update-file downloader is cut off. Each iteration builds a zero-padded
# file id ('"{:>4}".format(i).replace(" ", "0")' is equivalent to '"{:04}"' for
# non-negative i), skips files already present when onlyNew is set, and fetches
# the gzip'd Medline XML from the NCBI FTP updatefiles directory. The driver part
# then optionally downloads the baseline (downloadDataBase) and the update range
# (downloadDataUpdate) with 8-process MapReduce pools; the downloadUpdates branch
# starts a sequential loop up to id 10000 whose body is cut off at the end of
# this chunk. Restore the original layout from version control before editing.
fileID = "{:>4}".format(i).replace(" ", "0") downloadFile = medlineBase + fileID + ".xml.gz" print(downloadFile) if onlyNew: if os.path.exists(downloadLocation + "/" + downloadFile): continue request.urlretrieve( "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/" + downloadFile, downloadLocation + "/" + downloadFile) if downloadBase: ll = MapReduce(8) ll.exec([i for i in range(1, updateStart)], downloadDataBase, None) if downloadUpdatesParallel: ll = MapReduce(8) ll.exec([i for i in range(updateStart, updateEnd)], downloadDataUpdate, None) if downloadUpdates: for i in range(updateStart, 10000): fileID = "{:>4}".format(i).replace(" ", "0") downloadFile = medlineBase + fileID + ".xml.gz" print(downloadFile)
# NOTE(review): whitespace-mangled chunk — one physical line; the enclosing loop
# and function head are cut off (the leading "continue" belongs to them). The
# visible tail writes one "pmid\tfirst\tinitials\tlast" row per author (None
# components replaced by empty strings — "!= None" should become "is not None"
# once reformatted), then writes a "pmid\tcitation" table to
# storagePath + citationfile, skipping PMIDs with no citations. Finally it runs
# senteniceFile over allXMLFiles with an 8-process MapReduce. Restore the
# original layout from version control before editing.
continue for author in authors: first = author[0] if author[0] != None else '' initials = author[1] if author[1] != None else '' last = author[2] if author[2] != None else '' outfile.write( str(pmid) + "\t" + "\t".join([first, initials, last]) + "\n") with open(storagePath + citationfile, 'w') as outfile: print(citationfile) for pmid in pmid2citations: citations = pmid2citations[pmid] if citations == None or len(citations) == 0: continue for quote in citations: outfile.write(str(pmid) + "\t" + str(quote) + "\n") ll = MapReduce(8) result = ll.exec(allXMLFiles, senteniceFile, None, 1, None) print("Done")