Example #1
def runIndexerCLITool(args):
    "Wrapper that runs the indexer using an argparse args object"
    #
    # Initialize two connections: one for read/write (REQ/REP) operations and
    #  a separate one for high-volume push (indexing) operations, which does
    #  not support read operations.
    # As ZMQ contexts are quite heavyweight, a single context is shared by both.
    # The extra network connection does not incur significant overhead,
    #  especially when used over Unix domain socket connections.
    #
    import zmq
    context = zmq.Context()
    rwDB = DocumentDB.YakDBDocumentDatabase(mode="REQ", context=context)
    pushDB = DocumentDB.YakDBDocumentDatabase(mode="PUSH", context=context)
    #Initialize indexer
    indexer = TranslatronDocumentIndexer(rwDB, pushDB)
    #Iterate over documents
    didAnything = False
    if not args.no_documents:
        didAnything = True
        indexer.indexAllDocuments()
    if not args.no_entities:
        didAnything = True
        indexer.indexAllEntities()
    if args.statistics:
        didAnything = True
        indexer.printTokenFrequency()
    if not didAnything:
        print(
            "No indexer action specified, use --help to show available actions"
        )
Example #2
def runPMCImporterCLITool(args):
    #Open the tables with a REQ/REP connection; the returned database object is not used directly here
    DocumentDB.YakDBDocumentDatabase(mode="REQ")
    #Worker threads will open their own individual DB connections
    parser = PMCTARParser(numWorkers=args.workers)
    for infile in args.infile:
        if infile.endswith(".tar.gz"):
            parser.processPMCTarGZ(
                infile,
                filterStr=args.filter,
                contentFilterStr=args.content_filter.lower().encode("utf-8"))
        elif infile.endswith(".nxml") or infile.endswith(".xml"):
            parser.processPMCXML(infile)
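As above, the args namespace is assumed to come from argparse; the option names below (--workers, --filter, --content-filter and the positional infile list) are inferred from the attribute accesses and may differ from the real CLI:

import argparse

# Illustrative sketch only; option names are inferred, not taken from the real CLI.
parser = argparse.ArgumentParser(description="PMC document importer")
parser.add_argument("infile", nargs="+",
                    help="PMC .tar.gz archives or single .nxml/.xml files")
parser.add_argument("--workers", type=int, default=4,
                    help="Number of parser worker threads")
parser.add_argument("--filter", default="",
                    help="Filename filter string")
parser.add_argument("--content-filter", default="",
                    help="Case-insensitive content filter string")
args = parser.parse_args()
runPMCImporterCLITool(args)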
Example #3
def run(self):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    for data in iter(self.queue.get, None):
        #Convert XML string to document object
        doc = processPMCFileContent(data)
        if doc is None:
            continue  #Parse error
        self.writeQueue.append(doc)
        #Write once the write queue size has been reached
        if len(self.writeQueue) >= 128:
            db.writeDocuments(self.writeQueue)
            self.writeQueue.clear()
    #Flush remaining documents
    if self.writeQueue:
        db.writeDocuments(self.writeQueue)
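The iter(self.queue.get, None) idiom calls queue.get() repeatedly until it returns the None sentinel, so the producer must enqueue one None per worker thread to shut the pool down. A self-contained sketch of that sentinel pattern, independent of the PMC classes above:

import queue
import threading

def worker(q):
    # Consume items until the None sentinel arrives
    for data in iter(q.get, None):
        print("processing", data)

q = queue.Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()

for item in ["<article>1</article>", "<article>2</article>"]:
    q.put(item)

# One sentinel per worker so that every thread terminates
for _ in threads:
    q.put(None)
for t in threads:
    t.join()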
Example #4
def importMeSH(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    # NOTE: MeSH 2015 contains only 27k entities
    batch = db.entityIdx.newWriteBatch(chunkSize=40000)
    print(green("Starting to import entities from %s" % infile))
    # Read file
    with open(infile, "r") as f:
        writeStartTime = time.time()
        for mesh in readMeSH(f):
            # Write entity to database
            batch.writeEntity(meshEntryToEntity(mesh))
            # Statistics
            if batch.numWrites % 5000 == 0:
                deltaT = time.time() - writeStartTime
                entityWriteRate = batch.numWrites / deltaT
                print("Wrote %d entities at %.1f e/s" %
                      (batch.numWrites, entityWriteRate))
    print("Wrote %d entities overall" % batch.numWrites)
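newWriteBatch(chunkSize=...) presumably buffers entities and flushes them to YakDB in chunks; its exact semantics are not shown here. The general pattern, as an independent sketch rather than the YakDB implementation, is:

class ChunkedWriteBatch:
    """Generic chunked write batch sketch (not the YakDB implementation)."""
    def __init__(self, writeFunc, chunkSize=40000):
        self.writeFunc = writeFunc  # e.g. a function taking a list of entities
        self.chunkSize = chunkSize
        self.buffer = []
        self.numWrites = 0

    def writeEntity(self, entity):
        # Buffer the entity and flush once the chunk size is reached
        self.buffer.append(entity)
        self.numWrites += 1
        if len(self.buffer) >= self.chunkSize:
            self.flush()

    def flush(self):
        # Write any buffered entities in one call
        if self.buffer:
            self.writeFunc(self.buffer)
            self.buffer = []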
Example #5
def importUniprot(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    batch = db.entityIdx.newWriteBatch(chunkSize=25000)
    print(green("Starting to import entities from %s" % infile))
    # Read the UniProt file; piping through zcat is about 5-10 times faster
    #  than decompressing in Python and distributes load over multiple cores.
    p = subprocess.Popen(["zcat", infile], stdout=subprocess.PIPE)
    writeStartTime = time.time()
    for uniprot in readUniprot(p.stdout):
        # Write entity to database
        batch.writeEntity(uniprotEntryToEntity(uniprot))
        # Statistics
        if batch.numWrites % 10000 == 0:
            deltaT = time.time() - writeStartTime
            entityWriteRate = batch.numWrites / deltaT
            print("Wrote %d entities at %.1f e/s" %
                  (batch.numWrites, entityWriteRate))
    #Wait for subprocess to exit
    p.communicate()
    print("Wrote %d entities overall" % batch.numWrites)
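Spawning zcat moves decompression into a separate process; where zcat is not available, Python's built-in gzip module is a slower but portable drop-in for the reading loop of importUniprot above:

import gzip

# Portable alternative to the zcat subprocess above; decompression runs
#  in-process and is typically slower than a separate zcat process.
with gzip.open(infile, "rb") as f:
    for uniprot in readUniprot(f):
        batch.writeEntity(uniprotEntryToEntity(uniprot))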
Example #6
def importWikimediaPagelist(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    batch = db.entityIdx.newWriteBatch(chunkSize=100000)
    print(green("Starting to import entities from %s" % infile))
    writeStartTime = time.time()
    for (pageId, pageTitle) in readWikimediaFile(infile):
        # Write entity to database
        pageIdStr = pageId.decode("utf-8")
        batch.writeEntity({
            "id": "Wikipedia:" + pageIdStr,
            "name": pageTitle,
            "source": "Wikipedia",
            "type": "Encyclopedia entry",
            "ref": {
                "Wikipedia": [pageIdStr]
            },
        })
        # Statistics
        if batch.numWrites % 10000 == 0:
            deltaT = time.time() - writeStartTime
            entityWriteRate = batch.numWrites / deltaT
            print("Wrote %d entities at %.1f e/s" %
                  (batch.numWrites, entityWriteRate))
    print("Wrote %d entities overall" % batch.numWrites)
Example #7
#!/usr/bin/env python3
"""
Utility to traverse all Translatron entities and count how many entities
reference each external database.

This is rarely useful and therefore not integrated into the main Translatron CLI.
"""
from Translatron import DocumentDB
from collections import Counter

if __name__ == "__main__":
    db = DocumentDB.YakDBDocumentDatabase(mode="REQ")
    databases = Counter()
    for _, entity in db.iterateEntities():
        if b"ref" in entity:
            # Do not shadow the db database handle with the loop variable
            for refDatabase in entity[b"ref"].keys():
                databases[refDatabase] += 1
    for database, cnt in databases.items():
        print(database.decode("utf-8") + "," + str(cnt))