Example #1
def mapReduceTestRun(datasets, alg, paramDict, tmpDir, updateIds=None, skipMap=False, keepOutFile=False):
    " do a map reduce run only on one random file, no cluster submission, for testing "
    if updateIds!=None and len(updateIds)!=0:
        updateId = updateIds[0]
    else:
        updateId = None
    # pick one arbitrary input file from the first dataset
    baseNames = findArticleBasenames(datasets[0], updateId)
    firstBasename = baseNames.pop()
    oneInputFile = firstBasename+".articles.gz"
    if not isfile(oneInputFile):
        oneInputFile = firstBasename+".files.gz"
    logging.info("Testing algorithm on file %s" % oneInputFile)
    reader = pubStore.PubReaderFile(oneInputFile)
    tmpAlgOut = join(tmpDir, "pubMapReduceTest.temp.marshal.gz")
    tmpRedOut = join(tmpDir, "red.temp.tab")
    if not skipMap:
        runMap(reader, alg, paramDict, tmpAlgOut)
    if "combine" in dir(alg):
        runCombine(tmpAlgOut, alg, paramDict, tmpAlgOut)
    runReduce(alg, paramDict, tmpAlgOut, tmpRedOut, quiet=True)

    # show the first 50 lines of the reducer output as a sanity check
    ifh = open(tmpRedOut)
    logging.info("Example reducer output")
    for _ in range(50):
        line = ifh.readline().strip()
        logging.info(line)
    ifh.close()
    os.remove(tmpAlgOut)
    if keepOutFile:
        logging.info("test output written to file %s, file not deleted" % tmpRedOut)
    else:
        logging.info("Waiting for 5 secs")
        time.sleep(5)
        os.remove(tmpRedOut)
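
A minimal usage sketch for this test helper, assuming the getAlg loader shown in Example #3; the algorithm module name "wordCount" and the dataset path are hypothetical:

import tempfile
# load an algorithm object (hypothetical module name), as in Example #3
alg = getAlg("wordCount", defClass="Map")
tmpDir = tempfile.mkdtemp()
# run map and reduce on one random input file, keep the output for inspection
mapReduceTestRun(["/data/text/medline"], alg, {}, tmpDir, keepOutFile=True)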
Example #2
def filterOneChunk(inFname, searchSpec, outFname):
    """ 
    filter one chunk. searchSpec can be a list of keywords (e.g. ebola,filovirus) or 
    a filename of a list of PMIDs.
    """
    logging.debug("filtering %s" % inFname)
    pmids = None
    if isfile(searchSpec):
        # searchSpec is a file: read one PMID per line into a set
        pmids = set([int(l.strip()) for l in open(searchSpec)])
    else:
        # searchSpec is a comma-separated keyword list; compare in lowercase
        words = [w.lower() for w in searchSpec.split(",")]

    reader = pubStore.PubReaderFile(inFname)
    store = pubStore.PubWriterFile(outFname)

    for article, files in reader.iterArticlesFileList(None):
        # this is the filtering part: continue if article is not accepted
        if pmids is not None:
            if (article.pmid == "" or int(article.pmid) not in pmids):
                logging.debug("skipping %s, no PMID or not in filter file" %
                              article.pmid)
                continue
        else:
            foundMatch = False
            for w in words:
                for fileRow in files:
                    cont = fileRow.content.lower()
                    if w in cont:
                        foundMatch = True
                        break
                if foundMatch:
                    break

            if not foundMatch:
                continue

        # article accepted: write it and its files to the output store
        store.writeArticle(article.articleId, article._asdict())
        for fileRow in files:
            store.writeFile(article.articleId, fileRow.fileId,
                            fileRow._asdict())
    reader.close()
    store.close()
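
Two usage sketches, one per filtering mode; all paths here are hypothetical:

# keyword mode: keep articles whose text mentions any of the terms
filterOneChunk("/data/text/medline/0_00000.articles.gz", "ebola,filovirus",
               "/data/filtered/0_00000.articles.gz")

# PMID mode: keep only articles whose PMID appears in the file, one per line
filterOneChunk("/data/text/medline/0_00000.articles.gz", "/data/pmids.txt",
               "/data/filtered/0_00000.articles.gz")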
Example #3
    pubGeneric.setupLogging(__file__, options)

    if len(args)==0:
        doctest.testmod()
        sys.exit(0)

    # expects exactly five positional arguments
    algName, algMethod, inName, outName, paramFile = args

    # load the parameter dictionary serialized by the submitting process
    binData = gzip.open(paramFile, "rb").read()
    paramDict = marshal.loads(binData)
    for key, val in paramDict.items():
        logging.log(5, "parameter %s = %s" % (key, str(val)))

    alg = getAlg(algName, defClass=algMethod.capitalize())

    if algMethod in ["combine", "processRow"]:
        # methods that don't work on text input
        if algMethod=="processRow":
            runProcessRow(inName, alg, paramDict, outName)
        elif algMethod=="combine":
            runCombine(inName, alg, paramDict, outName)
    else:
        # text-based methods stream articles through a PubReaderFile
        reader = pubStore.PubReaderFile(inName)
        if algMethod=="map":
            runMap(reader, alg, paramDict, outName)
        elif algMethod=="annotate":
            runAnnotate(reader, alg, paramDict, outName)
        elif algMethod=="annotateWrite":
            runAnnotateWrite(reader, alg, paramDict, outName)
        reader.close()
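
For reference, a sketch of how the gzipped marshal parameter file read above could be written by the submitting process; the file name and parameter keys are hypothetical:

import gzip, marshal
params = {"wordFile": "stopwords.txt", "minScore": 0.5}
fh = gzip.open("params.marshal.gz", "wb")
fh.write(marshal.dumps(params))
fh.close()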