Example #1
import codecs
import logging

# project-specific helpers from the pubMunch text-mining codebase
import tabfile
import pubStore


def expData(inDirs, pmidListFname, outBase):
    """ split article fulltext into positive/negative sets, based on a PMID list """
    logging.info("Reading %s" % pmidListFname)
    pmids = set([int(x) for x in tabfile.slurplist(pmidListFname)])
    logging.info("Read %d pmids" % len(pmids))
    posFname = outBase + ".pos.tab"
    negFname = outBase + ".neg.tab"
    posFh = codecs.open(posFname, "w", encoding="utf8")
    negFh = codecs.open(negFname, "w", encoding="utf8")

    for dataDir in inDirs:
        logging.debug(dataDir)
        for article, fileList in pubStore.iterArticleDirList(dataDir):
            if article.pmid == "":
                continue
            # articles on the PMID list go to the .pos file, all others to .neg
            if int(article.pmid) in pmids:
                ofh = posFh
                txtClass = "pos"
            else:
                ofh = negFh
                txtClass = "neg"

            for fileData in fileList:
                if fileData.fileType == "main":
                    text = fileData.content
                    # pubStore files use \a in place of newlines
                    text = text.replace("\a", " ")
                    ofh.write("%s\t%s\t%s\n" % (article.pmid, txtClass, text))
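A minimal invocation sketch; the directory and file names below are hypothetical:

expData(["articles/batch0", "articles/batch1"], "relevant.pmids.txt", "classified")
# writes classified.pos.tab and classified.neg.tab, one
# pmid<tab>class<tab>text line per "main" fulltext file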
Example #2
def paramFile(self, param, value, cgiMode=False):
    """ return value, OR the parsed contents of the file named by value,
    if the parameter type asks for a file to be read in """
    if value != "":
        # "textlines" params are always read from a file;
        # "textlinesarg" params only outside of CGI mode
        if (param.ptype.lower() == "textlinesarg" and not cgiMode) or param.ptype.lower() == "textlines":
            if param.textFormat == "":
                return tabfile.slurplist(value)
            elif param.textFormat == "dictlist":
                return tabfile.slurpdictlist(value)
            elif param.textFormat == "dict":
                return tabfile.slurpdict(value)
    # unknown textFormat or empty value: fall through to the raw value
    return value
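A sketch of how the method resolves a "textlines" parameter; FakeParam is a hypothetical stand-in for the real parameter object:

class FakeParam:  # hypothetical: only the two attributes paramFile looks at
    ptype = "textlines"
    textFormat = ""

# assuming `algo` is an instance of the class that defines paramFile:
# lines = algo.paramFile(FakeParam(), "genes.txt")
# -> the lines of genes.txt, read in via tabfile.slurplist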
Example #3
from os.path import join, dirname

import tabfile


def startup(paramDict, resultDict):
    """ load the DOI list once at startup and prepare one result set per DOI """
    global doiSet
    doiFname = join(dirname(__file__), "data/doi.tab")
    doiSet = set(tabfile.slurplist(doiFname))
    for doi in doiSet:
        resultDict[doi] = set()
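This reads like a plugin startup hook; a minimal driver sketch (the hook is normally called by the framework, so this is only illustrative):

resultDict = {}
startup({}, resultDict)
# resultDict now maps every DOI from data/doi.tab to an empty set,
# ready to collect per-DOI matches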
Example #4
import logging

# project-specific helpers from the pubMunch text-mining codebase
import tabfile
import maxTables


def tabToFasta(inFilename, outFilename, linnaeusFilename, genomesFilename,
               numArtIdField=0, seqIdField=1, artIdField=2, seqField=5,
               maxTotalSeqLen=1000000, maxSeqLen=100000, maxSeqCount=100):
    """ convert a tab-sep file from fttools to fasta. linnaeusFilename has
    docId<tab>taxonId format, genomesFilename has one genomeId per line """

    if linnaeusFilename is not None:
        logging.debug("Reading LINNAEUS files")
        docToTaxon = tabfile.slurpdictset(linnaeusFilename)
        logging.debug("Reading sequenced genome list")
        sequencedTaxons = set(tabfile.slurplist(genomesFilename))

        logging.debug("Filtering LINNAEUS data by sequenced genome list")
        # keep only taxonIds that belong to sequenced genomes
        filteredDocToTaxon = {}
        allSequencedTaxonIds = set()
        for docId, taxonIds in docToTaxon.items():
            filtTaxons = set()
            for taxonId in taxonIds:
                if taxonId in sequencedTaxons:
                    filtTaxons.add(taxonId)
                    allSequencedTaxonIds.add(taxonId)
            filteredDocToTaxon[docId] = filtTaxons

        docToTaxon = {}  # no need to waste memory on the unfiltered dict
        logging.debug("Got data for %d sequenced genomes" % len(allSequencedTaxonIds))

        # one output fasta file per sequenced taxon
        taxonToFile = {}
        for taxonId in allSequencedTaxonIds:
            taxonToFile[taxonId] = open(outFilename + "." + str(taxonId) + ".fa", "w")
    else:
        docToTaxon = None
        outFileObj = open(outFilename, "w")

    # group input rows into blocks that share the same article id (field 0)
    br = maxTables.BlockReader(open(inFilename), 0, mustBeSorted=False)

    for artId, block in br.readNext():
        # collect the sequences of this article, dropping bad ones
        sequences = []
        seqSet = set()
        internalId = None
        externalId = None
        for fs in block:
            internalId = fs[numArtIdField]
            externalId = fs[artIdField]
            seq = fs[seqField]
            seqId = fs[seqIdField]

            letterCount = len(set(seq))
            if letterCount <= 2:
                logging.debug("Skipping sequence %s of article %s, not more than 2 letters in string: %s" % (seqId, externalId, seq))
            elif seq in seqSet:
                logging.debug("Skipping sequence %s of article %s, already seen before: %s" % (seqId, externalId, seq))
            elif len(seq) > maxSeqLen:
                logging.debug("Skipping sequence %s of article %s, longer than %d bp: %s" % (seqId, externalId, maxSeqLen, seq))
            else:
                sequences.append((seqId, seq))
                seqSet.add(seq)

        # skip whole articles with too much sequence data
        seqCount = len(sequences)
        totalSeqLen = sum([len(seq) for seqId, seq in sequences])

        if totalSeqLen > maxTotalSeqLen:
            logging.debug("Skipping article %s, total sequence length %d > %d" % (externalId, totalSeqLen, maxTotalSeqLen))
        elif seqCount > maxSeqCount:
            logging.debug("Skipping article %s, total sequence count %d > %d" % (externalId, seqCount, maxSeqCount))
        else:
            for seqId, seq in sequences:
                if docToTaxon is None:
                    # no taxon info at all: everything goes into one fasta file
                    outFileObj.write(">%s|%s\n%s\n" % (internalId, seqId, seq))
                else:
                    # write the sequence into one file per predicted taxon;
                    # articles without taxon info go into all taxon files
                    taxonIds = filteredDocToTaxon.get(internalId, None)
                    if taxonIds is None:
                        taxonIds = allSequencedTaxonIds
                    for taxonId in taxonIds:
                        taxonToFile[taxonId].write(">%s|%s\n%s\n" % (internalId, seqId, seq))
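A hypothetical invocation in both modes (all file names are made up):

# with LINNAEUS taxon data: one fasta file per sequenced taxon,
# named seqs.fa.<taxonId>.fa
tabToFasta("fttools.tab", "seqs.fa", "linnaeus.tab", "genomes.txt")

# without taxon data: a single fasta file seqs.fa
tabToFasta("fttools.tab", "seqs.fa", None, None)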