Exemplo n.º 1
0
def test1():
    """Print the taxonomic path of the lowest common ancestor of the taxa listed in a sample message.

    The NCBI taxon ids are the numbers enclosed in parentheses in the sample
    string. They are printed one per line, followed by a separator line, and
    then handed to Taxonomy.getPathFromLowestCommonAncestorToRoot. For each
    configured rank, in order, the ncbid, rank and name of the returned node
    are printed; the output stops at the first rank missing from the result.

    Uses a hard-coded configuration file of a local Windows workspace.
    """
    config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS')

    databaseFile = os.path.normpath(config.get('databaseFile'))
    taxonomicRanks = config.get('taxonomicRanks').split(',')
    t = Taxonomy(databaseFile, taxonomicRanks)

    try:
        s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron (226186), Porphyromonas gingivalis (242619) and Parabacteroides distasonis (435591).'
        # Alternative sample inputs:
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron , Porphyromonas gingivalis (242619) and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron , Porphyromonas gingivalis  and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron (226186) and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Halobacterium sp. (64091), Thermococcus kodakarensis (69014), Pyrococcus horikoshii (70601), Methanothermobacter thermautotrophicus (187420), Methanopyrus kandleri AV19 (190192), Methanosarcina mazei (192952), Archaeoglobus fulgidus (224325), Methanocaldococcus jannaschii (243232), Methanococcoides burtonii (259564), Methanococcus maripaludis S2 (267377), Haloarcula marismortui (272569), Methanospirillum hungatei (323259), Methanosphaera stadtmanae (339860), Natronomonas pharaonis (348780), Methanosaeta thermophila PT (349307), Candidatus methanoarchaeon RC1 (351160), Haloquadratum walsbyi (362976), Methanoculleus marisnigri JR1 (368407), Methanocorpusculum labreanum (410358) and Methanobrevibacter smithii (420247).'
        #s = 'Assignment of query to the lowest common ancestor of Thermococcus kodakarensis (69014) and Pyrococcus horikoshii (70601).'

        # A single capturing group makes findall return just the digits, so no
        # second pass is needed to strip the parentheses.
        ncbids = re.findall(r'\(([0-9]+)\)', s)
        for ncbid in ncbids:
            print(ncbid)
        print('-------------------------')

        pathDict = t.getPathFromLowestCommonAncestorToRoot(ncbids)
        #pathDict = t.getPathToRoot(170187)

        for rank in taxonomicRanks:
            if rank not in pathDict:
                # ranks are walked in configured order; stop at the first one not covered by the path
                break
            node = pathDict[rank]
            print('%s %s %s' % (node.ncbid, node.rank, node.name))
    finally:
        # release the taxonomy even when a lookup fails
        t.close()
Exemplo n.º 2
0
def _test():
    """Call replaceIdsWithNames on hard-coded sample files of a local Windows workspace.

    The substitution pattern comes from config.get('outputFileContigSubPattern');
    the three file paths are normalised with os.path.normpath before the call.
    """
    # Disabled Sequence smoke test kept from the original:
    #s = Sequence(1, 'seqName','AATTGGCCC\n\rAAA\n')
    #print 'sequence name: ', s.seqName
    #print 'sequence:', s.getSeq()
    #print 'seqBp:', s.seqBp, '({0})'.format(len(s.seqCompressed))

    cfgPath = os.path.normpath(
        'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')
    config = Config(open(cfgPath), 'pPPS')
    subPattern = config.get('outputFileContigSubPattern')

    # In call order: name-to-ids file, target file, output file.
    rawPaths = (
        'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.cToIds',
        'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.ids.out',
        'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.pOUT',
    )
    nameToIdsPath, targetPath, outPath = [os.path.normpath(raw) for raw in rawPaths]

    replaceIdsWithNames(subPattern, nameToIdsPath, targetPath, outPath)
Exemplo n.º 3
0
def test3():
    """Call Taxonomy.createNewOtuDBEntry with fixed sample values and close the taxonomy.

    The database file and the taxonomic ranks are read from a hard-coded
    local configuration file.
    """
    cfgPath = os.path.normpath('/Users/ivan/Documents/work/binning/tests/CowRumen/03/config.cfg')
    config = Config(open(cfgPath), 'pPPS')

    dbPath = os.path.normpath(config.get('databaseFile'))
    ranks = config.get('taxonomicRanks').split(',')
    taxonomy = Taxonomy(dbPath, ranks)

    taxonomy.createNewOtuDBEntry(
        1239,           # parent NCBI taxon id: Firmicutes
        'test_sample',  # sample name
        'species')      # rank of the new entry
    taxonomy.close()
Exemplo n.º 4
0
def _test():
    """Run replaceIdsWithNames against fixed sample files of a local Windows workspace.

    Reads 'outputFileContigSubPattern' from the configuration and passes it,
    together with three normalised hard-coded paths, to replaceIdsWithNames.
    """
    # Disabled Sequence smoke test kept from the original:
    #s = Sequence(1, 'seqName','AATTGGCCC\n\rAAA\n')
    #print 'sequence name: ', s.seqName
    #print 'sequence:', s.getSeq()
    #print 'seqBp:', s.seqBp, '({0})'.format(len(s.seqCompressed))

    norm = os.path.normpath

    cfgHandle = open(norm('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg'))
    pattern = Config(cfgHandle, 'pPPS').get('outputFileContigSubPattern')

    replaceIdsWithNames(
        pattern,
        norm('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.cToIds'),  # name-to-ids file
        norm('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.ids.out'),  # target file
        norm('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.pOUT'))  # output file
Exemplo n.º 5
0
def newTaxonId():
    """Command line entry point that creates a new taxon entry in the taxonomy database.

    Command line options (all required, one value each):
        -c/--config       configuration file; its 'databaseFile' entry names the
                          directory that holds 'ncbitax_sqlite.db'
        -p/--parent       parent NCBI taxon id, must be an integer
        -r/--rank         rank of the new taxon, must be one of
                          taxonomy_ncbi.TAXONOMIC_RANKS[1:]
        -n/--name_suffix  scientific name suffix, must not be empty

    Returns:
        The value returned by Taxonomy.createNewOtuDBEntry (reported to the
        user as the new taxonomic id), or None when the input is rejected.
    """
    parser = argparse.ArgumentParser(description='Gets a new taxon ID',
                                     epilog='Note that the use of this functionality alter the taxonomy file')

    # Only the path of the configuration file is needed (it is opened further
    # down), so it is parsed as a plain string instead of being opened here and
    # left unclosed.
    parser.add_argument('-c', '--config', nargs=1, required=True,
                        help='Configuration file, the taxonomy file in this configuration file will be changed',
                        metavar='config.cfg', dest='c')

    parser.add_argument('-p', '--parent', nargs=1, required=True,
                        help='The parent NCBI taxon id of the new taxon id.', metavar='parent', dest='p')

    parser.add_argument('-r', '--rank', nargs=1, required=True,
                        help='Rank of the new taxon ID', metavar='rank', dest='r')

    parser.add_argument('-n', '--name_suffix', nargs=1, required=True,
                        help='Scientific name suffix of the new taxon ID', metavar='name', dest='n')

    # nargs=1 with required=True: argparse exits with a usage message unless
    # every option carries exactly one value, so each attribute below is a
    # one-element list.
    args = parser.parse_args()

    configFile = args.c[0]
    try:
        parent = int(args.p[0])
    except ValueError:
        print("The parent taxonomic id must be a number")
        return
    rank = args.r[0]
    name = args.n[0]
    if not name:
        print("The scientific name cannot be empty!")
        return

    config = Config(open(os.path.normpath(configFile)), 'PhyloPythiaS_Plus')
    databaseFile = os.path.join(os.path.normpath(config.get('databaseFile')), 'ncbitax_sqlite.db')
    print(databaseFile)
    taxonomicRanks = taxonomy_ncbi.TAXONOMIC_RANKS[1:]
    if rank not in taxonomicRanks:
        # wrapped in a 1-tuple so the message also formats when the ranks are a tuple
        print("Allowed ranks are only: %s" % (taxonomicRanks,))
        return

    t = Taxonomy(databaseFile, taxonomicRanks)
    try:
        newId = t.createNewOtuDBEntry(parent, name, rank)
        print('New taxonomic id: "%s"; with name suffix "%s"; at rank "%s"; '
              'as a descendant of "%s" has been created in "%s"'
              % (newId, name, rank, parent, databaseFile))
    finally:
        # close the taxonomy even when the entry could not be created
        t.close()
    return newId
Exemplo n.º 6
0
def test2():
    """Print the result of Taxonomy.getLongestCommonPathFromMultipleAssignments for sample taxa.

    Collects t.getPathToRoot(...) for a fixed list of NCBI taxon ids, passes
    the collected list to getLongestCommonPathFromMultipleAssignments and
    prints every key of the result together with its value.

    Uses a hard-coded configuration file of a local Windows workspace.
    """
    config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS')

    databaseFile = os.path.normpath(config.get('databaseFile'))
    taxonomicRanks = config.get('taxonomicRanks').split(',')
    t = Taxonomy(databaseFile, taxonomicRanks)

    # Sample assignments, in the order they are queried. Bacteria (2) is listed twice.
    ncbids = (
        33958,  # Lactobacillaceae
        91061,  # Bacilli
        2,      # Bacteria
        #1385,  # Bacillales
        1578,   # Lactobacillus
        #31979, # Clostridiaceae
        2,      # Bacteria
    )

    try:
        taxPathDictList = [t.getPathToRoot(ncbid) for ncbid in ncbids]
        taxPathDict = t.getLongestCommonPathFromMultipleAssignments(taxPathDictList)

        for key in taxPathDict:
            print('%s %s' % (key, taxPathDict[key]))
    finally:
        # release the taxonomy even when a lookup fails
        t.close()
Exemplo n.º 7
0
def test():
    """Build an MGCluster from hard-coded local paths, load its data and call reconstructOTU.

    preprocess is invoked with the align, dm and cluster steps switched off
    and readData switched on.
    """
    cfgHandle = open(
        '/Users/ivan/Documents/work/binning/tests/CowRumen/03/config.cfg')
    config = Config(cfgHandle, 'pPPS')

    workingDir = '/Users/ivan/Documents/work/binning/tests/CowRumen/03/working/mgWorking'
    prefix = '/Users/ivan/Documents/work/binning/tests/CowRumen/03/working/cow_rumen_fragmented_velvet_assembly_scaffolds.fas.ids'

    cluster = MGCluster(config, workingDir, prefix)
    cluster.preprocess(readData=True, align=False, dm=False, cluster=False)
    #cluster.buildSpecificPred()

    cluster.reconstructOTU()
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg

        Command line options:
            -c/--config      configuration file of the simulator (required)
            -p/--pIRS-param  additional pIRS parameters, e.g. "-Q 64 -E 1";
                             several values are joined with single spaces

        For every sequence name in the frequencies file, the reference sequence
        is written to <workingDir>/tmp/<name>.fna, 'pirs simulate' is run on it
        in the tmp directory, and the two gzipped read files it is expected to
        produce are appended to <workingDir>/reads_1.fq and reads_2.fq with
        '@read_' replaced by '@read_<name>_'.

        Returns None; problems are reported on stdout/stderr.
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    # parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')

    # the path is parsed as a plain string and opened below with open()
    parser.add_argument('-c',
                        '--config',
                        nargs=1,
                        required=True,
                        help='configuration file of the simulator',
                        metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument(
        '-p',
        '--pIRS-param',
        action='store',
        nargs='+',
        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
        dest='p')

    args = parser.parse_args()
    config = Config(open(args.config[0]), 'Sim')

    # nargs='+' may deliver several values; use all of them, not only the first
    pirsParam = ' '.join(args.p) if args.p else ''

    # reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(
        config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    # the optional pIRS parameters must not contain those that are predefined
    # elsewhere (e.g. in the config); this is a plain substring check
    reservedParams = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(param in pirsParam for param in reservedParams):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    # check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! (' +
              str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))

    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(
                seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            # write the reference sequence to its own fasta file
            fastaFile = os.path.join(tmpDir, seqName + '.fna')
            outBuffer = OutFileBuffer(fastaFile)
            try:
                outBuffer.writeText('>' + seqName + '\n' + seq + '\n')
            finally:
                outBuffer.close()

            # NOTE(review): the command line is passed to the shell unquoted;
            # sequence names and paths containing shell metacharacters would break it.
            cmd = '%s simulate -i %s -x %s -m %s -v %s -l %s -o %s %s' % (
                os.path.join(pirsInstallDir, 'pirs'), fastaFile, coverage,
                insertSizeMean, insertSizeSd, readLength, seqName, pirsParam)
            #print(cmd)
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            if proc.wait() != 0:
                # NOTE(review): processing continues after a failed command, as
                # before; the read files opened below may then be missing.
                sys.stderr.write('command failed: ' + cmd + '\n')

            # append generated reads to the merged files
            readsPrefix = '%s_%s_%s' % (seqName, readLength, insertSizeMean)
            for mate, outMerged in ((1, outReads1Merged),
                                    (2, outReads2Merged)):
                reads = gzip.open(
                    os.path.join(tmpDir,
                                 '%s_%s.fq.gz' % (readsPrefix, mate)), 'rb')
                try:
                    content = reads.read()
                finally:
                    reads.close()
                outMerged.writeText(
                    content.replace('@read_', '@read_' + seqName + '_') +
                    '\n')
    finally:
        # close the merged outputs even when a sequence fails
        outReads1Merged.close()
        outReads2Merged.close()
Exemplo n.º 9
0
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg

        Command line options:
            -c/--config      configuration file of the simulator (required)
            -p/--pIRS-param  additional pIRS parameters, e.g. "-Q 64 -E 1";
                             several values are joined with single spaces

        For every sequence name in the frequencies file, the reference sequence
        is written to <workingDir>/tmp/<name>.fna, 'pirs simulate' is run on it
        in the tmp directory, and the two gzipped read files it is expected to
        produce are appended to <workingDir>/reads_1.fq and reads_2.fq with
        '@read_' replaced by '@read_<name>_'.

        Returns None; problems are reported on stdout/stderr.
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    #parse arguments
    parser = argparse.ArgumentParser(description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
                                 epilog='''''')

    #the path is parsed as a plain string and opened below with open()
    parser.add_argument('-c', '--config', nargs=1, required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(open(args.config[0]), 'Sim')

    #nargs='+' may deliver several values; use all of them, not only the first
    pirsParam = ''
    if args.p:
        pirsParam = ' '.join(args.p)

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    #the optional pIRS parameters must not contain those that are predefined elsewhere (e.g. in the config);
    #this is a plain substring check
    for reservedParam in ('-m', '-v', '-l', '-x', '-i', '-o'):
        if reservedParam in pirsParam:
            print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
            return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! (' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))
    mergedOutputs = ((1, outReads1Merged), (2, outReads2Merged))

    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            #write the reference sequence to its own fasta file
            fastaFile = os.path.join(tmpDir, seqName + '.fna')
            outBuffer = OutFileBuffer(fastaFile)
            try:
                outBuffer.writeText('>' + seqName + '\n' + seq + '\n')
            finally:
                outBuffer.close()

            #NOTE(review): the command line is passed to the shell unquoted; sequence names and paths
            #containing shell metacharacters would break it.
            cmd = ' '.join([os.path.join(pirsInstallDir, 'pirs'), 'simulate',
                            '-i', fastaFile, '-x', str(coverage),
                            '-m', str(insertSizeMean), '-v', str(insertSizeSd), '-l', str(readLength),
                            '-o', seqName, pirsParam])
            #print(cmd)
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                #NOTE(review): processing continues after a failed command, as before;
                #the read files opened below may then be missing.
                sys.stderr.write('command failed: ' + cmd + '\n')

            #append generated reads to the merged files
            for mate, outMerged in mergedOutputs:
                readsFile = os.path.join(tmpDir, seqName + '_' + str(readLength) + '_' + str(insertSizeMean)
                                         + '_' + str(mate) + '.fq.gz')
                reads = gzip.open(readsFile, 'rb')
                try:
                    content = reads.read()
                finally:
                    reads.close()
                outMerged.writeText(content.replace('@read_', '@read_' + seqName + '_') + '\n')
    finally:
        #close the merged outputs even when a sequence fails
        outReads1Merged.close()
        outReads2Merged.close()
Exemplo n.º 10
0
                        entry += str('\t' + taxPathDict[rank].name)
                    else:
                        entry += '\t'
                f.write(entry)
        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()

if __name__ == "__main__":
    # Manual run with hard-coded local paths: loads the database file and the
    # taxonomic ranks from config01.cfg, builds a Taxonomy and calls
    # ppsOutToPPOut on one PPS output file. The commented-out assignments are
    # alternative inputs/outputs kept from earlier runs.
    #test 2
    #ppsOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_contigs.txt'
    #outPPOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_PP_contigs.txt'
    #ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP'
    ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP'
    #outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP.PP.out'
    outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP.PP.out'


    # taxonomy settings come from the 'databaseFile' and 'taxonomicRanks' (comma separated) config entries
    config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS')
    databaseFile = os.path.normpath(config.get('databaseFile'))
    taxonomicRanks = config.get('taxonomicRanks').split(',')
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    ppsOutToPPOut(ppsOutFile, outPPOutFile, taxonomicRanks, taxonomy)

    # disabled earlier manual run of scafToContigOutput
    #test 1
    #scafContigFile = 'D:/A_Phylo/A_Metagenomic/reindeer/data/scaffolds-contigs.tab'
    #scafPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out'
    #contigPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out_contigs'
    #scafToContigOutput(scafContigFile, scafPPSOutFile, contigPPSOutFile)
Exemplo n.º 11
0
def test(ncbid):
    """Print what DBData.getGenomeWgsCount returns for the given taxon id with threshold 3.

    The database file is read from a hard-coded configuration file and the
    NCBI processing directory is a hard-coded path of a local Windows workspace.

    ncbid -- taxon id passed through to getGenomeWgsCount
    """
    cfgPath = os.path.normpath('D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//config01.cfg')
    procDir = os.path.normpath('D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//wdir02//ncbiProcDir')

    config = Config(open(cfgPath), 'pPPS')
    dbPath = os.path.normpath(config.get('databaseFile'))

    data = DBData(procDir, dbPath)
    print(data.getGenomeWgsCount(ncbid, 3))

    # Disabled earlier variant that called a module-level getGenomeWgsCount:
    #config = Config(open(os.path.normpath('//AM//metagenomic//work//projects//pPPS//tests//TW//TW01//config.cfg')), 'pPPS')

    #threshold = 3
    #dir = 'D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//genomes'
    #dir = '//AM//metagenomic//work//projects//pPPS//tests//TW//TW01//ncbiProcDir'

    #databaseFile = os.path.normpath(config.get('databaseFile'))
    #taxonomicRanks = config.get('taxonomicRanks').split(',')

    #count = getGenomeWgsCount(ncbid, threshold, dir, databaseFile, taxonomicRanks)

    #print count, 'genomes/wgs for ncbid:', ncbid


#if __name__ == "__main__":
  #test(122)
  #haveData(126)
  #haveData(84999) #Coriobacteriales
  #haveData(171549) #Bacteroidales
  #haveData(815) #Bacteroidaceae
  #haveData(171551) #Porphyromonadaceae
  #haveData(171552) #Prevotellaceae
  #haveData(171550) #Rikenellaceae
  ###test(976) #Bacteroidetes
  #haveData(200666) #Sphingobacteriales
  #haveData(768503) #Cytophagia
  #haveData(117743) #Flavobacteria
  #haveData(475963) #Caldilineales
  #haveData(292625) #Anaerolineae
  #haveData(200795) #Chloroflexi
  #haveData(204431) #Fibrobacteraceae (59374, 834)
  #haveData(186803) #Lachnospiraceae
  #haveData(541000) #Ruminococcaceae
  #haveData(186802) #Clostridiales
  #haveData(31979) #Clostridiaceae
  #haveData(186806) #Eubacteriaceae
  #haveData(186807) #Peptococcaceae
  #haveData(31977) #Veillonellaceae
  #haveData(186801) #Clostridia
  #haveData(128827) #Erysipelotrichaceae
  #haveData(1239) #Firmicutes
  #haveData(91061) #Bacilli
  #haveData(255528) #Victivallaceae (340101)
  #haveData(126) #Planctomycetaceae
  #haveData(481) #Neisseriaceae
  #haveData(213121) #Desulfobulbaceae (577650, 177439, 589865)
  #haveData(213421) #Desulfuromonaceae
  #haveData(69541) #Desulfuromonadales
  #haveData(72294) #Campylobacteraceae
  #haveData(1224) #Proteobacteria
  #haveData(28211) #Alphaproteobacteria
  #haveData(1236) #Gammaproteobacteria
  #haveData(137) #Spirochaetaceae
  #haveData(186333) #Anaeroplasmataceae
  #haveData(186332) #Anaeroplasmatales
  #haveData(31969) #Mollicutes

  #haveData(278082) #Victivallales
  #haveData(256845) #Lentisphaerae