Example #1
    def __call__(self, track, slice=None):

        fn = os.path.join(
            DATADIR,
            "replicated_intervals/%(track)s.peakshape.gz.matrix_%(slice)s.gz" %
            locals())
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)

        nrows = len(rownames)
        if nrows == 0:
            return
        if nrows > self.scale:
            take = numpy.array(numpy.floor(
                numpy.arange(0, nrows,
                             float(nrows + 1) / self.scale)),
                               dtype=int)
            rownames = [rownames[x] for x in take]
            matrix = matrix[take]

        return odict(
            (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
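The down-sampling block above keeps large matrices drawable by taking at most self.scale evenly spaced rows. A minimal, self-contained sketch of the same indexing trick (the helper name and the toy data are illustrative, not part of the tracker):

import numpy

def downsample_rows(matrix, rownames, scale):
    # pick ~scale evenly spaced row indices so very large matrices stay plottable
    nrows = len(rownames)
    if nrows <= scale:
        return matrix, rownames
    take = numpy.array(
        numpy.floor(numpy.arange(0, nrows, float(nrows + 1) / scale)),
        dtype=int)
    return matrix[take], [rownames[i] for i in take]

m = numpy.arange(50).reshape(10, 5)
sub, names = downsample_rows(m, ["row%i" % i for i in range(10)], scale=4)
print(sub.shape)   # (4, 5)
print(names)       # ['row0', 'row2', 'row5', 'row8']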
Example #2
def buildPFAMDomains( infiles, outfile ):
    '''map PFAM domains onto current sequence collection. 
    The mapping is done by ID lookup.'''
    
    infile = infiles[0]
    with IOTools.openFile( "nrdb50.fasta.tsv") as inf:

        reader = csv.DictReader( inf, dialect='excel-tab' )
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']
    
    rx = re.compile( "(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);" )

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    with IOTools.openFile( infile ) as inf:
        for entry in FastaIterator.iterate( inf ):
            c.input += 1
            pid, start, end, pfam_id, description = rx.match( entry.title ).groups()
            try:
                outf.write( "%s\t%i\t%i\t%s\n" % (map_id2nid[pid], int(start)-1, int(end), pfam_id ) )
            except KeyError:
                c.missed += 1
                continue
            c.output += 1

    outf.close()
    E.info( c )
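The regular expression above assumes Pfam-style FASTA titles of the form "<id>/<start>-<end> <accession>;<description>;". A small sketch with a made-up title, showing how the groups are unpacked:

import re

rx = re.compile(r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")

title = "Q9XYZ1_HUMAN/23-120 PF00069.25;Protein kinase domain;"   # hypothetical title
pid, start, end, pfam_id, description = rx.match(title).groups()
print(pid, int(start) - 1, int(end), pfam_id)   # zero-based start, as in the outf.write above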
Example #3
    def __call__(self, track, slice=None):
        fn = "ortholog_pairs_with_feature.matrix2"
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)
        return odict((("matrix", matrix), ("rows", rownames), ("columns", colnames)))
Example #4
    def __call__(self, track, slice=None):
        fn = "ortholog_pairs_with_feature.matrix2"
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)
        return odict(
            (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
Example #5
    def __call__(self, track, slice = None):
        
        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict( ( ("transcript", np.mean(c_transcript)), ("gene",np.mean(c_gene) )) )
Example #6
def configToDictionary( config ):

    p = {}
    for section in config.sections():
        for key,value in config.items( section ):
            v = IOTools.convertValue( value )
            p["%s_%s" % (section,key)] = v
            if section in ( "general", "DEFAULT" ):
                p["%s" % (key)] = v
               
    for key, value in config.defaults().iteritems():
        p["%s" % (key)] =  IOTools.convertValue( value )
        
    return p
Example #7
 def __call__(self, track, slice = None):
     
     if slice == "transcript":
         lengths_transcripts = []
         for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
             length = sum([gtf.end - gtf.start for gtf in transcript])
             lengths_transcripts.append(length)
         return np.mean(lengths_transcripts)
     
     elif slice == "gene":
         lengths_genes = []
         for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
             length = sum([gtf.end - gtf.start for gtf in gene])
             lengths_genes.append(length)
         return np.mean(lengths_genes)
Example #8
def buildSummaryCalledDMRs( infiles, outfile ):
    '''build summary of differentially methylated regions.'''
    
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n" )

    for track in TRACKS:
        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and sql LIKE '%%control_mean%%' and sql LIKE '%%treatment_mean%%'""" % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT 
                         COUNT(*) as ntested, 
                         SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok, 
                         SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant, 
                         SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold 
                         FROM medip_%(track)s.%(table)s"""

            ntested, nok, nsignificant, n2fold = cc.execute( statement % locals() ).fetchone()

            outf.write( "\t".join( map(str, (track, table, ntested, nok, nsignificant, n2fold )))+ "\n" )

    outf.close()
Example #9
def buildSummaryMapping( infiles, outfile ):
    
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    
    table = "bam_stats"

    colnames = None
    for track in TRACKS:
        
        statement = """SELECT * 
                         FROM medip_%(track)s.%(table)s"""
        
        data = cc.execute( statement % locals() ).fetchall()
        _colnames = [x[0] for x in cc.description]
        if not colnames:
            colnames = _colnames
            outf.write( "\t".join( ["metatrack"] + colnames,) + "\n"  )

        assert colnames == _colnames

        for row in data:
            outf.write( "\t".join( map(str, (track,) + row))+ "\n" )

    outf.close()
Example #10
def getParameters(filename="pipeline.ini"):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a
    plain key (without the section prefix) does not already
    exist, it is added as well.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input' : "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.
    
    '''
    p = {}

    config = ConfigParser.ConfigParser()
    config.readfp(open(filename, "r"))

    for section in config.sections():
        for key, value in config.items(section):
            v = IOTools.convertValue(value)
            if key not in p: p[key] = v
            p["%s_%s" % (section, key)] = v

    PARAMS.update(p)

    return p
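A minimal sketch of the section/key flattening described in the docstring, reading the example configuration from a string; raw string values stand in for IOTools.convertValue, which is CGAT-specific:

import ConfigParser
import StringIO

ini = "[general]\ninput=input1.file\n\n[special]\ninput=input2.file\n"

config = ConfigParser.ConfigParser()
config.readfp(StringIO.StringIO(ini))

p = {}
for section in config.sections():
    for key, value in config.items(section):
        if key not in p:
            p[key] = value
        p["%s_%s" % (section, key)] = value

# p now maps 'general_input' and 'special_input' to their values and
# 'input' to "input1.file", the first section encountered.
print(p)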
Example #11
def buildSummaryCpGCoverage(infiles, outfile):
    '''build summary of differentially methylated regions.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n")

    for track in TRACKS:

        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and name LIKE '%%coveredpos%%' """ % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT '%(track)s' as metatrack,
                         '%(table)s' as track,
                         coverage, ncovered, pcovered FROM medip_%(track)s.%(table)s"""

            for x in cc.execute(statement % locals()):
                outf.write("\t".join(map(str, x)) + "\n")

    outf.close()
Example #12
def buildSummaryCalledDMRs(infiles, outfile):
    '''build summary of differentially methylated regions.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n")

    for track in TRACKS:
        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and sql LIKE '%%control_mean%%' and sql LIKE '%%treatment_mean%%'""" % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT 
                         COUNT(*) as ntested, 
                         SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok, 
                         SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant, 
                         SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold 
                         FROM medip_%(track)s.%(table)s"""

            ntested, nok, nsignificant, n2fold = cc.execute(
                statement % locals()).fetchone()

            outf.write(
                "\t".join(map(str, (track, table, ntested, nok, nsignificant, n2fold))) + "\n")

    outf.close()
Example #13
def buildSummaryMapping(infiles, outfile):

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")

    table = "bam_stats"

    colnames = None
    for track in TRACKS:

        statement = """SELECT * 
                         FROM medip_%(track)s.%(table)s"""

        data = cc.execute(statement % locals()).fetchall()
        _colnames = [x[0] for x in cc.description]
        if not colnames:
            colnames = _colnames
            outf.write("\t".join(["metatrack"] + colnames,) + "\n")

        assert colnames == _colnames

        for row in data:
            outf.write("\t".join(map(str, (track,) + row)) + "\n")

    outf.close()
Example #14
def buildSummaryCpGCoverage( infiles, outfile ):
    '''build summary of differentially methylated regions.'''
    
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n" )

    for track in TRACKS:

        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and name LIKE '%%coveredpos%%' """ % track
                                            ).fetchall()]
        

        for table in tables:
            
            statement = """SELECT '%(track)s' as metatrack,
                         '%(table)s' as track,
                         coverage, ncovered, pcovered FROM medip_%(track)s.%(table)s"""

            for x in cc.execute(statement % locals()):
                outf.write( "\t".join(map(str,x))+ "\n" )

    outf.close()
Example #15
def getParameters( filename = "pipeline.ini" ):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a
    plain key (without the section prefix) does not already
    exist, it is added as well.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input' : "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.
    
    '''
    p = {}
    
    config = ConfigParser.ConfigParser()
    config.readfp(open(filename, "r"))

    for section in config.sections():
        for key,value in config.items( section ):
            v = IOTools.convertValue( value )
            if key not in p: p[key] = v
            p["%s_%s" % (section,key)] = v

    PARAMS.update( p )

    return p
Example #16
    def __call__(self, track, slice = None):

        classes = ["antisense"
              , "antisense_upstream"
              , "antisense_downstream"
              , "sense_upstream"
              , "sense_downstream"
              , "intergenic" 
              , "sense_intronic" 
              , "antisense_intronic"]

        coding_set = {}
        for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding":collections.defaultdict(int)}
        total_nc = float(self.getValue("SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
        for c in classes:
            result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id""" % (track,c)))/total_nc)*100

        
        total_c = len(coding_set.keys())
        for c in classes:
            ids = self.getValues("SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
            for i in ids:
                if i in coding_set.keys():
                    if coding_set[i] == c:
                        result["coding"][c] += 1
            
        for x, y in result["coding"].iteritems():
            result["coding"][x] = (float(y)/total_c)*100
            
        return result
Example #17
    def __call__(self, track, slice = None):
        fn = os.path.join( DATADIR, "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" % locals() )
        if not os.path.exists( fn ): 
            return
        
        matrix, rownames, colnames = IOTools.readMatrix( IOTools.openFile( fn ))
        nrows = len(rownames)
        if nrows == 0: return

        if nrows > 1000:
            take = numpy.array( numpy.floor( numpy.arange( 0, nrows, nrows / 1000 ) ), dtype = int )
            rownames = [ rownames[x] for x in take ]
            matrix = matrix[ take ]
            
        return odict( (('matrix', matrix),
                       ('rows', rownames),
                       ('columns', colnames)) )
Example #18
    def getReferenceLincRNA(self, reference_gtf):

        lincs = []
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                if entry.gene_id not in lincs:
                    lincs.append(entry.gene_id)
        return len(lincs)
Example #19
    def getReferenceLincRNA(self, reference_gtf):

        lincs = []
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                if entry.gene_id not in lincs:
                    lincs.append(entry.gene_id)
        return len(lincs)
Example #20
 def __call__(self,track, slice = None):
     
     transcript_counts = collections.defaultdict( set )
     counts = []
     for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
         transcript_counts[gtf.gene_id].add(gtf.transcript_id)
     for gene, transcripts in transcript_counts.iteritems():
         counts.append(len(transcripts))
     return counts
Example #21
def checkBlastRuns( infiles, outfile ):
    '''check if output files are complete.
    '''
    
    outf = IOTools.openFile( outfile, "w" )

    outf.write( "chunkid\tquery_first\tquery_last\tfound_first\tfound_last\tfound_total\tfound_results\thas_finished\tattempts\t%s\n" %\
                    "\t".join(Logfile.RuntimeInformation._fields))

    for infile in infiles:
        E.debug( "processing %s" % infile)
        chunkid = P.snip( os.path.basename( infile ), ".blast.gz" )
        logfile = infile + ".log"
        chunkfile = P.snip( infile, ".blast.gz" ) + ".fasta"

        with IOTools.openFile( infile ) as inf:
            l = inf.readline()
            ids = set()
            total_results = 0
            for l in inf:
                if l.startswith("#//"): continue
                ids.add( int(l.split("\t")[0] ) )
                total_results += 1
            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

        l = IOTools.getFirstLine( chunkfile )
        query_first = l[1:-1]
        l2 = IOTools.getLastLine( chunkfile, nlines = 2).split("\n")
        query_last = l2[0][1:]

        logresults = Logfile.parse( logfile )
        
        outf.write( "\t".join( map(str, (\
                        chunkid, query_first, query_last,
                        found_first, found_last,
                        found_total, total_results,
                        logresults[-1].has_finished,
                        len(logresults),
                        "\t".join( map(str, logresults[-1]) ) ) ) ) + "\n" )
        
    outf.close()
Example #22
 def __call__(self, track, slice = None):
     pattern = self.pattern
     fn = os.path.join( DATADIR, "liver_vs_testes/%(track)s%(pattern)s.matrix_%(slice)s.gz" % locals() )
     if not os.path.exists( fn ): 
         return
     
     x = IOTools.openFile( fn )
     matrix, rownames, colnames = IOTools.readMatrix( x )
     
     nrows = len(rownames)
     if nrows == 0: return
     if nrows > self.scale:
         take = numpy.array( numpy.floor( numpy.arange( 0, nrows, float(nrows + 1) / self.scale ) ), dtype = int )
         rownames = [ rownames[x] for x in take ]
         matrix = matrix[ take ]
         
     return odict( (('matrix', matrix),
                    ('rows', rownames),
                    ('columns', colnames)) )
Example #23
    def __call__(self, track, slice = None):
        
        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_transcripts, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_transcripts))) )

        
        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_genes, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_genes))) )
Example #24
 def __call__(self,track, slice = None):
     
     transcript_counts = collections.defaultdict( set )
     counts = []
     for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
         transcript_counts[gtf.gene_id].add(gtf.transcript_id)
     for gene, transcripts in transcript_counts.iteritems():
         counts.append(len(transcripts))
     count, lower, dx, _ = scipy.stats.cumfreq(counts, numbins=40, defaultreallimits=(1,15))
     x = np.arange(count.size) * dx + lower
     return odict( (("transcript number", x), ("cumulative frequency", count/len(counts))) )
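Both trackers above rely on scipy.stats.cumfreq, which returns the cumulative count per bin together with the lower limit and the bin width; the x axis is rebuilt from those two numbers. A small stand-alone sketch with toy values:

import numpy as np
import scipy.stats

values = [100, 250, 300, 900, 1200, 5000, 12000]
counts, lower, dx, _ = scipy.stats.cumfreq(values, numbins=5,
                                            defaultreallimits=(0, 20000))
x = np.arange(counts.size) * dx + lower
# pairs of (bin start, cumulative fraction of values seen so far)
print(list(zip(x, counts / len(values))))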
Example #25
def buildNrdb50( infile, outfile ):
    '''build nrdb50
    
    Renumber sequences.'''
    
    outf_fasta = IOTools.openFile( outfile, "w" )
    outf_table = IOTools.openFile( outfile + ".tsv", "w" )
    outf_table.write("nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n" )

    rx = re.compile( "(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)" )

    nid = 1
    for entry in FastaIterator.iterate( IOTools.openFile( infile )):
        outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence ) )
        cluster_name, description, cluster_size, taxon, repid = rx.match( entry.title ).groups()
        hid = computeHID( entry.sequence )
        outf_table.write( "\t".join( (str(nid), cluster_name, hid, description, cluster_size, taxon, repid)) + "\n" )
        nid += 1

    outf_fasta.close()
    outf_table.close()
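The regular expression above assumes UniRef-style FASTA titles of the form "<cluster> <description> n=<size> Tax=<taxon> RepID=<representative>". A small sketch on a made-up title:

import re

rx = re.compile(r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")

title = "UniRef50_P12345 Hypothetical protein n=42 Tax=Homo sapiens RepID=P12345_HUMAN"
cluster_name, description, cluster_size, taxon, repid = rx.match(title).groups()
print(cluster_name, cluster_size, taxon, repid)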
Example #26
def getNumColumns( filename ):
    '''return number of fields in bed-file by looking at the first 
    entry.
    
    Returns 0 if file is empty.
    '''
    with IOTools.openFile( filename ) as inf:
        for line in inf:
            if line.startswith("#"): continue
            if line.startswith("track"): continue
            return len(line[:-1].split("\t"))
    return 0
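A tiny illustration of the behaviour described in the docstring, using an in-memory list of lines instead of IOTools.openFile:

lines = ["# a comment\n", "track name=test\n", "chr1\t10\t20\tfeature1\n"]
ncolumns = 0
for line in lines:
    if line.startswith("#") or line.startswith("track"):
        continue
    ncolumns = len(line[:-1].split("\t"))
    break
print(ncolumns)   # 4, i.e. a BED4 file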
Example #27
    def testMCL(self):
        tab_path = Path('./tab.txt')
        ival = 30
        pival = 18

        df_chr, sample_list = LoadInput.loadToPandas('testdata2.hdf',
                                                     'testdata2.tsv',
                                                     'A',
                                                     filter=False)
        sections = [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
        pool = Pool(initializer=pc.mclInit,
                    initargs=(sample_list, tab_path, ival, pival))

        IOTools.writeTab(sample_list, 'tab.txt')
        clusters = pool.map(
            pc.mclWorker,
            [df_chr.loc[(slice(None), section), :] for section in sections],
            chunksize=10)
        expected = [[['A', 'B', 'C']]]

        assert clusters == expected
Example #28
File: Bed.py Project: yangjl/cgat
def getNumColumns(filename):
    '''return number of fields in bed-file by looking at the first 
    entry.
    
    Returns 0 if file is empty.
    '''
    with IOTools.openFile(filename) as inf:
        for line in inf:
            if line.startswith("#"): continue
            if line.startswith("track"): continue
            return len(line[:-1].split("\t"))
    return 0
Example #29
def checkBlastRun( infiles, outfile ):
    '''build summary stats on file.'''

    pairsdbfile, seqfile = infiles
    
    nids = set()
    with IOTools.openFile( seqfile ) as inf:
        for r in FastaIterator.iterate( inf ):
            nids.add( int(r.title) )

    with IOTools.openFile( pairsdbfile ) as inf:
        query_ids, sbjct_ids = set(), set()
        total_results, self_links = 0, 0
        for l in inf:
            if l.startswith("#//"): continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add( int(query_id) )
            sbjct_ids.add( int(sbjct_id) )
            if query_id == sbjct_id: self_links += 1
            total_results += 1

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "category\tcounts\n")
    outf.write( "\t".join( map(str, ('nids', len(nids)))) + "\n" )
    outf.write( "\t".join( map(str, ('links', total_results))) + "\n" )
    outf.write( "\t".join( map(str, ('self', self_links))) + "\n" )
    outf.write( "\t".join( map(str, ('queries', len(query_ids)))) + "\n" )
    outf.write( "\t".join( map(str, ('sbjcts', len(sbjct_ids)))) + "\n" )
    outf.close()

    outf = IOTools.openFile( outfile + '.missing_queries.gz', 'w' )
    outf.write( 'nid\n' )
    outf.write( "\n".join( map(str, sorted( list( nids.difference( query_ids )) ) )) + "\n" )
    outf.close()

    outf = IOTools.openFile( outfile + '.missing_sbjcts.gz', 'w' )
    outf.write( 'nid\n' )
    outf.write( "\n".join( map(str, sorted( list( nids.difference( sbjct_ids )) ) )) + "\n" )
    outf.close()
Example #30
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Example #31
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Example #32
def buildPFAMFamilies( infiles, outfile ):

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "family\tshort\tdescription\n" )
    
    infile = infiles[1]
    family, description, short = None, None, None
    c = E.Counter()
    with IOTools.openFile( infile ) as inf:
        for line in inf:
            if line.startswith( "#=GF AC"):
                if family:
                    outf.write( "%s\n" % "\t".join( (family,description,short)))
                    c.output += 1
                family = re.match("#=GF AC\s+(\S+)", line[:-1]).groups()[0]
            elif line.startswith( "#=GF DE"):
                description = re.match("#=GF DE\s+(.+)",line[:-1]).groups()[0]
            elif line.startswith( "#=GF ID"):
                short = re.match("#=GF ID\s+(.+)",line[:-1]).groups()[0]
    outf.write( "%s\n" % "\t".join( (family,description,short)))
    c.output += 1
    outf.close()
    E.info(c)
Example #33
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
Example #34
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
Example #35
def removeBlastUnfinished( infiles, outfile ):
    '''remove aborted blast runs.'''

    deleted = 0

    for infile in infiles:
        line = IOTools.getLastLine( infile )
        
        if not re.search( "job finished", line ):
            fn = infile[:-len(".log")]
            if os.path.exists( fn ):
                P.info("deleting %s" % fn )
                os.unlink( fn )
                deleted += 1

    P.info("deleted %i files" % deleted)
Example #36
def buildMatrixFromTables( infiles, column, column_header = 0, dtype = numpy.float, default = None ):
    '''build a matrix from a column called *column* in a series of input files.
   
    The column given by *column_header* (by default the first column) supplies the row names.

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''
    
    lists = []
    for infile in infiles:
        data = pandas.read_table( IOTools.openFile(infile) )
        lists.append( zip( list( data[column_header] ), list(data[column]) ) )
        
    return buildMatrixFromLists( lists, dtype = dtype, default = default )
Example #37
 def __call__(self, track):
     
     length = {}
     for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
         length[transcript[0].transcript_id] = sum([gtf.end - gtf.start for gtf in transcript])
     
     score = {}
     dbh = sqlite3.connect("csvdb")
     cc = dbh.cursor()
     for data in cc.execute("SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
         score[data[0]] = data[1]
 
     result = {"length": [], "score": []}
     for transcript, value in length.iteritems():
         result["length"].append(np.log10(length[transcript]))
         result["score"].append(score[transcript])
     return result
Example #38
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to species level analysis from metaphlan.
    The gi_taxid_nucl is a huge table and therefore this function
    takes an age to run - can think of optimising this somehow
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(IOTools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.items():
        rel_abundance[gi] = float(ab) / total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    result = collections.defaultdict(float)
    for gi in list(rel_abundance.keys()):
        E.info("processing gi %s" % gi)
        taxid = cc.execute(
            """SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" %
            gi).fetchone()[0]
        species_id = cc.execute(
            """SELECT species_id FROM categories WHERE taxid == '%s'""" %
            taxid).fetchone()[0]
        species_name = cc.execute(
            """SELECT taxname FROM names WHERE taxid == '%s' AND description == 'scientific name'"""
            % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
               (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.items():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()
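The function above first tallies reads per GI and turns the counts into fractions; the database lookups then translate each GI into a species name. A sketch of that first step with invented identifiers (the "...|<gi>|..." layout matches the split("|")[1] used above):

import collections

identifiers = ["gi|12345|ref|x", "gi|12345|ref|y", "gi|67890|ref|z"]   # hypothetical read titles
total = 0
rel_abundance = collections.defaultdict(int)
for ident in identifiers:
    total += 1
    gi = ident.split("|")[1]
    rel_abundance[gi] += 1
for gi, ab in rel_abundance.items():
    rel_abundance[gi] = float(ab) / total

print(dict(rel_abundance))   # {'12345': 0.666..., '67890': 0.333...}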
Example #39
def buildMatrixFromTables(infiles,
                          column,
                          column_header=0,
                          dtype=numpy.float,
                          default=None):
    '''build a matrix from a column called *column* in a series of input files.
   
    The column given by *column_header* (by default the first column) supplies the row names.

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''

    lists = []
    for infile in infiles:
        data = pandas.read_table(IOTools.openFile(infile))
        lists.append(zip(list(data[column_header]), list(data[column])))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
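A sketch of the per-file extraction step: each table contributes a list of (row name, value) pairs, which buildMatrixFromLists (not shown here) then aligns across files. The column names and data are illustrative:

import pandas
from StringIO import StringIO

tsv = "gene\tcount\ngeneA\t10\ngeneB\t3\n"
data = pandas.read_table(StringIO(tsv))
pairs = zip(list(data["gene"]), list(data["count"]))
print(list(pairs))   # [('geneA', 10), ('geneB', 3)]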
Example #40
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    coords_file=args[0]

    bamfile=pysam.Samfile( args[1], 'rb' )  # bamfile

    options.stdout.write( "gene_id\tcounts\tlength\n" )

    iter = Bed.iterator( IOTools.openFile( coords_file ) )
    for gene_id, exons in itertools.groupby( iter, lambda x: x.name ):

        num_reads=0
        
        anames=set([])
        lgene = 0

        for bed in exons:
            lgene += bed.end - bed.start
            for alignedread in bamfile.fetch(bed.contig, bed.start, bed.end):
                anames.add((alignedread.qname, alignedread.is_read1))

        num_reads = len(anames)
        options.stdout.write( "\t".join( (gene_id,
                                          str(num_reads),
                                          str(lgene ) )) + "\n" )

    ## write footer and output benchmark information.
    E.Stop()
Example #41
def iterator_sorted( gff_iterator, sort_order = "gene" ):
    '''sort input and yield sorted output.'''
    entries = list(gff_iterator)
    if sort_order == "gene":
        entries.sort( key = lambda x: (x.gene_id, x.transcript_id, x.contig, x.start) )
    elif sort_order == "gene":  # note: duplicate of the condition above - this branch is never reached
        entries.sort( key = lambda x: (x.gene_id, x.contig, x.start) )
    elif sort_order == "contig+gene":
        entries.sort( key = lambda x: (x.contig,x.gene_id,x.transcript_id,x.start) )
    elif sort_order == "transcript":
        entries.sort( key = lambda x: (x.transcript_id, x.contig, x.start) )
    elif sort_order == "position":
        entries.sort( key = lambda x: (x.contig, x.start) )
    elif sort_order == "position+gene":
        entries.sort( key = lambda x: (x.gene_id, x.start) )
        genes = list( flat_gene_iterator(entries) )
        genes.sort( key = lambda x: (x[0].contig, x[0].start) )
        entries = IOTools.flatten( genes )

    for entry in entries:
        yield entry
Example #42
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to species level analysis from metaphlan.
    The gi_taxid_nucl is a huge table and therefore this function
    takes an age to run - can think of optimising this somehow
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(IOTools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.iteritems():
        rel_abundance[gi] = float(ab)/total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    result = collections.defaultdict(float)
    for gi in rel_abundance.keys():
        E.info("processing gi %s" % gi)
        taxid = cc.execute("""SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" % gi).fetchone()[0]
        species_id = cc.execute("""SELECT species_id FROM categories WHERE taxid == '%s'""" % taxid).fetchone()[0]
        species_name = cc.execute("""SELECT taxname FROM names WHERE taxid == '%s' AND description == 'scientific name'""" % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" % (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.iteritems():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()
Example #43
File: GTF.py Project: yangjl/cgat
def iterator_sorted(gff_iterator, sort_order="gene"):
    '''sort input and yield sorted output.'''
    entries = list(gff_iterator)
    if sort_order == "gene":
        entries.sort(
            key=lambda x: (x.gene_id, x.transcript_id, x.contig, x.start))
    elif sort_order == "gene":  # note: duplicate of the condition above - this branch is never reached
        entries.sort(key=lambda x: (x.gene_id, x.contig, x.start))
    elif sort_order == "contig+gene":
        entries.sort(
            key=lambda x: (x.contig, x.gene_id, x.transcript_id, x.start))
    elif sort_order == "transcript":
        entries.sort(key=lambda x: (x.transcript_id, x.contig, x.start))
    elif sort_order == "position":
        entries.sort(key=lambda x: (x.contig, x.start))
    elif sort_order == "position+gene":
        entries.sort(key=lambda x: (x.gene_id, x.start))
        genes = list(flat_gene_iterator(entries))
        genes.sort(key=lambda x: (x[0].contig, x[0].start))
        entries = IOTools.flatten(genes)

    for entry in entries:
        yield entry
Example #44
    #######################################################################
    ## retrieve structure
    if options.filename_pdb:
        infile = open(options.filename_pdb, "r")
        pdb_lines = infile.readlines()
        infile.close()
    else:
        pdb_lines = os.popen(param_retrieval_command %
                             string.lower(param_pdb_id)).readlines()

    viewer = PdbTools.RasmolViewInline(pdb_lines, sys.stdout)
    viewer.Command("echo %s" % message)

    if options.filename_fasta:
        infile = open(options.filename_fasta, "r")
        description, reference_sequence = IOTools.readSequence(infile)
        infile.close()
    else:
        reference_sequence = None

    if DEBUG:
        viewer.Command("echo cmdline: %s" % (string.join(sys.argv, " ")))

    if not pdb_lines:
        viewer.Command("echo error: structure not found in local database")
        viewer.WriteScript()
        sys.exit()

    if reference_sequence:
        map_pdb2seq, rmap_pdb2seq, rmap_seq2pdb, lstructure, first_residue, last_residue, sequence = PdbTools.buildMapPdb2Sequence(
            reference_sequence, options.filename_pdb, options,
Example #45
#python ..\ChromatinImagingV2\Scripts\BatchAllnew.py
import sys, os
#add path
workbookDir = os.getcwd()
sys.path.append(os.path.dirname(workbookDir) + os.sep + r'\CommonTools')
import IOTools as io

if __name__ == "__main__":
    script = r'"' + workbookDir + os.sep + r'BatchSequentialSmall2colV3.py"'
    str_runs = []
    for i in range(10):
        str_runs.append('python ' + script + ' ' + str(i))
    io.batch_command(str_runs, batch_size=10)
Example #46
                      dest="dump",
                      action="store_true",
                      help="dump output.")

    parser.set_defaults(
        separator="|",
        dump=False,
        filename_map=None,
        filename_alignment="-",
        filename_tree=None,
    )

    (options, args) = E.Start(parser)

    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))

    E.debug("species map: %s" % str(map_species2sp))

    identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp)

    njtree = NJTree(identifier_parser=identifier_parser)

    njtree.SetLog(options.stdlog)
    njtree.SetErr(options.stderr)

    if options.filename_tree:
        njtree.SetSpeciesTree(options.filename_tree)

    mali = Mali.Mali()
    if options.filename_alignment == "-":
Example #47
def main(argv=None):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--output_file",
        type="string",
        default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")
    parser.add_option(
        "-u",
        "--url",
        type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(parser, "Common parameters",
                                      "Common arguments to the query")
    cqueryopts.add_option(
        "-s",
        "--study_id",
        dest="study_id",
        type="string",
        default=None,
        help=
        "[Required/OPtional]  cBioPortal ID for study [default=%default].\n This or study_name required for: getGeneticProfiles, getCaseLists, getProteinArrayInfo, getLink,getOncoprintHTML, getPercentAltered, getTotalAltered"
    )
    cqueryopts.add_option(
        "-n",
        "--study_name",
        dest="study_name",
        type="string",
        default=None,
        help=
        "[Required/Optional] cBioPortal Name for study [defualt=%default].\n See above for which commands require this."
    )
    cqueryopts.add_option(
        "-c",
        "--case_set_id",
        dest="case_set_id",
        type="string",
        default=None,
        help=
        "[Required for some] cBioPortal case_set_id specifying the case list to use.\nRequired for getProfileData, getMutationData, getClincalData, getProteinArrayData, getPercentAltered, getTotalAltered. Default is case_set_id for case list 'All Tumours' "
    )
    cqueryopts.add_option(
        "-g",
        "--gene_list",
        dest="gene_list",
        type="string",
        default=None,
        help=
        "[Required for some] Comma seperated list of HUGO gene symbols or Entrez gene IDs.\nRequired for getProfileData, getMutationData, getLink, getOncoprintHTML"
    )
    cqueryopts.add_option("-f",
                          "--gene_list_file",
                          dest="gene_list_file",
                          type="string",
                          default=None,
                          help="[Optional] Filename to read in gene_list from")
    cqueryopts.add_option(
        "-p",
        "--profile_id",
        dest="profile_id",
        type="string",
        help=
        "[Optional] Comma seperated list of cBioPortal genetic_profile_ids. If none are specified then the list of profiles for the study where display in analysis is True is used."
    )

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")
    squeryopts.add_option(
        "--protein_array_type",
        dest="protein_array_type",
        type="string",
        default="protein_level",
        help=
        "[Optional] Either protein_level or phosphorylation [default=%default]"
    )
    squeryopts.add_option(
        "--protein_array_id",
        dest="protein_array_id",
        type="string",
        help=
        "[Required for some] comma seperated list of one or more protein array IDs"
    )
    squeryopts.add_option(
        "--array_info",
        dest="protein_array_info",
        type="int",
        default=0,
        help=
        "[Optional] If 1, antibody infomation will also be exported in a getProteinArrayData query [default=%default]"
    )
    squeryopts.add_option(
        "--report",
        dest="report",
        type="string",
        default="full",
        help=
        "[Optional] Report type to display for getLink. Either full or oncoprint_html [default=%default] "
    )
    squeryopts.add_option(
        "--threshold",
        dest="threshold",
        type="int",
        default=2,
        help=
        "[Optional] Threshold for deciding if an alteration is significant for continuous metrics [default=%default]"
    )

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(parser,
                              add_pipe_options=False,
                              add_output_options=False,
                              argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    results = []

    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())

    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(
            portal.getProfileData(gene_list=gene_list,
                                  genetic_profile_id=profile_id))

    if "getMutationData" in args:
        results.append(
            portal.getMutationData(gene_list=gene_list,
                                   genetic_profile_id=profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(
            portal.getProteinArrayInfo(
                gene_list=gene_list,
                protein_array_type=options.protein_array_type))

    if "getProteinArrayData" in args:
        results.append(
            portal.getProteinArrayData(
                protein_array_id=options.protein_array_id,
                array_info=options.protein_array_info))

    if "getPercentAltered" in args:
        results.append(
            portal.getPercentAltered(gene_list=gene_list,
                                     genetic_profile_id=profile_id,
                                     threshold=options.threshold))

    if "getLink" in args:
        results.append(
            portal.getLink(gene_list=gene_list, report=options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list=gene_list))

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    for result in results:
        try:
            outf.write(tableToString(result))
        except:
            outf.write(result)

    E.Stop()
Example #48
def main(config_file_path):
    #config loading
    var_list = ['base_directory', 'organism', 'input_type', 'file_name', 'section_length', 'S1_iVal', 'S1_piVal', 'S2_iVal', 'S2_piVal', \
                'reference', 'optimize']

    config_file_path = sys.argv[1]
    config = configparser.SafeConfigParser()
    config.read(config_file_path)

    #setup
    start_time = time.time()

    #Settings
    try:
        output_directory = Path(config.get('Settings', 'output_directory'))
        input_path = Path(config.get('Settings', 'input_path'))
        prefix = input_path.parts[-1].split('.')[0]

        #set these according to config info
        s1_params = pw.ParamWrapper()
        s2_params = pw.ParamWrapper()

        s1_params.setSectionLength(config.getint('Settings', 'section_length'))
        s1_params.setIVal(config.getfloat('Settings', 'S1_iVal'))
        s1_params.setPiVal(config.getfloat('Settings', 'S1_piVal'))

        s2_params.setIVal(config.getfloat('Settings', 'S2_iVal'))
        s2_params.setPiVal(config.getfloat('Settings', 'S2_piVal'))

        #set stuff for autogroup
        s2_params.setIMax(10)
        s2_params.setIMin(2)
        s2_params.setIStep(0.5)
        s2_params.setPiMax(10)
        s2_params.setPiMin(1)
        s2_params.setPiStep(0.5)

        reference = config.get('Settings', 'reference')
        autogroup = bool(config.getboolean('Settings', 'autogroup'))

    except:
        raise RuntimeError('Error reading configuration file')

    #output paths
    if not output_directory.is_dir():
        output_directory.mkdir()
    os.chdir(output_directory)

    cytoscape_path = Path("{0}.xgmml".format(prefix))
    json_path = Path("{0}.json".format(prefix))

    tab_network_path = Path("chromosome_paintings.tsv")
    matrixout_path = Path("overall_similarity.tsv")
    heatmaps_path = Path("heatmaps.pdf")

    density_path = Path("density.txt")
    group_path = Path("groups.txt")
    tab_path = Path("tab.txt")

    colorout_path = Path("colors.txt")
    log_path = Path("log.txt")
    nn_out_path = Path("{0}_nn.tsv".format(prefix))

    hdf_path = input_path.parent / '{0}.h5'.format(prefix)
    matrices_hdf_path = input_path.parent / '{0}_matrices.h5'.format(prefix)
    save_state_path = input_path.parent / '{0}_savestate.json'.format(prefix)

    #other variables
    logger = configLogger(log_path)

    #sanitize input
    try:
        assert (input_path.is_file())
        assert (0 <= s1_params.getPiVal() <= 20)
        assert (0 <= s1_params.getIVal() <= 20)
        assert (0 <= s2_params.getPiVal() <= 20)
        assert (0 <= s2_params.getIVal() <= 20)
    except:
        raise ValueError('Configuration file contains bad values')

    #let's log some params used later
    logger.info('config loaded')

    #Input Processing

    #true means need to cluster
    if not io.checkPrimaryClustering(s1_params, save_state_path):
        logger.info('Primary Clustering exists, loading existing matrices')
        save_state, matrices = io.loadSaveState(save_state_path,
                                                matrices_hdf_path)
        sample_list = save_state['sample_list']
        chr_names = save_state['chr_names']
        chr_breaks = save_state['chr_breaks']
        io.writeTab(sample_list, tab_path)
    else:
        #tabular data from GTAK loaded to pandas
        df, sample_list = loadToPandas(hdf_path, input_path, reference,
                                       s1_params, True)
        os.chdir(output_directory)

        logger.info('Start Primary Clustering')
        io.writeTab(sample_list, tab_path)
        #TODO: check for whether we're skipping primary clustering
        clusters, chr_names, chr_breaks = primaryCluster(
            df, sample_list, s1_params, logger)
        genNNData(clusters, chr_names, chr_breaks, s1_params, sample_list,
                  nn_out_path)
        io.writePrimaryClusters(chr_names, chr_breaks, clusters,
                                Path('pclusters.txt'))
        matrices = at.clustersToMatrix(clusters, sample_list)
        logger.info('Writing Save State')
        io.writeSaveState(s1_params, sample_list, chr_names, chr_breaks,
                          matrices, save_state_path, matrices_hdf_path)

    overall_matrix = at.overallMatrix(matrices)

    logger.info('Start Secondary Clustering')
    group_names, overall_clusters = sc.group(overall_matrix, tab_path,
                                             group_path, s2_params, logger,
                                             autogroup)

    color_table = at.createColorTable(group_names, overall_clusters,
                                      sample_list)
    color_table.to_csv(colorout_path)

    logger.info('calculating composition')
    condensed_matrices = gc.condenseToGroupMatrix(matrices, group_names,
                                                  overall_clusters,
                                                  sample_list)
    composition = gc.getChromosomePaintings(condensed_matrices, chr_breaks,
                                            overall_clusters, group_names,
                                            sample_list)

    #    for the whole thing
    logger.info('writing output')
    io.writeTabularPainting(composition, chr_names,
                            s1_params.getSectionLength(), sample_list,
                            tab_network_path)
    io.writeOverallMatrix(overall_matrix, matrixout_path)
    exporter.parse(overall_matrix, color_table, composition, group_names,
                   overall_clusters, sample_list, prefix)
    print("PopNet Completed")
    print('Run time was {0} seconds'.format(time.time() - start_time))
Example #49
                  -1]  #zxy coords of chromosomes already in the right position

    #Decide where to save the candidate positions of the hybe
    fl_cands = analysis_folder + os.sep + file_.replace(
        '.dax', '__current_cand.pkl')  #file where to save candidates
    fl_cor = analysis_folder + os.sep + file_.replace(
        '.dax', '__drift.pkl')  #file where to save drift correction
    fl_cor_fls = analysis_folder + os.sep + file_.replace(
        '.dax', '__driftfiles.npy')

    print fl_cands

    candid_spot = {}

    #load data (pseudo-memory mapping)
    daxs_signal, names_signal, daxs_beads, names_beads = io.get_ims_fov_daxmap(
        folders_keep, file_, col_tags=None, pad=10)
    #compute the drift for the field of view
    if len(daxs_beads) > 1:
        txyz_both, _ = ft.get_STD_beaddrift_v2(daxs_beads,
                                               sz_ex=sz_ex,
                                               hseed=hseed_beads,
                                               nseed=nbeads,
                                               ref=None,
                                               force=force_drift,
                                               save_file=fl_cor)
        txyz = np.mean(txyz_both, 1)
        txyz = np.array(txyz) - [txyz[ref]]
        np.save(fl_cor_fls, np.array(folders_keep))
    #repeat for colors
    #num_col = int(len(daxs_signal)/len(daxs_beads))
    #iterate through folders
Example #50
    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames."
                )

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(open(f, "r"),
                                           with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers and titles:
        headers = titles
    else:
        headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
Example #51
def buildSCOPDomains( infiles, outfile ):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.
    '''
    
    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile('(\S+)\s(\S+)\s(.*)' )
    id2class = {}
    with IOTools.openFile( fastafile ) as inf:
        for x in FastaIterator.iterate( inf ):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence) )
            
    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile( linksfile ) as inf:
        nid2domains = collections.defaultdict( list )
        ndomains = 0
        for line in inf:
            if line.startswith('query_nid'): continue
            if line.startswith('#'): continue
            counter.links += 1
            
            domain_id, nid, evalue, domain_start, domain_end, sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()
            
            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(int, \
                                                                       ( nid, domain_start, domain_end, sbjct_start, sbjct_end ))

            family, length = id2class[domain_id]

            cls, fold, superfamily, family = family.split('.')
            if cls not in classes: continue
            if float(domain_end - domain_start) / length < min_coverage: continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append( (superfamily, sbjct_start, sbjct_end ) )

        counter.sequences = len(nid2domains)

    E.info( 'merging %i domains in %i sequences' % (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile( outfile, 'w' )
    outf.write('nid\tstart\tend\tfamily\n')
    for nid, dd in sorted(nid2domains.iteritems()):
        for family, domains in itertools.groupby( dd, key = lambda x: x[0] ):
            unmerged_domains = [ (x[1],x[2]) for x in domains ]
            merged_domains = Intervals.combine( unmerged_domains )
            for start, end in merged_domains:
                counter.domains += 1
                outf.write( '%i\t%i\t%i\t%s\n' % (nid, start, end, family ) )
    outf.close()

    E.info( counter )
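The merging step collapses overlapping domains with the same superfamily on one sequence. A stand-alone sketch of that operation, standing in for Intervals.combine, whose exact behaviour is assumed here:

def combine(intervals):
    '''merge overlapping or touching (start, end) intervals.'''
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

print(combine([(10, 50), (40, 80), (200, 250)]))   # [(10, 80), (200, 250)]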
Example #52
 def testOutPut(self):
     overall = pd.DataFrame([[2, 0, 2], [0, 2, 0], [2, 0, 2]],
                            columns=['A', 'B', 'C'],
                            index=['A', 'B', 'C'])
     io.writeOverallMatrix(overall, 'zzz.txt')
Example #53
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e",
                      "--extension",
                      dest="extension",
                      type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-l",
                      "--fragment-length",
                      dest="fragment_length",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option(
        "-s",
        "--saturation-iterations",
        dest="saturation_iterations",
        type="int",
        help="iterations for saturation analysis [default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option(
        "-w",
        "--bigwig",
        dest="bigwig",
        action="store_true",
        help="store wig files as bigwig files - requires a genome file "
        "[default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
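        # contig sizes from the indexed genome are needed later for the
        # wig-to-bigwig conversion (see the bigwig() calls below)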
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0: options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
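    # note: this local 'genome_file' is the BSgenome R package name passed to
    # MEDIPS, not options.genome_file (the indexed FASTA used for bigwig output)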
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()

    E.debug("temporary files are in %s" % tmpdir)

    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s", 
                       file = "%(filename_sample)s" ) ''' % locals())
    slotnames = (("extend", "extend",
                  "%i"), ("distFunction", "distance_function",
                          "%s"), ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length",
                  "%i"), ("bin_size", "bin_size",
                          "%i"), ("seq_pattern", "pattern",
                                  "%s"), ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns",
                  "%i"), ("cali_chr", "calibration_contig",
                          "%s"), ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, 
                       bin_size = %(bin_size)i, 
                       extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''')

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, 
                       fragmentLength = %(fragment_length)i, 
                       func = "count")''' % locals())

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    outfile.write("category\tvalue\n")

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, 
                            bin_size = %(bin_size)i, 
                            extend = %(extension)i, 
                            no_iterations = %(saturation_iterations)i, 
                            no_random_iterations = 1)''' % locals())

        R.png(E.getOutputFile("saturation.png"))
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )''' %
          E.getOutputFile("saturation_estimation.csv"))
        outfile.write("estimated_correlation\t%f\n" %
                      R('''sr.control$maxEstCor''')[1])
        outfile.write("true_correlation\t%f\n" %
                      R('''sr.control$maxTruCor''')[1])

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, 
                                extend = %(extension)i, 
                                no_iterations = 10)''' % locals())

        R.png(E.getOutputFile("cpg_coverage.png"))
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # three rows
        R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
          E.getOutputFile("saturation_coveredpos.csv"))
        # coverage threshold
        # number of CpG covered
        # percentage of CpG covered

        R('''write.csv( cr.control$matrix, file ='%s' )''' %
          E.getOutputFile("saturation_matrix.csv"))

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info("plotting calibration")
        R.png(E.getOutputFile("calibration.png"))
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
        R('''dev.off()''')

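    # export the MEDIPS parameters listed in 'slotnames' into the summary table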
    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname))
        if len(value) == 0: continue
        outfile.write("%s\t%s\n" % (label, pattern % value[0]))

    outfile.close()

    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile("rpm.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile("rms.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    ## write footer and output benchmark information.
    E.Stop()
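
A hypothetical way to invoke main() from the command line; the entry-point boilerplate below is an assumption (CGAT-style scripts of this kind usually end with sys.exit(main(sys.argv))), and the exact argv handling depends on E.Start:

if __name__ == "__main__":
    import sys
    # placeholder inputs: run saturation and coverage analysis on a BAM file
    # and export bigwig tracks
    sys.exit(main(["medips_analysis",
                   "--input-format=bam",
                   "--ucsc-genome=hg19",
                   "--genome-file=hg19.fasta",
                   "--toolset=saturation",
                   "--toolset=coverage",
                   "--bigwig",
                   "sample.bam"]))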