Example #1
def summarizeFastQC(infiles, outfiles):
    all_files = []
    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        all_files.extend(glob.glob(
            os.path.join(track + "*_fastqc",
                         "fastqc_data.txt")))

    dfs = PipelineReadqc.read_fastqc(
        all_files)

    for key, df in dfs.items():
        fn = re.sub("basic_statistics", key, outfiles[0])
        E.info("writing to {}".format(fn))
        with IOTools.open_file(fn, "w") as outf:
            df.to_csv(outf, sep="\t", index=True)
Example #2
    def removeObservationsPerc(self, percentile_rowsums=10):
        '''remove observations (e.g. genes)

        * remove the lowest percentile of rows in the table, sorted
           by total tags per row
        '''

        # percentile filtering
        percentile = float(percentile_rowsums) / 100.0
        sum_counts = self.table.sum(1)
        take = sum_counts >= sum_counts.quantile(percentile)
        E.info("percentile filtering at level %f: keep=%i, discard=%i" %
               (percentile_rowsums,
                sum(take),
                len(take) - sum(take)))
        self.table = self.table[take]
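
A self-contained way to exercise the same percentile filter is sketched below using only pandas; the toy table, sample names and threshold are invented for illustration.

import pandas as pd

# toy count table standing in for self.table; values are illustrative only
toy_counts = pd.DataFrame(
    {"sample_a": [0, 5, 100, 7], "sample_b": [1, 3, 80, 9]},
    index=["gene1", "gene2", "gene3", "gene4"])

percentile_rowsums = 25                      # drop the lowest 25% of rows
percentile = percentile_rowsums / 100.0
row_sums = toy_counts.sum(axis=1)            # total tags per row
keep = row_sums >= row_sums.quantile(percentile)
filtered = toy_counts[keep]
print("keep=%i, discard=%i" % (keep.sum(), (~keep).sum()))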
Example #3
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(inf):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
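
For readers without the CGAT FastaIterator module, the same exclude-by-identifier filtering is sketched below with a minimal FASTA parser; the file names and the identifier set are placeholders, not part of the original pipeline.

def iterate_fasta(handle):
    """yield (title, sequence) tuples from a FASTA file handle."""
    title, chunks = None, []
    for line in handle:
        line = line.rstrip()
        if line.startswith(">"):
            if title is not None:
                yield title, "".join(chunks)
            title, chunks = line[1:], []
        else:
            chunks.append(line)
    if title is not None:
        yield title, "".join(chunks)

ids_to_drop = {"seq2"}                      # placeholder identifier set
with open("input.fasta") as inf, open("mispriming_lib.fasta", "w") as outf:
    for title, sequence in iterate_fasta(inf):
        if title not in ids_to_drop:
            outf.write(">%s\n%s\n" % (title, sequence))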
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
        infile = options.stdin
    else:
        infile = fileinput.FileInput(args)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
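
Stripped of the option-parsing scaffolding, the header-deduplication loop above reduces to the sketch below; the input lines are invented, and `header` starts as None rather than False, which is equivalent here.

lines = [
    "# comment kept as-is\n",
    "gene\tcount\n",           # first header line is kept
    "g1\t10\n",
    "gene\tcount\n",           # repeated header from a concatenated file: dropped
    "g2\t20\n",
]

header = None
for line in lines:
    if line.startswith("#"):
        pass                    # comment lines are always emitted
    elif header is None:
        header = line           # remember the first header line
    elif line == header:
        continue                # skip repeated copies of the header
    print(line, end="")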
Example #5
    def run(self, *args, **PARAMS):

        # Custom command to run reference matching tool.
        statement, run_options = self.buildStatement(**PARAMS)

        # Logging
        runfiles = '\t'.join([os.path.basename(x) for x in (self.fastn1,
                                                            self.fastn2,
                                                            self.fastn3) if x])
        E.info("Running sortMeRNA for files: {}".format(runfiles))

        P.run(statement, job_options=run_options)

        # Post process results into generic output for downstream tasks.
        statement = self.postProcess(**PARAMS)
        if statement:
            P.run(statement, job_options=run_options)
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        infile = fileinput.FileInput(unknown)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Example #7
def estimateExpression(infiles, outfile):
    '''estimate expression levels.'''

    R.library("affy")

    E.info("reading data")

    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")

    eset = R.rma(raw_data)

    R.boxplot(raw_data)
    R.boxplot(eset)

    print(R.as_list(R.assayData(eset)))
Example #8
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed contig sizes."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    # windows are forced to be non-overlapping: the shift equals the window size
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while 1:
        ninput += 1
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
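
The windowing logic can be tried on a plain string without the FastaIterator dependency; the sequence, contig name and window size below are made up.

sequence = "ACGTGGCCNNNNACGTACGTGGGCCCATAT"   # invented sequence with a gap run
window_size = window_shift = 10
gap_cutoff = window_size // 2                 # at most 50% of a window may be gap

windows = []
for start in range(0, len(sequence), window_shift):
    s = sequence[start:start + window_size].upper()
    gc = sum(1 for c in s if c in "GC")
    at = sum(1 for c in s if c in "AT")
    if window_size - (gc + at) > gap_cutoff:  # mostly gaps or Ns: skip
        continue
    windows.append(("toy_contig", start, start + window_size, gc / (gc + at)))

print(windows)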
Example #9
def make_report():
    ''' Generates html and pdf versions of reStructuredText files
        using sphinx-quickstart pre-configured files (conf.py and Makefile).
        Pre-configured files need to be in a pre-existing report directory.
        Existing reports are not overwritten.
    '''
    report_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'pipeline_report'))
    print('Copying report templates from: {}'.format(report_path))

    if (os.path.exists(report_dir) and os.path.isdir(report_dir)
            and not os.listdir(report_dir)):
        statement = '''cp %(report_path)s/* pipeline_report ;
                       cd {} ;
                       ln -s ../pipeline.yml . ;
                       make html ;
                       ln -sf _build/html/report_pipeline_pq_example.html . ;
                       make latexpdf ;
                       ln -sf _build/latex/pq_example.pdf .
                    '''.format(report_dir)
        E.info('''Building pdf and html versions of your rst files in
                  {}.'''.format(report_dir))
        P.run(statement)

    elif (os.path.exists(report_dir) and os.path.isdir(report_dir)
          and os.listdir(report_dir)):
        sys.exit(''' {} exists, not overwriting. You can manually run:
                       cd {} ;
                       ln -s ../pipeline.yml . ;
                       make html ;
                       ln -sf _build/html/report_XXXX.html . ;
                       make latexpdf ;
                       ln -sf _build/latex/XXXX.pdf .
                       Or delete the folder and re-run make_report
                 '''.format(report_dir))

    else:
        sys.exit(''' The directory "pipeline_report" does not exist.
                     Are the paths correct?
                     An attempt was made to copy template files from:
                     {}
                     You can also manually copy files and run "make html" or
                     "make latexpdf".
                 '''.format(report_path))

    return
Example #10
def renameChromosomes(iterator, chr_map):

    ninput, noutput, nskipped = 0, 0, 0

    for bed in iterator:
        ninput += 1

        if bed.contig in chr_map:
            bed.contig = chr_map[bed.contig]
        else:
            nskipped += 1
            continue

        noutput += 1
        yield bed

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
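
The renaming pattern does not depend on Bed records; the sketch below uses plain dictionaries, and the contig map and intervals are invented.

chr_map = {"1": "chr1", "MT": "chrM"}          # invented Ensembl-to-UCSC style map

def rename_contigs(intervals, chr_map):
    """yield intervals whose contig is in the map, renamed; drop the rest."""
    for interval in intervals:
        if interval["contig"] not in chr_map:
            continue
        interval["contig"] = chr_map[interval["contig"]]
        yield interval

intervals = [{"contig": "1", "start": 0, "end": 100},
             {"contig": "KI270728.1", "start": 0, "end": 50}]
print(list(rename_contigs(intervals, chr_map)))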
Example #11
def runCPC(infile, outfile):
    '''
    run coding potential calculations on lncRNA geneset
    '''
    # farm.py is called from within cpc.sh
    assert iotools.which("farm.py"), \
        "farm.py needs to be in $PATH for cpc to run"
    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version:%s" % iotools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    P.run()
Example #12
def make_mapped_matrix(map_dict, input_frame):
    '''
    return a matrix with integer labels from mapping
    '''

    frame_index = input_frame.index.tolist()
    nindex = len(frame_index)
    ncols = len(input_frame.columns)
    integer_matrix = np.ndarray((nindex, ncols), dtype=np.int32)

    E.info("mapping cluster labels")
    for idx in range(nindex):
        for col in range(ncols):
            mod = input_frame.iloc[idx][col + 1]
            integer_matrix[idx][col] = map_dict[mod]

    return integer_matrix
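
A vectorised sketch of the same label-to-integer mapping is shown below on an invented two-column frame; it assumes the clustering columns are labelled 1 and 2, as the `col + 1` lookup above implies.

import numpy as np
import pandas as pd

# invented cluster-label table: rows are genes, columns are clusterings
input_frame = pd.DataFrame({1: ["red", "blue"], 2: ["blue", "blue"]},
                           index=["gene1", "gene2"])
map_dict = {"red": 0, "blue": 1}

# element-wise replacement instead of the nested index loop
integer_matrix = input_frame.replace(map_dict).to_numpy(dtype=np.int32)
print(integer_matrix)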
Example #13
def read_and_randomize_rows(infile, args):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if args.has_headers:
        keep_header = 1
    else:
        keep_header = 0
    for x in range(keep_header):
        c.header += 1
        args.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    args.stdout.write("".join(lines))
    c.lines_output = len(lines)
    E.info(c)
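
Without the option machinery, shuffle-but-keep-header comes down to a few lines; the table lines below are invented.

import random

lines = ["gene\tcount\n", "g1\t1\n", "g2\t2\n", "g3\t3\n"]

keep_header = 1                      # number of leading lines to leave in place
header, body = lines[:keep_header], lines[keep_header:]
random.shuffle(body)                 # shuffle the remaining rows in place
print("".join(header + body), end="")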
Example #14
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))

    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)' \
           % (dropped_reads, reads, percent_dropped))
Example #15
def renameChromosomes(gffs, chr_map):

    ninput, noutput, nskipped = 0, 0, 0

    for gff in gffs:

        ninput += 1

        if gff.contig in chr_map:
            gff.contig = chr_map[gff.contig]
        else:
            nskipped += 1
            continue

        noutput += 1
        yield gff

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
Example #16
def shiftIntervals(iterator, contigs, offset):
    """shift intervals by a certain offset and ensure size is maintaned even id contig end reached.

    contigs is a dictionary of contig sizes."""

    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range = 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: if we skip intervals off the end of the contig we should skip
        # ones off the start as well
        if bed.start < 0 or bed.end < 0:
            nskipped_range += 1
            continue
        # IMS: changing >= to > as bed is half-open
        if bed.end > contigs[bed.contig]:
            nskipped_range += 1
            continue
        noutput += 1

        # add offset to each start and end, and adjust for contig length
        length = bed.end - bed.start
        newstart = bed.start + offset
        newend = bed.end + offset
        if newstart < 0:
            newstart = 0
            newend = length
        if newend > contigs[bed.contig]:
            newstart = contigs[bed.contig] - length
            newend = contigs[bed.contig]

        bed.start = newstart
        bed.end = newend

        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, nskipped_range=%i" %
           (ninput, noutput, nskipped_contig, nskipped_range))
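
Clamping a shifted interval back into its contig while preserving its length can be isolated as below; the contig sizes and intervals are invented.

contigs = {"chr1": 1000}                     # invented contig sizes

def shift_interval(start, end, offset, contig_size):
    """shift [start, end) by offset, keeping the length and staying in bounds."""
    length = end - start
    new_start, new_end = start + offset, end + offset
    if new_start < 0:
        new_start, new_end = 0, length
    if new_end > contig_size:
        new_start, new_end = contig_size - length, contig_size
    return new_start, new_end

print(shift_interval(10, 60, -50, contigs["chr1"]))    # clamped to (0, 50)
print(shift_interval(980, 990, 50, contigs["chr1"]))   # clamped to (990, 1000)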
Example #17
def mergeBAMFiles(infiles, outfile):
    '''merge BAM files from the same experiment using a user-defined regex.

    For the mapping stages it is beneficial to perform mapping
    separately for each sequence read infile per sample so that
    consistency can be checked. However, downstream tasks require
    the merged :term:`bam` alignment files.

    Parameters
    ----------
    infiles : list
       list of :term:`bam` format alignment files
    outfile : str
       Output filename in :term:`bam` format
    '''

    if "merge_pattern_output" not in PARAMS or \
       not PARAMS["merge_pattern_output"]:
        raise ValueError("no output pattern 'merge_pattern_output' specified")

    if len(infiles) == 1:
        if not os.path.isfile(outfile):
            E.info("%(outfile)s: only one file for merging - creating "
                   "softlink" % locals())
            os.symlink(os.path.basename(infiles[0]), outfile)
            os.symlink(os.path.basename(infiles[0]) + ".bai", outfile + ".bai")
            return
        else:
            E.info("%(outfile)s: only one file for merging - softlink "
                   "already exists" % locals())
            return

    infiles = " ".join(infiles)
    tmp_bam = P.get_temp_filename(".")

    statement = '''
    samtools merge %(tmp_bam)s %(infiles)s >& %(outfile)s_merge.log &&
    samtools sort %(tmp_bam)s -o %(outfile)s &&
    samtools index %(outfile)s
    '''

    job_memory = '20G'

    P.run(statement)
Example #18
def runIDROnPooledPseudoreplicates(infiles, outfile):
    """
    Run IDR analysis on pooled pseudoreplicates for each EXPERIMENT
    """
    # set IDR parameters
    chr_table = os.path.join(PARAMS["annotations_dir"],
                             PARAMS["annotations_interface_contigs"])

    # get statement
    statement = IDR.getIDRStatement(infiles[0],
                                    infiles[1],
                                    outfile,
                                    PARAMS["idr_options_overlap_ratio"],
                                    PARAMS["idr_options_ranking_measure"],
                                    chr_table)

    # run
    E.info("applyIDR: processing %s and %s" % (infiles[0], infiles[1]))
    job_memory = "5G"
    P.run()
Example #19
    def count(self, filename1, filename2):
        """count overlap between two bed files."""

        E.info("counting started for %s versus %s" % (filename1, filename2))

        idx2 = self.buildIndex(filename2)

        (self.mExons1, self.mExonsOverlapping1, self.mBases1,
         self.mBasesOverlapping1) = self._count(filename1, idx2)

        self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
        self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

        idx1 = self.buildIndex(filename1)

        (self.mExons2, self.mExonsOverlapping2, self.mBases2,
         self.mBasesOverlapping2) = self._count(filename2, idx1)

        self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
        self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
Example #20
    def count(self, filename, track):
        """count overlap between two gtf files."""

        E.info("counting started for %s versus %s" % (filename, track))

        (self.mExons1, self.mExonsOverlapping1, self.mBases1,
         self.mBasesOverlapping1) = self._count(filename, self.mIndices[track])

        self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
        self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

        idx = self.buildIndex(filename)

        # count index against index
        (self.mExons2, self.mExonsOverlapping2, self.mBases2,
         self.mBasesOverlapping2) = self._countIndices(self.mIndices[track],
                                                       idx)

        self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
        self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
Example #21
def getTables(dbname):
    '''
    Retrieve the names of all tables in the database and group them
    in a dictionary keyed by annotation type.
    '''
    dbh = sqlite3.connect(dbname)
    c = dbh.cursor()
    statement = "SELECT name FROM sqlite_master WHERE type='table'"
    c.execute(statement)
    tables = c.fetchall()
    c.close()
    dbh.close()
    D = {}
    for t in tables:
        tname = t[0].replace("ensemblg2", "").split("$")
        E.info(tname)
        ttype = tname[0]
        D.setdefault(ttype, [])
        D[ttype].append(tname[1])
    return D
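
The sqlite_master query can be tried against an in-memory database; the table names below are invented but follow the `<annotation>$<suffix>` convention the function assumes.

import sqlite3

dbh = sqlite3.connect(":memory:")
c = dbh.cursor()
for name in ("go$summary", "go$details", "kegg$summary"):
    c.execute('CREATE TABLE "%s" (id INTEGER)' % name)

c.execute("SELECT name FROM sqlite_master WHERE type='table'")
grouped = {}
for (name,) in c.fetchall():
    ttype, suffix = name.split("$")
    grouped.setdefault(ttype, []).append(suffix)
dbh.close()
print(grouped)    # {'go': ['summary', 'details'], 'kegg': ['summary']}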
Example #22
    def __init__(self, filename, *args, **kwargs):

        assert filename is not None,\
            "please supply filename for CounterOverlap"

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename

        E.info("reading intervals from %s" % self.filename)

        self.index = Bed.readAndIndex(iotools.open_file(self.filename, "r"),
                                      per_track=True)

        E.info("read intervals for %s tracks" % len(self.index))

        self.tracks = list(self.index.keys())
        self.headers = []
        for track in self.tracks:
            self.headers.extend(["%s_nover" % track, "%s_bases" % track])
Example #23
    def buildGenomeAlignment(infiles, outfile):
        '''build pairwise genomic alignment from axt files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        for infile in infiles:
            E.info("adding %s" % infile)
            statement = '''gunzip < %(infile)s
            | axtToPsl
            /dev/stdin
            %(query)s.sizes
            %(target)s.sizes
            /dev/stdout
            | pslSwap /dev/stdin /dev/stdout
            | gzip >> %(outfile)s
            '''
            P.run()
Example #24
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--keep-header",
                      dest="keep_header",
                      type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
Example #25
def loadBAMStats(infiles, outfile):
    '''Import bam statistics into SQLite'''

    scriptsdir = PARAMS["general_scriptsdir"]
    header = ",".join(
        [P.snip(os.path.basename(x), ".readstats") for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | perl -p -e "s/unique/unique_alignments/"
                | cgat table2table --transpose
                | cgat csv2db
                      --allow-empty-file
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s"""
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --header-names=%(header)s
                      --skip-titles
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/%(suffix)s/"
                | cgat csv2db
                      --table=%(tname)s 
                      --allow-empty-file
                >> %(outfile)s """
        P.run()
Example #26
def summarizeReadCounts(infiles, outfile):
    '''Calculate the number of reads lost at each step for each sample'''

    with IOTools.open_file(outfile, 'w') as outf:
        outf.write("sample_id\tinput_reads\toutput_reads\tduplicates\t"
                   "adapter_contamination\trRNA\thost\tlow_complexity\t"
                   "duplicates_percent\tadapters_percent\trrna_percent\t"
                   "host_percent\tlow_complexity_perc\tremaining_percent\n")
        for infile in infiles:
            sample_id = P.snip(os.path.basename(infile),
                               '_read_count_summary.tsv')
            E.info('Processing sample %s' % sample_id)

            df = pd.read_table(infile, index_col=0, header=None)
            deadapt = df.loc['deadapt', 1]
            deduped = df.loc['deduped', 1]
            rrna = df.loc['rRNAremoved', 1]
            dehost = df.loc['dehost', 1]
            masked = df.loc['masked', 1]
            input_reads = df.loc['input', 1]

            lost_dup = input_reads - deduped
            lost_adapt = deduped - deadapt
            lost_rrna = deadapt - rrna
            lost_host = rrna - dehost
            lost_mask = dehost - masked

            lost_dup_perc = round(lost_dup / float(input_reads) * 100, 2)
            lost_adapt_perc = round(lost_adapt / float(input_reads) * 100, 2)
            lost_rrna_perc = round(lost_rrna / float(input_reads) * 100, 2)
            lost_host_perc = round(lost_host / float(input_reads) * 100, 2)
            lost_mask_perc = round(lost_mask / float(input_reads) * 100, 2)
            output_perc = round(masked / float(input_reads) * 100, 2)

            outf.write('\t'.join(
                map(str, [
                    sample_id, input_reads, masked, lost_dup, lost_adapt,
                    lost_rrna, lost_host, lost_mask, lost_dup_perc,
                    lost_adapt_perc, lost_rrna_perc, lost_host_perc,
                    lost_mask_perc, output_perc
                ])) + '\n')
Example #27
def extractControllLncRNAFastaAlignments(infiles, outfile):
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Example #28
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
Example #29
def write_config_files(pipeline_path, general_path):
    '''create default configuration files, copying templates from
    `pipeline_path` or `general_path`.
    '''

    paths = [pipeline_path, general_path]
    config_files = ['pipeline.yml']

    for dest in config_files:
        if os.path.exists(dest):
            E.warn("file `%s` already exists - skipped" % dest)
            continue

        for path in paths:
            src = os.path.join(path, dest)
            if os.path.exists(src):
                shutil.copyfile(src, dest)
                E.info("created new configuration file `%s` " % dest)
                break
        else:
            raise ValueError("default config file `%s` not found in %s" %
                             (dest, paths))
Example #30
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = iotools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = iotools.openFile(infile)
    header = inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header[:-1] + "\tfunccat\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
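
The NOG-to-category lookup with its "Function unknown" fallback reduces to a dictionary get; the mapping below is invented.

# invented NOG-to-functional-category mapping
annotation = {"NOG00001": "Translation", "NOG00002": "Energy production"}

for nog in ("NOG00001", "NOG99999"):
    funccat = annotation.get(nog, "Function unknown")
    print("%s\t%s" % (nog, funccat))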