Example #1
def summarizeFastQC(infiles, outfiles):
    all_files = []
    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        all_files.extend(glob.glob(
            os.path.join(track + "*_fastqc",
                         "fastqc_data.txt")))

    dfs = PipelineReadqc.read_fastqc(
        all_files)

    for key, df in dfs.items():
        fn = re.sub("basic_statistics", key, outfiles[0])
        E.info("writing to {}".format(fn))
        with IOTools.open_file(fn, "w") as outf:
            df.to_csv(outf, sep="\t", index=True)
Example #2
    def removeObservationsPerc(self, percentile_rowsums=10):
        '''remove observations (e.g. genes)

        * remove the lowest percentile of rows in the table, sorted
           by total tags per row
        '''

        # percentile filtering
        percentile = float(percentile_rowsums) / 100.0
        sum_counts = self.table.sum(1)
        take = sum_counts >= sum_counts.quantile(percentile)
        E.info("percentile filtering at level %f: keep=%i, discard=%i" %
               (percentile_rowsums,
                sum(take),
                len(take) - sum(take)))
        self.table = self.table[take]
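
A self-contained way to exercise the same percentile filter is sketched below using only pandas; the toy table, sample names and threshold are invented for illustration.

import pandas as pd

# toy count table standing in for self.table; values are illustrative only
toy_counts = pd.DataFrame(
    {"sample_a": [0, 5, 100, 7], "sample_b": [1, 3, 80, 9]},
    index=["gene1", "gene2", "gene3", "gene4"])

percentile_rowsums = 25                      # drop the lowest 25% of rows
percentile = percentile_rowsums / 100.0
row_sums = toy_counts.sum(axis=1)            # total tags per row
keep = row_sums >= row_sums.quantile(percentile)
filtered = toy_counts[keep]
print("keep=%i, discard=%i" % (keep.sum(), (~keep).sum()))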
Example #3
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(inf):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
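
For readers without the CGAT FastaIterator module, the same exclude-by-identifier filtering is sketched below with a minimal FASTA parser; the file names and the identifier set are placeholders, not part of the original pipeline.

def iterate_fasta(handle):
    """yield (title, sequence) tuples from a FASTA file handle."""
    title, chunks = None, []
    for line in handle:
        line = line.rstrip()
        if line.startswith(">"):
            if title is not None:
                yield title, "".join(chunks)
            title, chunks = line[1:], []
        else:
            chunks.append(line)
    if title is not None:
        yield title, "".join(chunks)

ids_to_drop = {"seq2"}                      # placeholder identifier set
with open("input.fasta") as inf, open("mispriming_lib.fasta", "w") as outf:
    for title, sequence in iterate_fasta(inf):
        if title not in ids_to_drop:
            outf.write(">%s\n%s\n" % (title, sequence))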
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
        infile = options.stdin
    else:
        infile = fileinput.FileInput(args)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
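
Stripped of the option-parsing scaffolding, the header-deduplication loop above reduces to the sketch below; the input lines are invented, and `header` starts as None rather than False, which is equivalent here.

lines = [
    "# comment kept as-is\n",
    "gene\tcount\n",           # first header line is kept
    "g1\t10\n",
    "gene\tcount\n",           # repeated header from a concatenated file: dropped
    "g2\t20\n",
]

header = None
for line in lines:
    if line.startswith("#"):
        pass                    # comment lines are always emitted
    elif header is None:
        header = line           # remember the first header line
    elif line == header:
        continue                # skip repeated copies of the header
    print(line, end="")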
Example #5
    def run(self, *args, **PARAMS):

        # Custom command to run reference matching tool.
        statement, run_options = self.buildStatement(**PARAMS)

        # Logging
        runfiles = '\t'.join([os.path.basename(x) for x in (self.fastn1,
                                                            self.fastn2,
                                                            self.fastn3) if x])
        E.info("Running sortMeRNA for files: {}".format(runfiles))

        P.run(statement, job_options=run_options)

        # Post process results into generic output for downstream tasks.
        statement = self.postProcess(**PARAMS)
        if statement:
            P.run(statement, job_options=run_options)
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        infile = fileinput.FileInput(unknown)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Example #7
def estimateExpression(infiles, outfile):
    '''estimate expression levels.'''

    R.library("affy")

    E.info("reading data")

    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")

    eset = R.rma(raw_data)

    R.boxplot(raw_data)
    R.boxplot(eset)

    print(R.as_list(R.assayData(eset)))
Example #8
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed contig sizes."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    # windows are forced to be non-overlapping: the shift equals the window size
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while 1:
        ninput += 1
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
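
The windowing logic can be tried on a plain string without the FastaIterator dependency; the sequence, contig name and window size below are made up.

sequence = "ACGTGGCCNNNNACGTACGTGGGCCCATAT"   # invented sequence with a gap run
window_size = window_shift = 10
gap_cutoff = window_size // 2                 # at most 50% of a window may be gap

windows = []
for start in range(0, len(sequence), window_shift):
    s = sequence[start:start + window_size].upper()
    gc = sum(1 for c in s if c in "GC")
    at = sum(1 for c in s if c in "AT")
    if window_size - (gc + at) > gap_cutoff:  # mostly gaps or Ns: skip
        continue
    windows.append(("toy_contig", start, start + window_size, gc / (gc + at)))

print(windows)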
Example #9
def make_report():
    ''' Generates html and pdf versions of reStructuredText files
        using sphinx-quickstart pre-configured files (conf.py and Makefile).
        Pre-configured files need to be in a pre-existing report directory.
        Existing reports are not overwritten.
    '''
    report_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'pipeline_report'))
    print('Copying report templates from: {}'.format(report_path))

    if (os.path.exists(report_dir) and os.path.isdir(report_dir)
            and not os.listdir(report_dir)):
        statement = '''cp %(report_path)s/* pipeline_report ;
                       cd {} ;
                       ln -s ../pipeline.yml . ;
                       make html ;
                       ln -sf _build/html/report_pipeline_pq_example.html . ;
                       make latexpdf ;
                       ln -sf _build/latex/pq_example.pdf .
                    '''.format(report_dir)
        E.info('''Building pdf and html versions of your rst files in
                  {}.'''.format(report_dir))
        P.run(statement)

    elif (os.path.exists(report_dir) and os.path.isdir(report_dir)
          and os.listdir(report_dir)):
        sys.exit(''' {} exists, not overwriting. You can manually run:
                       cd {} ;
                       ln -s ../pipeline.yml . ;
                       make html ;
                       ln -sf _build/html/report_XXXX.html . ;
                       make latexpdf ;
                       ln -sf _build/latex/XXXX.pdf .
                       Or delete the folder and re-run make_report
                 '''.format(report_dir))

    else:
        sys.exit(''' The directory "pipeline_report" does not exist.
                     Are the paths correct?
                     An attempt was made to copy template files from:
                     {}
                     You can also manually copy files and run "make html" or
                     "make latexpdf".
                 '''.format(report_path))

    return
Example #10
def renameChromosomes(iterator, chr_map):

    ninput, noutput, nskipped = 0, 0, 0

    for bed in iterator:
        ninput += 1

        if bed.contig in chr_map:
            bed.contig = chr_map[bed.contig]
        else:
            nskipped += 1
            continue

        noutput += 1
        yield bed

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
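
The renaming pattern does not depend on Bed records; the sketch below uses plain dictionaries, and the contig map and intervals are invented.

chr_map = {"1": "chr1", "MT": "chrM"}          # invented Ensembl-to-UCSC style map

def rename_contigs(intervals, chr_map):
    """yield intervals whose contig is in the map, renamed; drop the rest."""
    for interval in intervals:
        if interval["contig"] not in chr_map:
            continue
        interval["contig"] = chr_map[interval["contig"]]
        yield interval

intervals = [{"contig": "1", "start": 0, "end": 100},
             {"contig": "KI270728.1", "start": 0, "end": 50}]
print(list(rename_contigs(intervals, chr_map)))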
Example #11
def runCPC(infile, outfile):
    '''
    run coding potential calculations on lncRNA geneset
    '''
    # farm.py is called from within cpc.sh
    assert iotools.which("farm.py"), \
        "farm.py needs to be in $PATH for cpc to run"
    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version:%s" % iotools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    P.run()
Example #12
def make_mapped_matrix(map_dict, input_frame):
    '''
    return a matrix with integer labels from mapping
    '''

    frame_index = input_frame.index.tolist()
    nindex = len(frame_index)
    ncols = len(input_frame.columns)
    integer_matrix = np.ndarray((nindex, ncols), dtype=np.int32)

    E.info("mapping cluster labels")
    for idx in range(nindex):
        for col in range(ncols):
            mod = input_frame.iloc[idx][col + 1]
            integer_matrix[idx][col] = map_dict[mod]

    return integer_matrix
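
A vectorised sketch of the same label-to-integer mapping is shown below on an invented two-column frame; it assumes the clustering columns are labelled 1 and 2, as the `col + 1` lookup above implies.

import numpy as np
import pandas as pd

# invented cluster-label table: rows are genes, columns are clusterings
input_frame = pd.DataFrame({1: ["red", "blue"], 2: ["blue", "blue"]},
                           index=["gene1", "gene2"])
map_dict = {"red": 0, "blue": 1}

# element-wise replacement instead of the nested index loop
integer_matrix = input_frame.replace(map_dict).to_numpy(dtype=np.int32)
print(integer_matrix)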
Example #13
def read_and_randomize_rows(infile, args):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if args.has_headers:
        keep_header = 1
    else:
        keep_header = 0
    for x in range(keep_header):
        c.header += 1
        args.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    args.stdout.write("".join(lines))
    c.lines_output = len(lines)
    E.info(c)
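
Without the option machinery, shuffle-but-keep-header comes down to a few lines; the table lines below are invented.

import random

lines = ["gene\tcount\n", "g1\t1\n", "g2\t2\n", "g3\t3\n"]

keep_header = 1                      # number of leading lines to leave in place
header, body = lines[:keep_header], lines[keep_header:]
random.shuffle(body)                 # shuffle the remaining rows in place
print("".join(header + body), end="")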
Example #14
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))

    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)' \
           % (dropped_reads, reads, percent_dropped))
Example #15
def renameChromosomes(gffs, chr_map):

    ninput, noutput, nskipped = 0, 0, 0

    for gff in gffs:

        ninput += 1

        if gff.contig in chr_map:
            gff.contig = chr_map[gff.contig]
        else:
            nskipped += 1
            continue

        noutput += 1
        yield gff

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
Example #16
def shiftIntervals(iterator, contigs, offset):
    """shift intervals by a certain offset and ensure size is maintaned even id contig end reached.

    contigs is a dictionary of contig sizes."""

    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range = 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: if we skip intervals off the end of the contig we should skip
        # ones off the start as well
        if bed.start < 0 or bed.end < 0:
            nskipped_range += 1
            continue
        # IMS: changing >= to > as bed is half-open
        if bed.end > contigs[bed.contig]:
            nskipped_range += 1
            continue
        noutput += 1

        # add offset to each start and end, and adjust for contig length
        length = bed.end - bed.start
        newstart = bed.start + offset
        newend = bed.end + offset
        if newstart < 0:
            newstart = 0
            newend = length
        if newend > contigs[bed.contig]:
            newstart = contigs[bed.contig] - length
            newend = contigs[bed.contig]

        bed.start = newstart
        bed.end = newend

        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, nskipped_range=%i" %
           (ninput, noutput, nskipped_contig, nskipped_range))
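
Clamping a shifted interval back into its contig while preserving its length can be isolated as below; the contig sizes and intervals are invented.

contigs = {"chr1": 1000}                     # invented contig sizes

def shift_interval(start, end, offset, contig_size):
    """shift [start, end) by offset, keeping the length and staying in bounds."""
    length = end - start
    new_start, new_end = start + offset, end + offset
    if new_start < 0:
        new_start, new_end = 0, length
    if new_end > contig_size:
        new_start, new_end = contig_size - length, contig_size
    return new_start, new_end

print(shift_interval(10, 60, -50, contigs["chr1"]))    # clamped to (0, 50)
print(shift_interval(980, 990, 50, contigs["chr1"]))   # clamped to (990, 1000)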
Example #17
def mergeBAMFiles(infiles, outfile):
    '''merge BAM files from the same experiment using a user-defined regex.

    For the mapping stages it is beneficial to perform mapping
    separately for each sequence read infile per sample so that
    consistency can be checked. However, downstream tasks require
    the merged :term:`bam` alignment files.

    Parameters
    ----------
    infiles : list
       list of :term:`bam` format alignment files
    outfile : str
       Output filename in :term:`bam` format
    '''

    if "merge_pattern_output" not in PARAMS or \
       not PARAMS["merge_pattern_output"]:
        raise ValueError("no output pattern 'merge_pattern_output' specified")

    if len(infiles) == 1:
        if not os.path.isfile(outfile):
            E.info("%(outfile)s: only one file for merging - creating "
                   "softlink" % locals())
            os.symlink(os.path.basename(infiles[0]), outfile)
            os.symlink(os.path.basename(infiles[0]) + ".bai", outfile + ".bai")
            return
        else:
            E.info("%(outfile)s: only one file for merging - softlink "
                   "already exists" % locals())
            return

    infiles = " ".join(infiles)
    tmp_bam = P.get_temp_filename(".")

    statement = '''
    samtools merge %(tmp_bam)s %(infiles)s >& %(outfile)s_merge.log &&
    samtools sort %(tmp_bam)s -o %(outfile)s &&
    samtools index %(outfile)s
    '''

    job_memory = '20G'

    P.run(statement)
Example #18
def runIDROnPooledPseudoreplicates(infiles, outfile):
    """
    Run IDR analysis on pooled pseudoreplicates for each EXPERIMENT
    """
    # set IDR parameters
    chr_table = os.path.join(PARAMS["annotations_dir"],
                             PARAMS["annotations_interface_contigs"])

    # get statement
    statement = IDR.getIDRStatement(infiles[0],
                                    infiles[1],
                                    outfile,
                                    PARAMS["idr_options_overlap_ratio"],
                                    PARAMS["idr_options_ranking_measure"],
                                    chr_table)

    # run
    E.info("applyIDR: processing %s and %s" % (infiles[0], infiles[1]))
    job_memory = "5G"
    P.run()
Example #19
    def count(self, filename1, filename2):
        """count overlap between two bed files."""

        E.info("counting started for %s versus %s" % (filename1, filename2))

        idx2 = self.buildIndex(filename2)

        (self.mExons1, self.mExonsOverlapping1, self.mBases1,
         self.mBasesOverlapping1) = self._count(filename1, idx2)

        self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
        self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

        idx1 = self.buildIndex(filename1)

        (self.mExons2, self.mExonsOverlapping2, self.mBases2,
         self.mBasesOverlapping2) = self._count(filename2, idx1)

        self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
        self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
Example #20
    def count(self, filename, track):
        """count overlap between two gtf files."""

        E.info("counting started for %s versus %s" % (filename, track))

        (self.mExons1, self.mExonsOverlapping1, self.mBases1,
         self.mBasesOverlapping1) = self._count(filename, self.mIndices[track])

        self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
        self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

        idx = self.buildIndex(filename)

        # count index against index
        (self.mExons2, self.mExonsOverlapping2, self.mBases2,
         self.mBasesOverlapping2) = self._countIndices(self.mIndices[track],
                                                       idx)

        self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
        self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
Example #21
def getTables(dbname):
    '''
    Retrieve the names of all tables in the database and group them
    in a dictionary keyed by annotation type.
    '''
    dbh = sqlite3.connect(dbname)
    c = dbh.cursor()
    statement = "SELECT name FROM sqlite_master WHERE type='table'"
    c.execute(statement)
    tables = c.fetchall()
    c.close()
    dbh.close()
    D = {}
    for t in tables:
        tname = t[0].replace("ensemblg2", "").split("$")
        E.info(tname)
        ttype = tname[0]
        D.setdefault(ttype, [])
        D[ttype].append(tname[1])
    return D
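
The sqlite_master query can be tried against an in-memory database; the table names below are invented but follow the `<annotation>$<suffix>` convention the function assumes.

import sqlite3

dbh = sqlite3.connect(":memory:")
c = dbh.cursor()
for name in ("go$summary", "go$details", "kegg$summary"):
    c.execute('CREATE TABLE "%s" (id INTEGER)' % name)

c.execute("SELECT name FROM sqlite_master WHERE type='table'")
grouped = {}
for (name,) in c.fetchall():
    ttype, suffix = name.split("$")
    grouped.setdefault(ttype, []).append(suffix)
dbh.close()
print(grouped)    # {'go': ['summary', 'details'], 'kegg': ['summary']}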
Example #22
    def __init__(self, filename, *args, **kwargs):

        assert filename is not None,\
            "please supply filename for CounterOverlap"

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename

        E.info("reading intervals from %s" % self.filename)

        self.index = Bed.readAndIndex(iotools.open_file(self.filename, "r"),
                                      per_track=True)

        E.info("read intervals for %s tracks" % len(self.index))

        self.tracks = list(self.index.keys())
        self.headers = []
        for track in self.tracks:
            self.headers.extend(["%s_nover" % track, "%s_bases" % track])
Example #23
    def buildGenomeAlignment(infiles, outfile):
        '''build pairwise genomic alignment from axt files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        for infile in infiles:
            E.info("adding %s" % infile)
            statement = '''gunzip < %(infile)s
            | axtToPsl
            /dev/stdin
            %(query)s.sizes
            %(target)s.sizes
            /dev/stdout
            | pslSwap /dev/stdin /dev/stdout
            | gzip >> %(outfile)s
            '''
            P.run()
Example #24
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--keep-header",
                      dest="keep_header",
                      type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
Example #25
def loadBAMStats(infiles, outfile):
    '''Import bam statistics into SQLite'''

    scriptsdir = PARAMS["general_scriptsdir"]
    header = ",".join(
        [P.snip(os.path.basename(x), ".readstats") for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | perl -p -e "s/unique/unique_alignments/"
                | cgat table2table --transpose
                | cgat csv2db
                      --allow-empty-file
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s"""
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --header-names=%(header)s
                      --skip-titles
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/%(suffix)s/"
                | cgat csv2db
                      --table=%(tname)s 
                      --allow-empty-file
                >> %(outfile)s """
        P.run()
Example #26
def summarizeReadCounts(infiles, outfile):
    '''Calculate the number of reads lost at each step for each sample'''

    with IOTools.open_file(outfile, 'w') as outf:
        outf.write("sample_id\tinput_reads\toutput_reads\tduplicates\t"
                   "adapter_contamination\trRNA\thost\tlow_complexity\t"
                   "duplicates_percent\tadapters_percent\trrna_percent\t"
                   "host_percent\tlow_complexity_perc\tremaining_percent\n")
        for infile in infiles:
            sample_id = P.snip(os.path.basename(infile),
                               '_read_count_summary.tsv')
            E.info('Processing sample %s' % sample_id)

            df = pd.read_table(infile, index_col=0, header=None)
            deadapt = df.loc['deadapt', 1]
            deduped = df.loc['deduped', 1]
            rrna = df.loc['rRNAremoved', 1]
            dehost = df.loc['dehost', 1]
            masked = df.loc['masked', 1]
            input_reads = df.loc['input', 1]

            lost_dup = input_reads - deduped
            lost_adapt = deduped - deadapt
            lost_rrna = deadapt - rrna
            lost_host = rrna - dehost
            lost_mask = dehost - masked

            lost_dup_perc = round(lost_dup / float(input_reads) * 100, 2)
            lost_adapt_perc = round(lost_adapt / float(input_reads) * 100, 2)
            lost_rrna_perc = round(lost_rrna / float(input_reads) * 100, 2)
            lost_host_perc = round(lost_host / float(input_reads) * 100, 2)
            lost_mask_perc = round(lost_mask / float(input_reads) * 100, 2)
            output_perc = round(masked / float(input_reads) * 100, 2)

            outf.write('\t'.join(
                map(str, [
                    sample_id, input_reads, masked, lost_dup, lost_adapt,
                    lost_rrna, lost_host, lost_mask, lost_dup_perc,
                    lost_adapt_perc, lost_rrna_perc, lost_host_perc,
                    lost_mask_perc, output_perc
                ])) + '\n')
Example #27
def extractControllLncRNAFastaAlignments(infiles, outfile):
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Example #28
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
Example #29
def write_config_files(pipeline_path, general_path):
    '''create default configuration files, copying templates from
    `pipeline_path` or `general_path`.
    '''

    paths = [pipeline_path, general_path]
    config_files = ['pipeline.yml']

    for dest in config_files:
        if os.path.exists(dest):
            E.warn("file `%s` already exists - skipped" % dest)
            continue

        for path in paths:
            src = os.path.join(path, dest)
            if os.path.exists(src):
                shutil.copyfile(src, dest)
                E.info("created new configuration file `%s` " % dest)
                break
        else:
            raise ValueError("default config file `%s` not found in %s" %
                             (dest, paths))
Example #30
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = iotools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = iotools.openFile(infile)
    header = inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header[:-1] + "\tfunccat\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
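
The NOG-to-category lookup with its "Function unknown" fallback reduces to a dictionary get; the mapping below is invented.

# invented NOG-to-functional-category mapping
annotation = {"NOG00001": "Translation", "NOG00002": "Energy production"}

for nog in ("NOG00001", "NOG99999"):
    funccat = annotation.get(nog, "Function unknown")
    print("%s\t%s" % (nog, funccat))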