def writeResults(outfile, results):
    """Write per-host timing summaries as a tab-separated table.

    :param outfile: open file-like object to write to.
    :param results: mapping of host name -> list of per-run records; each
        record is indexable by the timing field names below
        (e.g. ``record["wall"]``).

    The header line combines each timing field with the column headers of
    ``Stats.Summary``; then one row is written per host (in sorted order)
    with one ``Stats.Summary`` per field.
    """
    fields = ("wall", "user", "sys", "cuser", "csys", "nchunks")
    outfile.write("host\t%s\n" % "\t".join(
        ["%s_%s" % (x, y) for x, y in
         itertools.product(fields, Stats.Summary().getHeaders())]))

    # sorted() over the mapping replaces the old ``keys(); keys.sort()``
    # pattern, which fails on Python 3 where dict views have no .sort().
    for host in sorted(results):
        result = results[host]
        outfile.write("%s" % host)
        for f in fields:
            # plain indexing instead of calling __getitem__ explicitly
            d = [y[f] for y in result]
            outfile.write("\t%s" % Stats.Summary(d))
        outfile.write("\n")
def __str__(self):
    """Render summary statistics for the collected transcripts.

    Returns a tab-separated line with: number of gene ids, number of
    transcript ids, the count of single-exon transcripts, followed by
    ``Stats.Summary`` columns for exons per transcript, exon sizes,
    intron sizes and transcript lengths.
    """
    n_single_exon = 0
    n_exons_per_transcript = []
    intron_lengths = []
    transcript_spans = []
    exon_lengths = []

    for exon_list in self.counts_exons_per_transcript.values():
        # sort in place, then merge overlapping exon intervals
        exon_list.sort()
        merged = Intervals.combine(exon_list)

        transcript_spans.append(merged[-1][1] - merged[0][0])
        n_exons_per_transcript.append(len(merged))
        exon_lengths.extend(end - start for start, end in merged)

        if len(merged) == 1:
            n_single_exon += 1
            continue

        # gaps between consecutive merged exons are introns
        previous_end = merged[0][1]
        for start, end in merged[1:]:
            intron_lengths.append(start - previous_end)
            previous_end = end

    columns = (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        n_single_exon,
        Stats.Summary(n_exons_per_transcript),
        Stats.Summary(exon_lengths),
        Stats.Summary(intron_lengths),
        Stats.Summary(transcript_spans),
    )
    return "\t".join(str(c) for c in columns)
def process(self, contig, start, end, reads, qualities):
    """Write one GTF "exon" entry summarising read coverage over an interval.

    :param contig: contig/chromosome name for the entry.
    :param start: interval start coordinate.
    :param end: interval end coordinate.
    :param reads: sequence of per-position read counts; their mean becomes
        the GTF score.
    :param qualities: per-position qualities (not used by this method).
    """
    entry = GTF.Entry()
    entry.start, entry.end = start, end
    # NOTE(review): `id` here is the Python builtin function, not a local
    # counter - `self.mIdFormat % id` therefore formats the builtin object
    # itself (or raises if the format expects an integer). This looks like
    # a bug (a missing id counter variable); confirm intended behaviour.
    entry.gene_id = self.mIdFormat % id
    entry.transcript_id = entry.gene_id
    entry.contig = contig
    entry.feature = "exon"
    entry.source = "maq"
    read_stats = Stats.Summary(reads)
    # score is the mean read depth, formatted with two decimals
    entry.score = "%5.2f" % read_stats['mean']
    self.mOutFile.write(str(entry) + "\n")
def main(argv=None):
    """script main.

    Reads a fastq file from stdin, optionally converts quality scores to a
    target encoding, and writes per-read quality statistics (number of
    low-quality bases, number of Ns, and ``Stats.Summary`` columns) to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    # converting iterator if a target format is requested, otherwise
    # only guess the encoding of the input
    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        # phred-scaled qualities for this read
        quals = record.toPhred()
        # bases below the quality threshold
        nfailed = len([x for x in quals if x < min_quality])
        # ambiguous bases: both 'N' and '.' spellings are counted
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    Reads one (or two) vectors of values and applies an R statistical test
    (t-test or Wilcoxon); results, summary statistics and - for the t-test -
    power estimates are written to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1", "--infile", dest="filename_input", type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header", dest="header", type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # read the primary vector either from file or stdin
    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    # remaining positional arguments are forwarded to the R test:
    # "key=value" entries become keyword arguments, the rest positional
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    # optional second sample for a two-sample test
    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)

        # compute power of test
        # NOTE(review): indentation reconstructed - the power computation is
        # taken to run for the t-test regardless of one or two samples;
        # confirm against the original layout.
        power = R.power_t_test(n=len(values),
                               delta=abs(stat["mean"]),
                               sd=stat["stddev"],
                               sig_level=0.05)['power']
        diff_at_power95 = R.power_t_test(n=len(values),
                                         power=0.95,
                                         sd=stat["stddev"],
                                         sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))
    for key, value in sorted(result.items()):
        # the R result's "data.name" entry is noise for tabular output
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    # append the summary statistics of the primary sample
    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    # NOTE(review): truthiness test - a power of exactly 0.0 would also be
    # skipped here; presumably intended as "is not None".
    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
def main(argv=None):
    """script main.

    Compares two distributions of values with an R statistical test
    (Kolmogorov-Smirnov, Mann-Whitney U, paired variants, or Shapiro-Wilk
    normality test), optionally plotting boxplot, qq-plot and histograms.
    Test results and per-sample ``Stats.Summary`` statistics go to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms." "")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help=
        """test if a set of values is normally distributed.
Mean and variance are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # extra positional arguments are forwarded to the R test call:
    # "key=value" become keyword arguments, the rest positional
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # optional category -> value mapping; when present the input is read
    # as strings and translated through the map
    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    # read distribution 1 from file or stdin
    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # normality test: distribution 2 is sampled in R from a normal
        # distribution with the empirical mean/stddev of distribution 1
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))
        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2,
                                                 "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    # paired tests require samples of equal size
    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    # dispatch to the chosen R test
    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2, paired=True, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            # R's shapiro.test caps the sample size; downsample first
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # 2x2 panel layout: boxplot, qq-plot, two histograms
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            # fixed number of bins, R chooses the break positions
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and (options.min_value or options.max_value):
            # fixed number of bins within an explicit value range
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            # fixed bin width
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2,
freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        # "data.name" is R bookkeeping, not a result value
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics for each sample, suffixed 1 and 2
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def main(argv=None):
    """script main.

    Reads intervals (gff/gtf/bed) from stdin and computes interval sizes,
    distances between consecutive intervals on the same contig, and overlap
    sizes; results are reported as histograms, summary statistics and/or raw
    value lists depending on --method.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="maximum value for histogram.")
    parser.add_option(
        "--no-empty-bins", dest="no_empty_bins", action="store_true",
        help="do not display empty bins.")
    parser.add_option(
        "--with-empty-bins", dest="no_empty_bins", action="store_false",
        help="display empty bins.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")
    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")
    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")
    parser.add_option("--output-section", dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    # "all" expands to every counting method except "values"
    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.openOutputFile("overlaps")
    else:
        outfile_overlaps = None

    # single pass over the (assumed position-sorted) intervals, tracking
    # the previous interval per contig in `last`
    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                # overlapping with the previous interval
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last),
                                                         str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                # keep whichever interval extends further as `last`
                if this.end > last.end:
                    last = this
                continue
            else:
                # gap between consecutive intervals
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.openOutputFile("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram,
                            nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile("stats")
        # NOTE(review): getHeader() here vs getHeaders() elsewhere in this
        # file - presumably both exist on Stats.Summary; confirm.
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        # dump the raw value lists, one file per category
        outfile = E.openOutputFile("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.openOutputFile("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.openOutputFile("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput, len(values_between), len(values_within), noverlaps))

    E.Stop()
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    :param infile: csv file with the expression analysis.
    :param outfiles: pair of output filename stems; ``suffix`` is appended.
    :param map_exp2columns: mapping of experiment name -> list of column
        indices holding that experiment's expression values.
    :param suffix: suffix appended to each output filename.

    ..note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column, but instead it might be the
       transcript_cluster_id. A possible explanation is that if several
       probesets map to the same transcript cluster, the transcript
       cluster is normalized. The set of cluster_id and probeset ids are
       completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''
    E.info("importing expression data from %s" % infile)

    # load the probeset -> cluster and cluster -> transcript maps
    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    # keep a handle so the input file can be closed (was leaked before)
    csvfile = open(infile, "rU")
    reader = csv.reader(csvfile)

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    # (input column header, output column name) pairs to carry over
    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ('mRNA Accession', 'mrna_id'),
               ('mRNA Source', 'source'),
               ('mRNA - xhyb', 'xhyb'),
               ('GO Biological Process ID', 'go_biol_id'),
               ('GO Biological Process Term', 'go_biol_term'),
               ('GO Cellular Component ID', 'go_cell_id'),
               ('GO Cellular Component Term', 'go_cell_term'),
               ('GO Molecular Function ID', 'go_mol_id'),
               ('GO Molecular Function Term', 'go_mol_term'),
               ('Pathway Source', 'pw_source'),
               ('Pathway Name', 'pw_name'))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    # fixed typo: was `index_soure`
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)
    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            # locate the positions (within `take`) of the special columns
            for x, old_header in enumerate(row):
                if old_header == "mRNA Source":
                    index_source = len(take)
                if old_header == "mRNA Accession":
                    index_accession = len(take)
                if old_header == "Probe Set ID":
                    index_probeset = len(take)
                if old_header in old_headers:
                    take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp, "\t".join(
                        ("cluster_id", Stats.Summary().getHeader(),
                         "\t".join(["R%i" % i
                                    for i in range(len(columns))]))) + "\n")
        else:
            # normalize selected columns; "---" denotes a missing value
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                # bug fix: was `counter.mapped_to_cluster` - `counter` is
                # undefined and raised NameError; the counter is `counts`
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                # unknown cluster: copy the raw row through and skip
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            # per-experiment expression values with summary statistics
            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp, "\t".join(
                        (probeset,
                         str(Stats.Summary([float(x) for x in data])),
                         "\t".join(data))) + "\n")

    csvfile.close()
    outf.close()
    # bug fix: `outs` was never closed
    outs.close()

    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")
    P.info("probeset source information: %s" % str(counts))

    output_files.close()