Example #1
    def test_load_experiment(self):
        mm10tsv = resource_filename(__name__, "experiments-mm10.tsv")
        hg38tsv = resource_filename(__name__, "experiments-hg38.tsv")
        mm10 = models.load_experiments([mm10tsv])
        # count the records in the source TSV, not the loaded table
        self.assertEqual(len(mm10), count_valid_records(mm10tsv))
        hg38 = models.load_experiments([hg38tsv])
        both = models.load_experiments([mm10tsv, hg38tsv])
        self.assertEqual(len(mm10) + len(hg38), len(both))
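count_valid_records is defined outside this snippet; a minimal sketch of such a helper, assuming it counts non-blank, non-comment data rows in the TSV (name and behavior inferred from the assertion above):

def count_valid_records(filename):
    """Count data rows in a TSV, skipping the header, blank lines, and
    comments. Hypothetical sketch; the real helper may differ."""
    count = 0
    with open(filename, 'rt') as instream:
        next(instream)  # skip the header row
        for line in instream:
            line = line.strip()
            if line and not line.startswith('#'):
                count += 1
    return count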
Example #2
    def test_load_experiment(self):
        mm10tsv = resource_filename(__name__, 'experiments-mm10.tsv')
        hg38tsv = resource_filename(__name__, 'experiments-hg38.tsv')
        mm10 = models.load_experiments([mm10tsv])
        self.assertIn('replicates', mm10.columns)
        self.assertEqual(len(mm10), count_valid_records(mm10tsv))
        hg38 = models.load_experiments([hg38tsv])
        both = models.load_experiments([mm10tsv, hg38tsv])
        self.assertEqual(len(mm10) + len(hg38), len(both))

        self.assertEqual(mm10.loc['expm']['replicates'],
                         ['12307', '12308'])
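The assertions above imply the rough shape of experiments-mm10.tsv: an experiment name column used as the index and a replicates column of comma-separated library ids that load_experiments splits into lists. A plausible (not verbatim) excerpt:

experiment	replicates
expm	12307,12308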
Example #3
    def test_load_experiments_analysis_root(self):
        with TemporaryDirectory() as analysis_dir:
            with chdir(analysis_dir):
                mm10tsv = resource_filename(__name__, 'experiments-mm10.tsv')
                tmpname = os.path.join(analysis_dir, 'experiments-mm10.tsv')
                shutil.copy(mm10tsv, tmpname)
                analysis_root = os.path.dirname(mm10tsv)
                mm10 = models.load_experiments([mm10tsv])
                mm10tmp = models.load_experiments([tmpname],
                                                  analysis_root=analysis_root)
                for i in mm10['analysis_dir'].index:
                    self.assertEqual(mm10['analysis_dir'][i],
                                     mm10tmp['analysis_dir'][i])
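The chdir context manager used here predates contextlib.chdir (added in Python 3.11), so it is presumably a local helper; a minimal sketch:

import os
from contextlib import contextmanager

@contextmanager
def chdir(path):
    """Temporarily change the working directory, restoring it on exit."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)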
Example #4
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    sep = get_seperator(args.sep)
    if args.experiments:
        experiments = models.load_experiments([args.experiments], sep=sep)
    else:
        if args.experiment_name is None:
            parser.error(
                "Please provide an experiment name. (Used as filename)")
        if len(args.replicates) == 0:
            parser.error(
                "Please provide list of replicates or experiment table")
        experiments = {args.experiment_name: args.replicates}

    if args.library is None:
        parser.error("Please provide library information tables")

    for experiment_name in experiments:
        replicates = experiments[experiment_name]
        logging.info('Processing: %s %s', experiment_name, ','.join(replicates))
        create_quantification_cache(
            args.library,
            experiment_name,
            replicates,
            args.quantification,
            sep)
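get_seperator (spelled that way throughout this codebase) is not shown on this page; judging from call sites that pass values such as 'TAB' and ',', a sketch could be:

def get_seperator(sep):
    """Map a command-line separator flag to the real character.
    Sketch inferred from the {'TAB': '.tsv', ',': '.csv'} lookups below."""
    if sep == 'TAB':
        return '\t'
    return sep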
Example #5
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-n',
                        '--experiment-name',
                        required=True,
                        help='Experiment name to select')
    add_metadata_arguments(parser)
    add_debug_arguments(parser)
    args = parser.parse_args(cmdline)

    configure_logging(args)

    header_printed = False
    libraries = load_library_tables(args.libraries)
    experiments = load_experiments(args.experiments)

    replicates = experiments.loc[args.experiment_name, 'replicates']

    for i, (library_id,
            library) in enumerate(libraries.loc[replicates].iterrows()):
        filename = find_library_bam_file(library)
        LOGGER.info('  Reading %s %d/%d', filename, i + 1, len(replicates))

        mode = get_mode(filename, 'r')
        with pysam.AlignmentFile(filename, mode) as alignment:
            if not header_printed:
                print(str(alignment.header))
                header_printed = True

            for read in alignment:
                print(read.to_string())
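get_mode is also defined elsewhere; it presumably selects the pysam open mode from the file extension. A hedged sketch:

def get_mode(filename, mode='r'):
    """Return 'rb' for BAM files and 'r' for SAM files (assumed behavior)."""
    if filename.endswith('.bam'):
        return mode + 'b'
    return mode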
Example #6
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARN)

    sep = get_seperator(args.sep)
    if args.experiments:
        experiments = models.load_experiments(args.experiments,
                                              sep=sep,
                                              analysis_root=args.root)
    else:
        if args.experiment_name is None:
            parser.error(
                "Please provide an experiment name. (Used as filename)")
        if len(args.replicates) == 0:
            parser.error(
                "Please provide list of replicates or experiment table")
        experiments = {args.experiment_name: args.replicates}

    if args.libraries is None:
        parser.error("Please provide library information tables")

    libraries = models.load_library_tables(args.libraries, sep=sep)

    for i, experiment in experiments.iterrows():
        logging.info('Processing: %s', experiment.name)
        create_quantification_cache(experiment, libraries, args.quantification,
                                    args.model, sep)
Example #7
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    if args.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    experiments = models.load_experiments(args.experiments)
    libraries = models.load_library_tables(args.libraries)
    coverage = models.load_all_coverage(libraries)

    if args.all_experiments:
        make_combined_median_normalized_summary(experiments, coverage,
                                                args.output_format, args.bare)
    elif args.experiment_median_summary:
        make_per_experiment_median_normalized_summary(experiments, coverage,
                                                      args.output_format,
                                                      args.bare)
    elif args.by_experiment:
        make_by_experiment_median_summary(experiments, coverage,
                                          args.output_format, args.bare)
    elif args.combined_median_summary:
        make_combined_experiment_median_summary(experiments, coverage,
                                                args.output_format, args.bare)
    else:
        make_experiment_by_library_coverage_plots(experiments, coverage,
                                                  args.output_format,
                                                  args.bare)
Example #8
    def test_create_quantification_cache_tempdir(self):
        with tempfile.TemporaryDirectory() as tempdir:
            temp_experiments = models.load_experiments([self.exp_tsv],
                                                       analysis_root=tempdir)
            quant = 'FPKM'
            score_filename = models.make_correlation_filename(
                temp_experiments.iloc[0])
            quant_filename = models.make_quantification_filename(
                temp_experiments.iloc[0], quant, 'gene')

            print(temp_experiments)
            print(tempdir, score_filename)
            self.assertTrue(score_filename.startswith(tempdir))
            self.assertTrue(quant_filename.startswith(tempdir))

            assert not os.path.exists(score_filename)
            assert not os.path.exists(quant_filename)
            cache = madqc.create_quantification_cache(temp_experiments.iloc[0],
                                                      self.libraries, quant,
                                                      'gene')

            self.assertIsInstance(cache['rafa_spearman'], pandas.DataFrame)
            self.assertTrue(os.path.exists(score_filename))
            os.remove(score_filename)
            self.assertTrue(os.path.exists(quant_filename))
            os.remove(quant_filename)
Example #9
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    experiments = load_experiments(args.experiments)
    #libraries = load_library_tables(args.libraries)
    return ScoreCorrelationPlot(experiments)
Example #10
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    configure_logging(args)

    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    output_sep = get_seperator(args.output_format)
    output_extension = {"TAB": ".tsv", ",": ".csv"}[args.output_format]

    if args.transcriptome:
        # isoforms
        load_quantifications = madqc.load_transcriptome_quantifications
        quantification_extension = "_isoform_" + args.quantification + output_extension
    else:
        # genes
        load_quantifications = madqc.load_genomic_quantifications
        quantification_extension = "_gene_" + args.quantification + output_extension

    for name in experiments:
        filename = name + quantification_extension
        replicates = experiments[name]
        logger.info("%s %s: %s", name, args.quantification, ",".join(replicates))
        quantifications = load_quantifications(replicates, libraries, args.quantification)
        quantifications.to_csv(filename, sep=output_sep)
Example #11
def load_asof_run17_experiments():
    experiment_files = list(split_files_text(ASOF_RUN17_experiment_files))
    experiments = models.load_experiments(experiment_files)

    # iterrows() yields copies, so assign back to the column instead of
    # mutating each row
    experiments['replicates'] = experiments['replicates'].apply(
        lambda replicates: [sanitize_library_name(x) for x in replicates])

    return experiments
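sanitize_library_name is not defined in this snippet; Example #21 below strips '_mm10' and '_clean' suffixes with an inline lambda, so a matching sketch would be:

def sanitize_library_name(library_id):
    """Strip alignment suffixes such as '_mm10' and '_clean' (assumed)."""
    for suffix in ('_mm10', '_clean'):
        library_id = library_id.replace(suffix, '')
    return library_id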
Example #12
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)

    plot = MeanGeneCoverage(experiments, libraries)
    plot.use_experiment(args.use_experiment)
    return plot
Example #13
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help="header row")
    parser.add_argument('filename', nargs=1, help='spreadsheet to look at')
    args = parser.parse_args(cmdline)

    header = int(args.header) if args.header is not None else None
    book = ODFReader(args.filename[0])
    data = book.parse(args.sheet, header=header)

    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    first_experiments = models.load_experiments(
        to_files(paper_433_experiment_files))
    all_experiments = models.load_experiments(
        to_files(ASOF_RUN17_experiment_files))

    first_libraries = set(parse_replicates(first_experiments['replicates']))
    all_libraries = set(parse_replicates(all_experiments['replicates']))

    #print(first_libraries)
    #print(all_libraries)
    results = []
    for i, library_id in enumerate(data[data.columns[0]]):
        if library_id in first_libraries:
            tranche = 1
        elif library_id in all_libraries:
            tranche = 2
        else:
            tranche = 'C'

        row = find_library_info(server, library_id)
        row['tranche'] = tranche
        results.append(row)

        # print a progress dot every tenth library
        if (i + 1) % 10 == 0:
            print('.', end='', flush=True)

    df = pandas.DataFrame(results)
    df.to_csv('tranche.csv', index=False)
Example #14
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)
    if args.use_experiment:
        try:
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            print('{} was not found in {}'.format(args.use_experiment, ', '.join(list(experiments.index))))
            return None
    plot = DistributionPlot(experiments, libraries)
    return plot
Example #15
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-o', '--output', help='output directory')
    parser.add_argument('--mode',
                        default=None,
                        choices=[
                            'customtrack',
                            'trackhub',
                            'merge_paper_wiggles',
                            'paper_median_coverage',
                            'check_bedgraphs',
                            'localize_tsvs',
                            'paper_as_single_experiment_tsv',
                            'paper_as_cluster_experiment_tsv',
                        ])
    args = parser.parse_args(cmdline)

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]

    experiments = models.load_experiments(experiment_files)
    libraries = models.load_library_tables(library_files)

    to_include = read_peng_20180710_cluster_memberships()
    #print('{} cells to include'.format(len(to_include)))

    if args.mode == 'customtrack':
        make_custom_tracks()
    elif args.mode == 'trackhub':
        make_trackhub()
    elif args.mode == 'merge_paper_wiggles':
        merge_paper_wiggles(to_include, libraries)
    elif args.mode == 'paper_median_coverage':
        make_paper_median_coverage(to_include, libraries, args.output)
    elif args.mode == 'check_bedgraphs':
        check_bedgraphs(to_include, libraries)
    elif args.mode == 'localize_tsvs':
        localize_tsvs(experiments, libraries, args.output)
    elif args.mode == 'paper_as_single_experiment_tsv':
        paper920_as_single_experiment_tsv(to_include, args.output)
    elif args.mode == 'paper_as_cluster_experiment_tsv':
        paper920_as_cluster_experiment_tsv(to_include, args.output)
    else:
        parser.error('Did you want to pick an operation mode?')
Example #16
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    configure_logging(args)

    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    output_sep = get_seperator(args.output_format)
    output_extension = {
        'TAB': '.tsv',
        ',': '.csv',
    }[args.output_format]

    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)
    else:
        annotation = None

    if args.transcriptome:
        # isoforms
        load_quantifications = madqc.load_transcriptome_quantifications
        lookup_ids = models.lookup_gene_name_by_transcript_id
        quantification_extension = '_isoform_' + args.quantification + output_extension
    else:
        # genes
        load_quantifications = madqc.load_genomic_quantifications
        lookup_ids = models.lookup_gene_name_by_gene_id
        quantification_extension = '_gene_' + args.quantification + output_extension

    for name in experiments:
        filename = name + quantification_extension
        replicates = experiments[name]
        logger.info("%s %s: %s",
                    name, args.quantification, ','.join(replicates))
        quantifications = load_quantifications(
            replicates, libraries, args.quantification)

        if annotation is not None:
            quantifications = lookup_ids(annotation, quantifications)

        quantifications.to_csv(filename, sep=output_sep)
Example #17
    def test_make_quantification_filename_other(self):
        """Does make_quantification_filename work with an alternate analysis_root
        """
        results = {
            'genome': 'expf_FPKM.h5',
            'transcriptome': 'expf_transcriptome_FPKM.h5',
        }
        for reference_type in results:
            mm10tsv = resource_filename(__name__, 'experiments-mm10.tsv')
            path = '/tmp'
            mm10 = models.load_experiments([mm10tsv], analysis_root=path)

            filename = models.make_quantification_filename(
                mm10.iloc[0],
                reference_type=reference_type,
            )
            expected = os.path.join(path, results[reference_type])
            self.assertEqual(filename, expected)
Example #18
    def test_make_correlation_filename_default(self):
        """Does make_correlation_filename work with default analysis_root
        """
        results = {
            'genome': 'expf_correlation.h5',
            'transcriptome': 'expf_transcriptome_correlation.h5',
        }
        for reference_type in results:
            mm10tsv = resource_filename(__name__, 'experiments-mm10.tsv')
            path, _ = os.path.split(mm10tsv)
            mm10 = models.load_experiments([mm10tsv])

            filename = models.make_correlation_filename(
                mm10.iloc[0],
                reference_type=reference_type,
            )
            expected = os.path.join(path, results[reference_type])
            self.assertEqual(filename, expected)
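Taken together, Examples #17 and #18 pin down the naming convention: files live in the experiment's analysis_dir (or the overriding analysis_root) and are named <experiment>[_transcriptome]_<suffix>.h5. A sketch of make_correlation_filename consistent with both tests:

import os

def make_correlation_filename(experiment, reference_type='genome'):
    """Build <analysis_dir>/<name>[_transcriptome]_correlation.h5 (sketch)."""
    infix = '_transcriptome' if reference_type == 'transcriptome' else ''
    return os.path.join(experiment['analysis_dir'],
                        experiment.name + infix + '_correlation.h5')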
Example #19
def load_filtered_transcripts():
    sep = '\t'

    cache_file = os.path.expanduser(
        '~sau/genomes/mm10-M4-male/mm10-M4-male.h5')
    #annotation = models.load_gtf_cache(cache_file)
    annotation = None

    loader = IsoformRsemLoader('FPKM', annotation)
    index_name = 'transcript_id'
    # loader = GeneRsemLoader(args.quantification, annotation)
    #index_name = 'gene_id'

    to_include = generate_to_include_asof_run17()[1:]

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]

    quantifications = []
    for e, l in zip(experiment_files, library_files):
        print('loading', e)
        experiments = models.load_experiments([e], sep=sep)
        libraries = models.load_library_tables([l], sep=sep)
        for i, experiment in experiments.iterrows():
            print(experiment)
            quantification = loader.load(experiment, libraries)
            quantification.columns = list(
                filter_columns(quantification.columns))
            quantifications.append(quantification)

    sheets = pandas.concat(quantifications, axis=1)

    print('all', sheets.shape)
    # sheets.to_csv('C1_mouse_combined_transcript_asof_run17_unfiltred.tsv', sep='\t')
    # was crashing because of _mm10 suffix
    filtered = sheets[to_include]
    print('filtered', filtered.shape)
    return filtered
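filter_columns is not shown; the "was crashing because of _mm10 suffix" note suggests it normalizes column (cell) names so they match the to_include list. One plausible sketch:

def filter_columns(columns):
    """Yield column names with the '_mm10' alignment suffix removed
    (assumption based on the comment above)."""
    for column in columns:
        yield column.replace('_mm10', '')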
Example #20
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-o', '--output-dir')
    args = parser.parse_args(cmdline)

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]

    experiments = load_experiments(experiment_files)
    libraries = load_library_tables(library_files)

    #link_rsem(libraries, args.output_dir)
    link_genome_bams(libraries, args.output_dir)
Example #21
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('--first-tranche', default=False, action='store_true',
                        help='Use just the first tranche as experiment list')
    parser.add_argument('--name', required=True, help='submission name')
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help="header row")
    parser.add_argument('filename', nargs=1, help='driver spreadsheet')
    args = parser.parse_args(cmdline)
    root_fastq_url = 'http://jumpgate.caltech.edu/runfolders/volvox02/'
    desplit = os.path.expanduser('~/proj/htsworkflow/htsworkflow/pipelines/desplit_fastq.py')

    header = int(args.header) if args.header is not None else None
    data = read_spreadsheet(args.filename[0], args.sheet, header)
    print(data.shape)

    if args.first_tranche:
        experiment_file_list = paper_433_experiment_files.split('\n')
    else:
        experiment_file_list = ASOF_RUN17_experiment_files.split('\n')
    experiment_files = [os.path.expanduser(x.strip())
                        for x in experiment_file_list]
    experiments = load_experiments(experiment_files)
    experiments['replicates'] = experiments['replicates'].apply(
        lambda l: [x.replace('_mm10', '').replace('_clean', '') for x in l])

    current_experiments = find_experiments_to_submit(experiments, data)

    aliases_tsv = '{}-aliases.tsv'.format(args.name)
    make_library_aliases(current_experiments, aliases_tsv)

    submission_fastqs_tsv = '{}-fastqs.tsv'.format(args.name)
    if not os.path.exists(submission_fastqs_tsv):
        fastq_urls = find_all_fastqs(root_fastq_url, current_experiments, submission_fastqs_tsv)

    fastq_urls = pandas.read_csv(submission_fastqs_tsv, sep='\t')

    barcodes_tsv = '{}-barcodes.tsv'.format(args.name)
    make_library_barcodes(fastq_urls, barcodes_tsv)

    metadata_tsv = '{}-flowcell-details.tsv'.format(args.name)
    metadata = make_metadata(fastq_urls, root_fastq_url, metadata_tsv)

    merge_file = '{}-merge-fastqs.condor'.format(args.name)
    make_desplit_condor(fastq_urls, metadata, desplit, root_fastq_url, merge_file)
Example #22
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    configure_logging(args)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)
    if args.use_experiment:
        try:
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            logger.error('{} was not found in {}'.format(
                args.use_experiment, ', '.join(list(experiments.index))))
            return None

    if len(args.gene_type_filter) > 0:
        logger.info('Limiting to the following gene types {}'.format(','.join(
            args.gene_type_filter)))
    else:
        logger.info('Using all gene types')

    # ids will be None if args.gene_list_filter is None
    ids = load_gene_id_list(args.gene_list_filter)

    plot = GenesDetectedPlot(
        experiments,
        libraries,
        args.genome_dir,
        args.quantification,
        gene_type_filter=args.gene_type_filter,
        gene_list_filter=ids,
    )

    if __name__ == '__main__':
        curdoc().add_root(plot.static_layout())
        save(curdoc(), args.output, title=plot.title)

    return plot
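load_gene_id_list matches the comment above its call site: it returns None when no filename is given, otherwise the gene ids read from the file. A sketch:

def load_gene_id_list(filename):
    """Read one gene id per line; return None if no file was given (sketch)."""
    if filename is None:
        return None
    with open(filename, 'rt') as instream:
        return [line.strip() for line in instream if line.strip()]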
Example #23
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    configure_logging(args)

    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    gtf_cache = None
    if args.add_names:
        if args.genome_dir is None:
            parser.error(
                'genome-dir is needed to add names to the quantification file')
        else:
            gtf_cache = GTFCache(libraries, args.genome_dir)

    if len(args.quantification) > 0:
        quantification_list = args.quantification
    else:
        quantification_list = ['FPKM']

    if args.transcriptome:
        # isoforms
        RsemLoader = IsoformRsemLoader
    else:
        # genes
        RsemLoader = GeneRsemLoader

    for quantification in quantification_list:
        logger.info('Building expression matrix for %s', quantification)
        for i, experiment in experiments.iterrows():
            loader = RsemLoader(quantification, gtf_cache)
            matrix = loader.load(experiment, libraries)
            loader.save(matrix, args.output_format)
Example #24
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    configure_logging(args)

    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)
    else:
        annotation = None

    loader = StarLoader(args.strand, annotation)

    for i, experiment in experiments.iterrows():
        quantification = loader.load(experiment, libraries)
        loader.save(quantification, args.output_format)
Example #25
    def setUp(self):
        self.exp_tsv = resource_filename(__name__, 'experiments-mm10.tsv')
        self.lib_tsv = resource_filename(__name__, 'library-mm10-se.tsv')
        self.libraries = models.load_library_tables([self.lib_tsv])
        self.experiments = models.load_experiments([self.exp_tsv])
Example #26
    def test_load_numeric_experiment(self):
        filename = resource_filename(__name__, 'experiments-numeric.tsv')
        experiment = models.load_experiments([filename])
        for name in experiment.index:
            self.assertIsInstance(name, str)
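This test implies load_experiments coerces the experiment index to strings even when the names look numeric; with pandas that is typically a one-liner after loading, e.g. (assumed, not the project's actual code):

import pandas

def load_experiments_sketch(filename):
    """Load an experiment table, forcing string experiment names (sketch)."""
    experiments = pandas.read_csv(filename, sep='\t', index_col=0)
    experiments.index = experiments.index.astype(str)
    return experiments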
Example #27
    def __init__(self, experiments, sep='\t', analysis_root=None):
        self.name = None
        # pass analysis_root through; otherwise the parameter is unused
        self.experiments = load_experiments(experiments, sep=sep,
                                            analysis_root=analysis_root)
        self.quantification_name = None
        self.quantification = None