Example #1
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        info('Parsing the NextSeq500 project structure')
        self.kind = 'nextseq500'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)
        info('az_prjname_by_subprj: ' + str(az_prjname_by_subprj))

        verify_dir(self.unaligned_dirpath, is_critical=True)

        for pname, project in self.project_by_name.items():
            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warn: cannot correspond subproject ' + pname +
                        ' and project names and JIRA cases. '
                        'Please, follow the SOP for multiple-project run: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(self.unaligned_dirpath, az_proj_name)
            for sample in project.sample_by_name.values():
                sample.source_fastq_dirpath = project.dirpath
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        self.get_fastq_regexp_fn = get_nextseq500_regexp
Example #2
    def set_dirpath(self, dirpath, az_project_name):
        self.dirpath = dirpath
        self.az_project_name = az_project_name
        verify_dir(self.dirpath, is_critical=True)

        merged_dirpath = join(self.dirpath, 'merged')
        if verify_dir(merged_dirpath, silent=True):
            self.merged_dir_found = True
            self.fastq_dirpath = self.fastqc_dirpath = merged_dirpath
        else:
            self.merged_dir_found = False
            self.fastq_dirpath = join(self.dirpath, 'fastq')
            self.fastqc_dirpath = join(self.fastq_dirpath, 'FastQC')
        info()

        self.comb_fastqc_fpath = join(self.fastqc_dirpath, 'FastQC.html')
        self.downsample_targqc_report_fpath = None
        self.project_report_html_fpath = None

        self.downsample_metamapping_dirpath = join(self.dirpath,
                                                   'Downsample_MetaMapping')
        self.downsample_targqc_dirpath = join(self.dirpath,
                                              'Downsample_TargQC')
        self.downsample_targqc_report_fpath = join(
            self.downsample_targqc_dirpath, 'targQC.html')
        self.project_report_html_fpath = join(self.dirpath,
                                              az_project_name + '.html')
Example #3
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Example #4
    def __init__(self, dirpath, az_prjname_by_subprj=None, samplesheet=None):
        info('Parsing the HiSeq project structure')
        self.kind = 'hiseq'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)

        verify_dir(self.unaligned_dirpath, is_critical=True)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        for pname, project in self.project_by_name.items():
            proj_dirpath = join(
                self.unaligned_dirpath, 'Project_' + pname.replace(
                    ' ', '-'))  #.replace('-', '_').replace('.', '_'))

            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warn: cannot correspond subproject ' + pname +
                        ' and project names and JIRA cases. '
                        'Please, follow the SOP for multiple-project run: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(proj_dirpath, az_proj_name)
            for sname, sample in project.sample_by_name.items():
                sample.source_fastq_dirpath = join(
                    project.dirpath, 'Sample_' + sname.replace(
                        ' ', '-'))  #.replace('-', '_').replace('.', '_'))
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

            basecalls_symlink = join(project.dirpath, 'BaseCallsReports')
            if not exists(basecalls_symlink):
                info('Creating BaseCalls symlink ' + self.basecalls_dirpath +
                     ' -> ' + basecalls_symlink)
                try:
                    os.symlink(self.basecalls_dirpath, basecalls_symlink)
                except OSError:
                    err('Cannot create symlink')
                    traceback.print_exc()
                else:
                    info('Created')
            if exists(basecalls_symlink):
                self.basecalls_dirpath = basecalls_symlink

        self.get_fastq_regexp_fn = get_hiseq_regexp
Example #5
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option(
        '--downsample-to',
        dest='downsample_to',
        default=None,
        type='int',
        help=
        'Downsample reads to avoid excessive processing times with large files. '
        'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts),
                 determine_run_cnf(opts))
    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)
    output_dirpath = adjust_path(
        opts.output_dir) if opts.output_dir else critical(
            'Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath),
               description='output_dir',
               is_critical=True)

    with workdir(cnf):
        sample_name = cnf.sample_name
        if not sample_name:
            sample_name = _get_sample_name(left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(cnf,
                                    sample_name,
                                    left_reads_fpath,
                                    right_reads_fpath,
                                    output_dirpath,
                                    downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    info()
    info('*' * 70)
    info('Fastqc results:')
    info('  ' + results_dirpath)
Example #6
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
            'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir else critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath =\
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
    fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
    fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

    cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
    if opts.sample_name:
        cmdl += ' --sample {opts.sample_name}'
    if opts.downsample_to:
        cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
    cmdl = cmdl.format(**locals())
    cmdl += ' 2>&1'
    info(cmdl)
    stdin, stdout, stderr = ssh.exec_command(cmdl)
    for l in stdout:
        err(l, ending='')
    info()
    ssh.close()
Example #7
def check_dirs_and_files(cnf, file_keys=list(), dir_keys=list()):
    errors = []

    def _verify_input_file(_key):
        cnf[_key] = adjust_path(cnf[_key])
        if not verify_file(cnf[_key], _key):
            return False
        if 'bam' in _key and not verify_bam(cnf[_key]):
            return False
        if 'bed' in _key and not verify_bed(cnf[_key]):
            return False
        return True

    for key in file_keys:
        if key and key in cnf and cnf[key]:
            if not _verify_input_file(key):
                errors.append('File ' + cnf[key] +
                              ' is empty or cannot be found')
            else:
                cnf[key] = adjust_path(cnf[key])

    for key in dir_keys:
        if key and key in cnf and cnf[key]:
            cnf[key] = adjust_path(cnf[key])
            if not verify_dir(cnf[key], key):
                errors.append('Directory ' + cnf[key] +
                              ' is empty or cannot be found')
            else:
                cnf[key] = adjust_path(cnf[key])

    return errors
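
A hypothetical call to check_dirs_and_files, assuming cnf is the dict-like Config object seen in the other examples and that 'bam', 'bed' and 'output_dir' are keys it may hold; this is a usage sketch, not code from the project:

    errors = check_dirs_and_files(cnf, file_keys=['bam', 'bed'], dir_keys=['output_dir'])
    if errors:
        # critical() is the abort-with-message helper used throughout these examples
        critical('\n'.join(errors))
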
Example #8
    def __find_unaligned_dir(self):
        unaligned_dirpath = join(self.dirpath, 'Unalign')
        if not verify_dir(unaligned_dirpath,
                          description='"Unalign" directory',
                          silent=True):
            unaligned_dirpath = None
            warn('No unalign directory')
        return unaligned_dirpath
Example #9
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        self.az_prjname_by_subprj = az_prjname_by_subprj

        illumina_project_name = None
        if '/Unalign/' in dirpath:
            self.dirpath = dirpath.split('/Unalign/')[0]
            self.unaligned_dirpath = self.__find_unaligned_dir()
            verify_dir(self.unaligned_dirpath,
                       description='Unalign dir',
                       is_critical=True)
            illumina_project_name = dirpath.split(
                '/Unalign/'
            )[1]  # something like AURA.FFPE.AZ300, in contrast with project_name, which is something like Bio_123_AURA_FFPE_AZ300
            info('Processing sub-project ' + illumina_project_name)
        else:
            self.dirpath = dirpath
            self.unaligned_dirpath = self.__find_unaligned_dir()

        self.basecalls_dirpath = join(self.dirpath,
                                      'Data/Intensities/BaseCalls')
        verify_dir(self.basecalls_dirpath, is_critical=True)

        self.bcl2fastq_dirpath = None
        self.source_fastq_dirpath = None

        if samplesheet:
            self.samplesheet_fpath = samplesheet
        else:
            self.samplesheet_fpath = self.__find_sample_sheet()
        self.project_by_name = self._parse_sample_sheet(self.samplesheet_fpath)

        if illumina_project_name:  # we want only a specific project
            if illumina_project_name not in self.project_by_name:
                info()
                critical('Err: project ' + illumina_project_name +
                         ' not in the SampleSheet ' + self.samplesheet_fpath)
            else:
                self.project_by_name = {
                    illumina_project_name:
                    self.project_by_name[illumina_project_name]
                }
Example #10
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        info('Parsing the MiSeq project structure')
        self.kind = 'miseq'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)

        base_dirpath = self.unaligned_dirpath
        if not verify_dir(base_dirpath, silent=True):
            base_dirpath = self.basecalls_dirpath
        verify_dir(base_dirpath, description='Source fastq dir')

        for pname, project in self.project_by_name.items():
            proj_dirpath = join(base_dirpath, pname)
            if not verify_dir(proj_dirpath, silent=True):
                proj_dirpath = base_dirpath

            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warn: cannot correspond subproject ' + pname +
                        ' and project names and JIRA cases. '
                        'Please, follow the SOP for multiple-project run: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(proj_dirpath, az_proj_name)
            for sample in project.sample_by_name.values():
                sample.source_fastq_dirpath = project.dirpath
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

        self.basecall_stat_html_reports = []

        self.get_fastq_regexp_fn = get_hiseq4000_miseq_regexp
Example #11
def proc_opts():
    parser = OptionParser(description='')
    # without the shared options, opts.debug below would not exist
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug
    if len(args) < 1:
        critical('First argument should be a root datasets dir')
    # if len(args) < 2:
    #     info('No dataset path specified, assuming it is the current working directory')
    #     dataset_dirpath = adjust_path(os.getcwd())
    #     jira_url = args[0]
    root_dirpath = verify_dir(args[0], is_critical=True, description='Dataset directory')  # /ngs/oncology/datasets/hiseq/150521_D00443_0159_AHK2KTADXX

    info(' '.join(sys.argv))

    return root_dirpath
Example #12
def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)
    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)
    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link
Example #13
def _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    if cnf.reuse_intermediate and verify_dir(plots_dirpath) and [
            f for f in listdir(plots_dirpath) if not f.startswith('.')
    ]:
        info('Qualimap multisample plots exist - ' + plots_dirpath +
             ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len(
            [s.qualimap_html_fpath
             for s in samples if s.qualimap_html_fpath]) > 0:
            qualimap = get_system_path(cnf,
                                       interpreter_or_name=None,
                                       name='qualimap')

            if qualimap is not None and get_qualimap_type(qualimap) == 'full':
                qualimap_output_dir = join(cnf.work_dir,
                                           'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(cnf, samples)
                _correct_qualimap_insert_size_histogram(cnf, samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(
                    rows,
                    join(qualimap_output_dir,
                         'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir,
                                              'images_multisampleBamQcReport')
                cmdline = '{qualimap} multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                    **locals())
                res = call(cnf,
                           cmdline,
                           exit_on_error=False,
                           return_err_code=True,
                           env_vars=dict(DISPLAY=None),
                           output_fpath=qualimap_plots_dirpath,
                           output_is_dir=True)
                if res is None or not verify_dir(qualimap_plots_dirpath):
                    warn(
                        'Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.'
                    )
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn(
                    'Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.'
                )
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Example #14
def main(args):
    if len(args) < 2:
        sys.exit('Usage: ' + __file__ +
                 ' input.tsv bcbio.csv [dir_with_bams] [bina_dir]')

    inp_fpath = args[0]
    verify_file(args[0], is_critical=True)

    out_fpath = args[1]
    verify_dir(dirname(adjust_path(out_fpath)), is_critical=True)

    bam_dirpath = None
    if len(args) > 2:
        bam_dirpath = args[2]
        verify_dir(adjust_path(bam_dirpath), is_critical=True)

    # bam_opt = args[2]
    # try:
    #     bam_col = int(bam_opt)
    #     bam_dirpath = None
    # except ValueError:
    #     bam_col = None
    #     verify_dir(bam_opt, is_critical=True)
    #     bam_dirpath = args[2]

    bina_dirpath = None
    if len(args) > 3:
        bina_dirpath = args[3]
        verify_dir(dirname(adjust_path(bina_dirpath)), is_critical=True)

    # filtered_bams_dirpath = adjust_path(sys.argv[3])
    # verify_dir(join(filtered_bams_dirpath, os.pardir), is_critical=True)

    columns_names = 'study	barcode	disease	disease_name	sample_type	sample_type_name	analyte_type	library_type	center	center_name	platform	platform_name	assembly	filename	 files_size 	checksum	analysis_id	aliquot_id	participant_id	sample_id	tss_id	sample_accession	published	uploaded	modified	state	reason'

    samples_by_patient = defaultdict(list)

    delim = '\t'
    barcode_col = 1
    bam_col = 13
    is_tcga_tsv = True

    with open(inp_fpath) as fh:
        for i, l in enumerate(fh):
            if not l.strip():
                continue

            if i == 0:
                if len(l.split('\t')) == 27:
                    err('Interpreting as TCGA tsv')
                    if l.split('\t')[0] != 'TCGA': continue  # skipping header
                else:
                    delim = None
                    for j, f in enumerate(l.split()):
                        if f.startswith('TCGA'):
                            barcode_col = j
                            err('barcode col is ' + str(j))
                        if f.endswith('bam'):
                            bam_col = j
                            err('bam col is ' + str(j))
                    is_tcga_tsv = False

            fs = l.split(delim)

            barcode = fs[barcode_col].split(
                '-')  # TCGA-05-4244-01A-01D-1105-08

            sample = Sample()
            sample.bam = fs[bam_col]
            sample.bam_base_name = basename(os.path.splitext(fs[bam_col])[0])
            sample.description = fs[barcode_col]
            sample.patient = '-'.join(barcode[:3])
            if is_tcga_tsv:
                sample.reason = fs[26]

            sample_type = int(barcode[3][:2])
            if sample_type >= 20 or sample_type <= 0:
                continue
            sample.is_normal = 10 <= sample_type < 20
            sample.is_blood = sample_type in [
                3, 4, 9, 10
            ]  # https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm

            if any(s.description == sample.description
                   for s in samples_by_patient[sample.patient]):
                prev_sample = next(s
                                   for s in samples_by_patient[sample.patient]
                                   if s.description == sample.description)

                # comp reason
                # if 'Fileset modified' not in prev_sample.reason and 'Fileset modified' in sample.reason:
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in old ' + prev_sample.name + ' over ' + sample.name)
                #     pass
                # elif 'Fileset modified' in prev_sample.reason and 'Fileset modified' not in sample.reason:
                #     samples_by_patient[sample.patient].remove(prev_sample)
                #     samples_by_patient[sample.patient].append(sample)
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in new ' + sample.name + ' over ' + prev_sample.name)
                # else:
                # comp version
                prev_version = get_bam_version(prev_sample.bam_base_name)
                version = get_bam_version(sample.bam_base_name)
                err('Duplicated sample: ' + sample.description +
                    '  Resolving by version (' + ' over '.join(
                        map(str,
                            sorted([prev_version, version])[::-1])) + ')')
                if version > prev_version:
                    samples_by_patient[sample.patient].remove(prev_sample)
                    samples_by_patient[sample.patient].append(sample)
            else:
                samples_by_patient[sample.patient].append(sample)

    batches = []
    final_samples = set()

    if bina_dirpath:
        safe_mkdir(bina_dirpath)

    for patient, patient_samples in samples_by_patient.iteritems():
        tumours = [s for s in patient_samples if not s.is_normal]
        normals = [s for s in patient_samples if s.is_normal]

        main_normal = None
        if len(normals) >= 1:
            if any(n.is_blood for n in normals):
                main_normal = next(n for n in normals if n.is_blood)
            else:
                main_normal = normals[0]
                if tumours:
                    for n in normals[1:]:
                        b = Batch(n.description + '-batch')
                        b.tumour = n
                        batches.append(b)

        for t in tumours:
            b = Batch(t.description + '-batch')
            b.tumour = t
            t.batches.add(b)
            final_samples.add(t)
            if main_normal:
                b.normal = main_normal
                main_normal.batches.add(b)
                final_samples.add(main_normal)
            batches.append(b)

        ##################
        ###### Bina ######
        if bina_dirpath:
            bina_patient_dirpath = join(bina_dirpath, patient)
            safe_mkdir(bina_patient_dirpath)
            normals_csv_fpath = join(bina_patient_dirpath, 'normals.csv')
            tumours_csv_fpath = join(bina_patient_dirpath, 'tumors.csv')

            if main_normal:
                with open(normals_csv_fpath, 'w') as f:
                    f.write('name,bam\n')
                    bam_fpath = join(
                        bam_dirpath,
                        main_normal.bam) if bam_dirpath else main_normal.bam
                    f.write(main_normal.description + ',' + bam_fpath + '\n')

            with open(tumours_csv_fpath, 'w') as f:
                f.write('name,bam\n')
                for t in tumours:
                    bam_fpath = join(bam_dirpath,
                                     t.bam) if bam_dirpath else t.bam
                    f.write(t.description + ',' + bam_fpath + '\n')

    if bina_dirpath:
        err('Saved bina CSVs to ' + bina_dirpath)

    ###########################
    ######## Bcbio CSV ########
    print 'bcbio_nextgen.py -w template bcbio.yaml', out_fpath,
    with open(out_fpath, 'w') as out:
        out.write('sample,description,batch,phenotype\n')
        for s in sorted(final_samples, key=lambda s: s.bam_base_name):
            out.write(','.join([
                s.bam_base_name, s.description, ';'.join(
                    sorted(b.name for b in s.batches)),
                ('normal' if s.is_normal else 'tumor')
            ]) + '\n')
            bam_fpath = join(bam_dirpath, s.bam) if bam_dirpath else s.bam

            if verify_bam(bam_fpath, is_critical=False):
                try:
                    bam = pysam.Samfile(bam_fpath, "rb")
                except ValueError:
                    err(traceback.format_exc())
                    err('Cannot read ' + bam_fpath)
                    err()
                    # n_rgs = max(1, len(bam.header.get("RG", [])))
                else:
                    print bam_fpath,
Example #15
    def __get_basecall_stats_reports(self):
        dirpath = join(self.unaligned_dirpath, 'Reports', 'html')
        index_html_fpath = join(dirpath, 'index.html')
        if verify_dir(dirpath) and verify_file(index_html_fpath):
            return [index_html_fpath]