Exemplo n.º 1
0
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """Set the sample name, locate its folder in `final`, and find its data files.

        Stores the raw and sanitized names, resolves the sample directory under
        the project's final directory, then looks up the BAM and either the gene
        counts file (RNA-seq samples) or the variant files (all other samples).

        :param name: sample name; dots are replaced with underscores to build
            the name used for the folder lookup.
        :param variantcallers_data: passed through to `_set_variant_files`;
            when empty, the VCF lookup is skipped with a warning.
        :param ensemble: passed through to `_set_variant_files`.
        :param silent: suppress warnings, and return False instead of calling
            `critical` when the sample folder cannot be verified.
        :return: False when the folder is missing and `silent` is set,
            True otherwise.
        """
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')

        # Verify the folder exactly once and honour `silent`. Previously the
        # folder was verified twice: the first call always logged (even in
        # silent mode) and its possibly empty result was fed into the second.
        self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name), silent=silent)
        if not self.dirpath:
            if silent:
                return False
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                     f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')
        self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            elif not silent:
                warn('Counts for ' + self.name + ' not found')
        elif variantcallers_data:
            self._set_variant_files(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
        return True
Exemplo n.º 2
0
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """Set the sample name, locate its folder in `final`, and find its data files.

        Stores the raw name, the sanitized name (also used as `rgid`), resolves
        the sample directory under the parent project's final directory, then
        looks up the BAM and either the gene counts file (RNA-seq samples) or
        the variant callers (all other samples).

        :param name: sample name; dots are replaced with underscores to build
            the name used for the folder lookup.
        :param variantcallers_data: passed through to `_set_variant_callers`;
            when empty, the VCF lookup is skipped with a warning.
        :param ensemble: passed through to `_set_variant_callers`.
        :param silent: suppress warnings and the directory-verification log
            message. `critical` is still called when the folder is missing.
        """
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.rgid = self.name

        # Verify the folder exactly once and honour `silent`. Previously the
        # folder was verified twice: the first call always logged (even in
        # silent mode) and its possibly empty result was fed into the second.
        self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name), silent=silent)
        if not self.dirpath:
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                     f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            elif not silent:
                warn('Counts for ' + self.name + ' not found')
        elif variantcallers_data:
            self._set_variant_callers(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
Exemplo n.º 3
0
def set_genomes_dir(new_genomes_dir=None):
    """Set the module-level `genomes_dir`.

    An explicitly given directory is checked with
    `verify_dir(..., is_critical=True)` and stored as passed in; otherwise the
    value comes from `find_genomes_dir()`. When the resulting value is empty,
    `critical` is called.

    :param new_genomes_dir: explicit genomes directory, or a falsy value to
        fall back to `find_genomes_dir()`.
    """
    global genomes_dir

    if not new_genomes_dir:
        genomes_dir = find_genomes_dir()
    else:
        # genomes_dir was provided explicitly (in paths.yaml or with --genomes-dir)
        verify_dir(new_genomes_dir, is_critical=True)
        genomes_dir = new_genomes_dir

    if genomes_dir:
        return
    critical(
        'Could not detect genomes dir. Please specify one with --genomes-dir or $UMCCRISE_GENOMES'
    )
Exemplo n.º 4
0
 def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
     """Resolve the bcbio final directory.

     Resolution order:
       1. an explicitly passed `final_dir` is returned untouched;
       2. `bcbio_cnf['upload']['dir']`, joined onto `config_dir`;
       3. a `final` folder next to the parent of `config_dir`.

     :param bcbio_cnf: bcbio configuration mapping.
     :param config_dir: directory that the configured paths are joined onto.
     :param final_dir: explicit final directory; short-circuits the lookup.
     :param create_dir: call `safe_mkdir` on the resolved directory before
         verifying it (not applied to an explicitly passed `final_dir`).
     :return: the final directory path.
     """
     if final_dir:
         return final_dir

     has_upload_dir = 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']
     if has_upload_dir:
         resolved = adjust_path(join(config_dir, bcbio_cnf['upload']['dir']))
         if create_dir:
             safe_mkdir(resolved)
         verify_dir(resolved, 'upload directory specified in the bcbio config', is_critical=True)
         return resolved

     # No upload dir configured: fall back to the conventional `final` folder.
     resolved = abspath(join(config_dir, pardir, 'final'))
     if create_dir:
         safe_mkdir(resolved)
     if not verify_dir(resolved):
         critical('If final directory it is not named "final", please, specify it in the bcbio config.')
     return resolved
Exemplo n.º 5
0
 def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
     """Return the bcbio final directory, looking it up when not given.

     A truthy `final_dir` is returned as is. Otherwise the directory is taken
     from `bcbio_cnf['upload']['dir']` (joined onto `config_dir`) when that key
     exists, or defaults to `<config_dir>/../final`. With `create_dir` set, the
     looked-up directory is created via `safe_mkdir` before being verified.

     :param bcbio_cnf: bcbio configuration mapping.
     :param config_dir: directory that the configured paths are joined onto.
     :param final_dir: explicit final directory; skips the lookup entirely.
     :param create_dir: create the looked-up directory before verification.
     :return: the final directory path.
     """
     if not final_dir:
         if 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
             upload_dirname = bcbio_cnf['upload']['dir']
             final_dir = adjust_path(join(config_dir, upload_dirname))
             if create_dir:
                 safe_mkdir(final_dir)
             verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
         else:
             final_dir = abspath(join(config_dir, pardir, 'final'))
             if create_dir:
                 safe_mkdir(final_dir)
             if not verify_dir(final_dir):
                 critical('If final directory it is not named "final", please, specify it in the bcbio config.')
     return final_dir
Exemplo n.º 6
0
def load_data(data_dir, name, genome, reuse=False):
    """Register a project built from the BAM files found in `data_dir`.

    Every `*.bam` file in the directory is mapped to the sample name returned
    by `bam_samplename`. At most one `*.bed` file may be present; when there is
    none, the project is added with `use_callable=True`.

    :param data_dir: directory with the BAM files (verified critically).
    :param name: project name; a leading `--name` marker is stripped.
    :param genome: passed through to `_add_project`.
    :param reuse: passed to `_add_project` as `reuse_files`.
    """
    print(f"reuse: {reuse}")
    data_dir = verify_dir(data_dir, is_critical=True)

    bam_files = glob.glob(join(data_dir, '*.bam'))
    assert bam_files, 'No BAM files in ' + data_dir

    name_flag = '--name'
    if name.startswith(name_flag):
        name = name.split(name_flag)[1]

    bed_files = glob.glob(join(data_dir, '*.bed'))
    if bed_files:
        assert len(bed_files) == 1, 'Multuple BED files in ' + data_dir + ': ' + str(bed_files)
    bed_file = bed_files[0] if bed_files else None

    # Sample name -> BAM path; a later BAM with the same sample name wins.
    bam_by_sample = {bam_samplename(bam_path): bam_path for bam_path in bam_files}

    _add_project(
        bam_by_sample=bam_by_sample,
        project_name=name,
        bed_file=bed_file,
        use_callable=not bed_file,
        data_dir=data_dir,
        genome=genome,
        min_depth=DEPTH_CUTOFF,
        reuse_files=reuse,
    )
Exemplo n.º 7
0
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
            else:
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                else:
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                    else:
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, newest_dir = sorted(dates, reverse=True)[0]
                        newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directory found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
            safe_mkdir(date_dir)
        return date_dir
Exemplo n.º 8
0
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
            else:
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                else:
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                    else:
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, newest_dir = sorted(dates, reverse=True)[0]
                        newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directory found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
            safe_mkdir(date_dir)
        return date_dir
Exemplo n.º 9
0
 def _check_dir_not_empty(dirpath, description=None):
     """Assert that `dirpath` is a valid directory with usable visible content.

     Checks, in order, that the directory verifies, that it holds at least one
     entry whose name does not start with a dot, and that every such entry
     resolving to a regular file passes `verify_file(..., is_critical=True)`.

     :param dirpath: directory to check.
     :param description: passed through to `verify_dir`.
     :raises AssertionError: when any of the checks fails.
     """
     assert verify_dir(dirpath, description=description), dirpath

     contents = []
     for entry in os.listdir(dirpath):
         if entry.startswith('.'):
             continue  # hidden entries do not count as content
         contents.append(join(dirpath, entry))

     assert len(contents) >= 1, dirpath + ': ' + str(contents)
     assert all(verify_file(resolved, is_critical=True)
                for resolved in map(realpath, contents)
                if isfile(resolved)), dirpath + ': ' + str(contents)
Exemplo n.º 10
0
 def _check_dir_not_empty(dirpath, description=None):
     """Assert that a directory exists and has non-hidden, verifiable content.

     :param dirpath: directory to check; must pass `verify_dir`.
     :param description: passed through to `verify_dir`.
     :raises AssertionError: when the directory does not verify, has no
         entries other than dot-files, or one of its regular files (after
         resolving symlinks) fails `verify_file(..., is_critical=True)`.
     """
     assert verify_dir(dirpath, description=description), dirpath

     visible_names = filter(lambda fname: not fname.startswith('.'), os.listdir(dirpath))
     contents = [join(dirpath, fname) for fname in visible_names]
     assert contents, dirpath + ': ' + str(contents)

     # Only regular files are verified; sub-directories are accepted as they are.
     assert all(
         verify_file(real_fpath, is_critical=True)
         for real_fpath in (realpath(fpath) for fpath in contents)
         if isfile(real_fpath)
     ), dirpath + ': ' + str(contents)
Exemplo n.º 11
0
def main(paths, output_dir, genome, depth):
    """Build a SNP panel from the given BED files and bcbio project folders.

    Each entry of `paths` that is a file is treated as a BED file; each entry
    that is a directory is loaded as a bcbio project.

    :param paths: file and directory paths to process.
    :param output_dir: output directory, created via `safe_mkdir`.
    :param genome: passed through to `build_snps_panel`.
    :param depth: not used in this function body.
    """
    log.init(True)

    bed_files = [verify_file(path, is_critical=True) for path in filter(isfile, paths)]
    project_dirs = [verify_dir(path, is_critical=True) for path in filter(isdir, paths)]

    bcbio_projs = []
    for project_dir in project_dirs:
        project = BcbioProject()
        project.load_from_bcbio_dir(project_dir, proc_name='clearup')
        bcbio_projs.append(project)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
Exemplo n.º 12
0
    def __init__(self,
                 genome,
                 filt_cnf,
                 tricky_regions_dir,
                 transcripts_fpath,
                 reg_exp_sample=None,
                 platform=None):
        """Load the filtering reference data and thresholds into the instance.

        Resolves the reference files through `filt_ref_data`, parses them into
        lookup sets/dicts stored on `self`, copies the thresholds from
        `filt_cnf`, and initialises the counters and output handles.

        :param genome: genome identifier passed to the genome-specific
            `filt_ref_data` lookups.
        :param filt_cnf: mapping with the filtering thresholds and flags read
            below (`max_ratio`, `min_freq`, `filt_depth`, ...).
        :param tricky_regions_dir: not used by the active code in this body
            (only referenced in the commented-out region blacklist section).
        :param transcripts_fpath: file with transcript IDs, one per line; the
            part before the first dot of each line is kept.
        :param reg_exp_sample: stored as `self.reg_exp_sample`.
        :param platform: stored as `self.platform`.
        """
        # Per-reason counters, all starting at 0.
        self.all_reject_counter = OrderedDefaultDict(int)
        self.all_counter = OrderedDefaultDict(int)
        self.gene_blacklist_counter = OrderedDefaultDict(int)
        self.region_blacklist_counter = OrderedDefaultDict(int)

        # Resolve and verify every reference file/directory up front; the
        # results are checked together in the `all([...])` below.
        compendia_fpath = verify_file(filt_ref_data.compendia(genome),
                                      'compendia_ms7_hotspot')
        actionable_fpath = verify_file(filt_ref_data.actionable(genome),
                                       'actionable')
        filter_common_snp_fpath = verify_file(filt_ref_data.common_snp(genome),
                                              'filter_common_snp')
        filter_common_arti_fpath = verify_file(
            filt_ref_data.common_art(genome), 'filter_common_artifacts')
        splice_fpath = verify_file(filt_ref_data.splice(genome), 'splice')
        suppressors_fpath = verify_file(filt_ref_data.suppressors(),
                                        'suppressors')
        oncogenes_fpath = verify_file(filt_ref_data.oncogenes(), 'oncogenes')
        ruledir = verify_dir(filt_ref_data.ruledir(), 'ruledir')
        snpeffect_polymorph_fpath = verify_file(
            filt_ref_data.snpeffect_export_polymorphic(),
            'snpeffect_export_polymorphic')
        actionable_hotspot_fpath = verify_file(
            filt_ref_data.actionable_hotspot(), 'actionable_hotspot')
        specific_mutations_fpath = verify_file(
            filt_ref_data.specific_mutations(), 'specific_mutations')
        last_critical_aa_fpath = verify_file(filt_ref_data.last_critical_aa(),
                                             'last_critical_aa')
        incidentalome_dir = verify_dir(filt_ref_data.incidentalome_dir(),
                                       'incidentalome')
        comments_fpath = verify_file(filt_ref_data.ngs_reports_comments(),
                                     'ngs_reports_comments')
        # NOTE(review): a missing file is only logged here; nothing in this
        # block stops execution, so the parsing below still runs - confirm
        # whether logger.err is expected to abort.
        if not all([
                compendia_fpath,
                actionable_fpath,
                filter_common_snp_fpath,
                filter_common_arti_fpath,
                splice_fpath,
                suppressors_fpath,
                oncogenes_fpath,
                ruledir,
                snpeffect_polymorph_fpath,
                actionable_hotspot_fpath,
                specific_mutations_fpath,
                last_critical_aa_fpath,
                incidentalome_dir,
                comments_fpath,
        ]):
            logger.err(
                'Error: some of the required files are not found or empty (see above)'
            )

        self.suppressors = parse_genes_list(adjust_path(suppressors_fpath))
        self.oncogenes = parse_genes_list(adjust_path(oncogenes_fpath))

        self.reg_exp_sample = reg_exp_sample
        self.platform = platform

        # NOTE(review): self.transcripts is assigned only when the file
        # verifies - confirm a default is provided elsewhere.
        transcripts_fpath = verify_file(transcripts_fpath, silent=True)
        if transcripts_fpath:
            logger.info('Using canonical transcripts from ' +
                        transcripts_fpath)
            with open(transcripts_fpath) as f:
                # Keep the part of each line before the first dot.
                self.transcripts = [tr.strip().split('.')[0] for tr in f]

        # Thresholds and flags copied from the filtering config.
        self.max_ratio = filt_cnf['max_ratio']
        self.max_sample_cnt = filt_cnf['max_sample_cnt']

        self.min_freq = filt_cnf['min_freq']  # for all variants
        self.act_min_freq = filt_cnf['act_min_freq']
        # NOTE(review): `//` is floor division; if min_freq is a fraction
        # below 2, this fallback evaluates to 0 - confirm `/` was not intended.
        self.act_min_freq = self.act_min_freq or self.min_freq // 2
        self.germline_min_freq = filt_cnf['germline_min_freq']

        self.filt_depth = filt_cnf['filt_depth']
        self.min_vd = filt_cnf['min_vd']
        self.min_gmaf = filt_cnf['min_gmaf']

        self.keep_utr_intronic = filt_cnf['keep_utr_intronic']
        self.keep_whole_genome = filt_cnf['keep_whole_genome']
        self.keep_hla = filt_cnf['keep_hla']
        # Optional key: None when absent from the config.
        self.damage_p_value = filt_cnf.get('damage_p_value')

        logger.info('Parsing filtering data...')
        # TP53 mutation groups, one rule file per group.
        self.tp53_groups = {
            'Group 1': parse_mut_tp53(join(ruledir, 'DNE.txt')),
            'Group 2': parse_mut_tp53(join(ruledir, 'TA0-25.txt')),
            'Group 3': parse_mut_tp53(join(ruledir, 'TA25-50_SOM_10x.txt'))
        }

        # gene -> set of positions (kept as strings), from two-column lines.
        self.splice_positions_by_gene = defaultdict(set)
        for l in iter_lines(splice_fpath):
            pos, g = l.split('\t')
            self.splice_positions_by_gene[g].add(pos)

        # gene -> integer position, from three-column lines (third column ignored).
        self.last_critical_aa_pos_by_gene = dict()
        for l in iter_lines(last_critical_aa_fpath):
            g, aa_pos, _ = l.split('\t')
            self.last_critical_aa_pos_by_gene[g] = int(aa_pos)

        # Keys are columns 1-4 of each line joined with '-'.
        self.filter_snp = set()
        for l in iter_lines(filter_common_snp_fpath):
            fields = l.split('\t')
            self.filter_snp.add('-'.join(fields[1:5]))

        # Entries with a non-empty column 11 are keyed as "<col 11>-<col 2>";
        # the others are recorded by column 5 unless it is '-'.
        self.snpeff_snp = set()
        self.snpeff_snp_rsids = set()
        for l in iter_lines(snpeffect_polymorph_fpath):
            fields = l.split('\t')
            snpeff_aachg = fields[2]
            snpeff_rsid = fields[5]
            if len(fields) > 11 and fields[11]:
                snpeff_gene = fields[11]
                self.snpeff_snp.add('-'.join([snpeff_gene, snpeff_aachg]))
            elif snpeff_rsid != '-':
                self.snpeff_snp_rsids.add(snpeff_rsid)

        # Lines with 'rule' in column 5 become per-gene Rule objects; all
        # other lines are stored as "chrom-start-ref-alt" keys.
        self.filter_artifacts = set()
        self.filter_rules_by_gene = defaultdict(list)
        for l in iter_lines(filter_common_arti_fpath):
            fields = l.split('\t')
            if fields[5] == 'rule':
                gene, chrom, start, end, action, _, _, _, note = fields[:9]
                rule = Rule(gene,
                            chrom=chrom,
                            start=int(start),
                            end=int(end),
                            action=action,
                            note=note)
                self.filter_rules_by_gene[gene].append(rule)
            else:
                gene, chrom, start, ref, alt = fields[:5]
                self.filter_artifacts.add('-'.join([chrom, start, ref, alt]))

        # Gene prefix decides the handling: '#' entries are skipped, '^' entries
        # go to common_snps_by_gene, all others are recorded as actionable
        # hotspots labelled 'somatic' or 'germline'. Empty lines and lines
        # starting with '##' are ignored.
        self.actionable_hotspot_by_gene = defaultdict(dict)
        self.common_snps_by_gene = defaultdict(set)
        with open(actionable_hotspot_fpath) as f:
            for l in f:
                l = l.replace('\n', '')
                if not l or l.startswith('##'):
                    continue
                fields = l.split('\t')
                gene = fields[0]
                prot_change = fields[1]
                if gene.startswith('#'):  # VUS, No special treatment for now
                    gene = gene[1:]
                elif gene.startswith('^'):
                    gene = gene[1:]
                    self.common_snps_by_gene[gene].add(prot_change)
                else:
                    is_somatic = fields[2] == 'somatic'
                    self.actionable_hotspot_by_gene[gene][
                        prot_change] = 'somatic' if is_somatic else 'germline'

        # Tab-separated table with Gene / AA_Change / Note columns; rows
        # starting with '#' are skipped. '^'-prefixed genes add actionable
        # hotspots, all other rows are stored as report comments.
        self.ngs_reports_comments = defaultdict(dict)
        with open(comments_fpath) as f:
            for r in csv.DictReader(
                (row for row in f if not row.startswith('#')), delimiter='\t'):
                gene = r['Gene']
                prot_change = r['AA_Change']
                if gene.startswith('^'):
                    gene = gene[
                        1:]  # remove leading ^ character, e.g. ^EGFR -> EGFR
                    is_somatic = 'somatic' in r['Note']
                    self.actionable_hotspot_by_gene[gene][
                        prot_change] = 'somatic' if is_somatic else 'germline'
                else:
                    self.ngs_reports_comments[gene][prot_change] = r['Note']

        # Column 7 selects germline vs somatic handling; somatic lines with
        # 'rule' in column 6 become either a position key (when column 4 is
        # '*' and column 3 is a single character) or a Rule object.
        self.act_somatic = dict()
        self.act_germline = set()
        self.rules = defaultdict(list)
        for l in iter_lines(actionable_fpath):
            fields = l.split('\t')

            if fields[7] == 'germline':
                key = '-'.join(fields[1:5])
                self.act_germline.add(key)

            elif fields[7] == 'somatic':
                change = fields[8].strip()
                if fields[6] == 'rule':
                    if fields[4] == '*' and len(fields[3]) == 1:
                        key = '-'.join(fields[1:4])
                        self.act_somatic[key] = change
                    else:
                        # 'indel' is tested first because it also contains 'del'.
                        indel_type = ''
                        if 'indel' in fields[5]: indel_type = 'indel'
                        elif 'ins' in fields[5]: indel_type = 'ins'
                        elif 'del' in fields[5]: indel_type = 'del'
                        rule = Rule(gene=fields[0],
                                    chrom=fields[1],
                                    start=int(fields[2]),
                                    end=int(fields[3]),
                                    length=int(fields[4]),
                                    required_inframe='inframe' in fields[5],
                                    indel_type=indel_type,
                                    change=change)
                        self.rules[rule.gene].append(rule)
                    # elif fields[5] == inframe_del:
                    #     self.rules[inframe_del].setdefault(fields[0], []).append([fields[1]] + [int (f) for f in fields[2:5]])
                    # elif fields[5] == inframe_ins:
                    #     self.rules[inframe_ins].setdefault(fields[0], []).append([fields[1]] + [int (f) for f in fields[2:5]])

                else:
                    key = '-'.join(fields[1:5])
                    self.act_somatic[key] = change

        # Lines whose column 5 starts with 'g.' are skipped; the rest add a
        # nucleotide key (columns 1-4) and, when column 6 is non-empty, a
        # "<col 0>-<col 6>" protein key.
        self.hotspot_nucleotides = set()
        self.hotspot_proteins = set()
        for l in iter_lines(compendia_fpath):
            fields = l.split('\t')
            if fields[5].startswith('g.'):
                continue
            self.hotspot_nucleotides.add('-'.join(fields[1:5]))
            if not fields[6]:
                continue
            self.hotspot_proteins.add('-'.join([fields[0], fields[6]]))

        logger.info('Parsing gene blacklists...')
        anno_cfg = get_anno_config()
        self.gene_blacklists_by_reason = parse_gene_blacklists(
            anno_cfg['blacklist']['genes'], incidentalome_dir)
        # Pre-seed the counter so every reason is present even with zero hits.
        for r in self.gene_blacklists_by_reason.keys():
            self.gene_blacklist_counter[r] = 0
        self.gene_blacklist_counter['hardfilter'] = 0
        # self.gene_to_soft_filter = list(iter_lines(join(incidentalome_dir, 'soft_filter.txt')))

        # self.region_blacklists_by_reason = dict()
        # if tricky_regions_dir:
        #     info('Parsing region blacklists...')
        #     self.region_blacklists_by_reason = load_tricky_regions(anno_cfg['blacklist']['regions'], tricky_regions_dir)
        #     for r in self.region_blacklists_by_reason.keys():
        #         self.region_blacklist_counter[r] = 0

        logger.info('Parsing actionable rules and specific mutations...')
        self.tier_by_specific_mutations, self.tier_by_type_by_region_by_gene, self.sensitizations_by_gene\
            = parse_specific_mutations(specific_mutations_fpath)

        # Report every required lookup that ended up empty.
        if not all([
                self.rules, self.splice_positions_by_gene, self.act_somatic,
                self.act_germline, self.actionable_hotspot_by_gene
        ]):
            if not self.rules:
                logger.err('No rules, cannot proceed')
            # NOTE(review): the message says tp53_positions but the check is
            # on splice_positions_by_gene - confirm the intended wording.
            if not self.splice_positions_by_gene:
                logger.err('No tp53_positions, cannot proceed')
            if not self.act_somatic:
                logger.err('No act_somatic, cannot proceed')
            if not self.act_germline:
                logger.err('No act_germline, cannot proceed')
            if not self.actionable_hotspot_by_gene:
                logger.err('No actionable_hotspots, cannot proceed')

        # Per-run state and output handles, initialised to None here.
        self.status = None
        self.reason_by_status = None

        self.output_f = None
        self.fm_output_f = None
        self.rejected_output_f = None