Example #1
def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file,
                p_view):
    if is_us() or is_uk():
        try:
            from az.ngb import add_bcbio_project_to_ngb, add_data_to_ngb, add_file_to_ngb
        except ImportError:
            log.warn(
                'To export to NGB, install NGS Reporting with `conda install -c vladsaveliev ngs_reporting`'
            )
        else:
            log.info('Exposing project to NGB...')
            try:
                dataset = project_name + '_Fingerprints'
                add_data_to_ngb(work_dir,
                                p_view,
                                bam_by_sample,
                                dict(),
                                dataset,
                                bed_file=bed_file,
                                genome=genome_build)
                add_file_to_ngb(work_dir,
                                get_dbsnp(genome_build),
                                genome_build,
                                dataset,
                                dataset,
                                skip_if_added=True)
            except Exception:
                traceback.print_exc()
                log.err('Error: cannot export to NGB')
            log.info('*' * 70)
Example #2
def open_gzipsafe(f, mode='r'):
    # mode_t = mode.replace('b', '')
    # mode_b = mode if 'b' in mode else mode + 'b'
    if f.endswith('.gz') or f.endswith('.gzip') or f.endswith('.gz.tx') or f.endswith('.gzip.tx'):
        try:
            h = gzip.open(f, mode=mode + 't', encoding='UTF-8')
        except IOError as e:
            err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
            return open(f, mode=mode)
        else:
            if 'w' in mode:
                return h
            else:
                try:
                    h.read(1)
                except IOError as e:
                    err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
                    h.close()
                    return open(f, mode=mode)
                else:
                    h.close()
                    h = gzip.open(f, mode=mode + 't', encoding='UTF-8')
                    return h
    else:
        return open(f, mode=mode)
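A minimal usage sketch (the file name is hypothetical; `err` stands in for the module's logger used by the snippet):

import gzip, sys

def err(msg):  # stand-in for the module's stderr logger
    sys.stderr.write(str(msg) + '\n')

with open_gzipsafe('calls.vcf.gz', 'w') as out:  # gzip text mode under the hood
    out.write('##fileformat=VCFv4.2\n')

with open_gzipsafe('calls.vcf.gz') as inp:  # reads .gz and plain files alike
    print(inp.read().rstrip())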
Example #4
def file_nonempty_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = file_exists_check(output_fpath)
    if not ok:
        err("Did not find non-empty output file {0}".format(output_fpath))
    return ok
Example #5
def file_exists_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err("Did not find output file {0}".format(output_fpath))
    return ok
Example #6
 def classify_tp53(self, aa_chg, pos, ref, alt):
     aa_chg = aa_chg.replace(' ', '')
     if str(pos) in self.splice_positions_by_gene['TP53'] and len(
             ref) == 1 and len(alt) == 1:
         return 6
     aa_chg = aa_chg.replace('p.', '')
     aa_num = 0
     if aa_chg:
         aa_num_str = re.sub('[^0-9]', '', aa_chg)
         if not aa_num_str:
             logger.err('TP53: cannot parse aa num from aa_chg=' +
                        str(aa_chg))
         else:
             aa_num = int(aa_num_str)
     if aa_snp_chg_pattern.match(aa_chg):
         for i in [1, 2, 3]:
             if aa_chg in self.tp53_groups['Group ' + str(i)]:
                 return i
     elif stop_gain_pattern.match(aa_chg):
         if aa_num < 359:
             return 4
     elif fs_pattern.match(aa_chg):
         if aa_num < 359:
             return 5
     return None
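The module-level patterns referenced above are outside this excerpt; a plausible reconstruction for illustration only (these regexes are assumptions, not the original definitions):

import re

aa_snp_chg_pattern = re.compile(r'^[A-Z]\d+[A-Z]$')  # missense, e.g. R175H
stop_gain_pattern = re.compile(r'^[A-Z]?\d+\*$')     # stop gain, e.g. R196*
fs_pattern = re.compile(r'fs')                       # frameshift, e.g. P98fs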
Example #7
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = join(fpath + '.gz')
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip and not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
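A hedged usage sketch: compress a position-sorted VCF and index it for region queries (assumes htslib's bgzip and tabix on PATH; the file name is hypothetical):

gz = bgzip_and_tabix('variants.vcf', tabix_parameters='-p vcf')
# -> 'variants.vcf.gz', with the index at 'variants.vcf.gz.tbi'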
Example #8
def run_prank(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not all(projects):
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    cmdl = prank_bin + ' -d=' + merged_fasta_fpath + ' -o=' + prank_out + ' -showtree'
    log.debug('Starting prank ' + cmdl)
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE,
                            universal_newlines=True)  # text mode, so readline() returns str and '' at EOF
    # lines = []
    # prev_time = time.time()
    for stdout_line in iter(proc.stdout.readline, ''):
        print(stdout_line.rstrip())
        # lines.append(stdout_line)
        cur_time = time.time()
        # if cur_time - prev_time > 2:
        emit('running',
            json.dumps({
                'finished': False,
                'lines': [stdout_line.rstrip()],
            })
        )
        # lines = []
    emit('running',
        json.dumps({
            'finished': True,
            'lines': [],
        })
    )
Example #9
def send_file_for_igv(fpath):
    # handle igv.js Range header which it uses to request a subset of a BAM file:
    range_header = request.headers.get('Range', None)
    if not range_header:
        return send_file(fpath)

    m = re.search(r'(\d+)-(\d*)', range_header)
    if not m:
        error_msg = "ERROR: unexpected range header syntax: %s" % range_header
        log.err(error_msg)
        return error_msg

    size = os.path.getsize(fpath)
    offset = int(m.group(1))
    length = int(m.group(2) or size) - offset

    with open(fpath, 'rb') as f:
        f.seek(offset)
        data = f.read(length)

    rv = Response(data,
                  206,
                  mimetype="application/octet-stream",
                  direct_passthrough=True)
    rv.headers.add(
        'Content-Range', 'bytes {0}-{1}/{2}'.format(offset,
                                                    offset + length - 1, size))

    log.info("GET range request: %s-%s %s" % (m.group(1), m.group(2), fpath))
    return rv
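A hedged sketch of how such a handler might be wired to a Flask route (the route path and BAMS_DIR are hypothetical; `request` and `Response` come from flask, as in the snippet):

import os
from flask import Flask

app = Flask(__name__)
BAMS_DIR = '/data/bams'  # hypothetical directory served to igv.js

@app.route('/bams/<path:fname>')
def bam_for_igv(fname):
    # igv.js requests BAM slices via Range headers; the handler above serves them
    return send_file_for_igv(os.path.join(BAMS_DIR, fname))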
Example #10
def file_exists_check(output_fpath=None, input_fpaths=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err(f'Did not find output file {output_fpath}')
    return ok
Example #11
def file_nonempty_check(output_fpath=None, input_fpaths=None):
    if output_fpath is None:
        return True
    ok = verify_file(output_fpath)
    if not ok:
        err(f'Did not find non-empty output file {output_fpath}')
    return ok
Example #13
    def offset_to_genome_coord(trx, offset):
        genomic_coord = None
        is_in_intron = None

        length = len(trx)
        offset = offset if trx.strand == '+' else length - offset
        if offset == 0 or offset == length:
            return -1, None

        assert 0 < offset < length, f'Coordinate {offset} must be above 0 and below transcript length {length}, ' \
                                    f'transcript: {trx}'
        if not trx.exons:
            logger.err(f'  No exons for transcript {trx.id}')
            return None, None

        offset_remain = offset
        # print('  looking for coord', coord, f', in {len(transcript.exons)} exons, total length {length}')
        exons = trx.exons
        if trx.strand == '-':
            exons = reversed(exons)
        for exon in exons:
            assert offset_remain > 0
            # print('    exon len=', len(exon))
            # print('    offset_remain=', offset_remain)
            next_offset_remain = offset_remain - len(exon)
            if next_offset_remain <= 0:
                # print('    returning exon.start + offset_remain = ', exon.start + offset_remain)
                genomic_coord = exon.start - 1 + offset_remain  # -1 to convert from 1-based to 0-based
                is_in_intron = next_offset_remain == 0
                break
            offset_remain = next_offset_remain
        assert genomic_coord is not None  # correct code should always produce something
        return genomic_coord, is_in_intron
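A self-contained sketch with stand-in classes, calling the excerpt as a free function (the real transcript/exon classes are not shown; attribute names and 1-based inclusive exon coordinates are inferred from usage):

class StubExon:
    def __init__(self, start, end):
        self.start, self.end = start, end
    def __len__(self):
        return self.end - self.start + 1  # assumption: inclusive coordinates

class StubTrx:
    def __init__(self, exons, strand='+'):
        self.id, self.strand, self.exons = 'TX1', strand, exons
    def __len__(self):
        return sum(len(e) for e in self.exons)

trx = StubTrx([StubExon(100, 150), StubExon(200, 260)])
print(offset_to_genome_coord(trx, 60))  # -> (208, False): 9 bases into the second exon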
Example #14
def detect_bcbio_dir(input_dir, silent=False):
    """
    :param input_dir: `config` dir, or `final` dir, or datestamp dir, or the directory root to `final`
    :return: (config_dir, final_dir, date_dir)
    """
    config_dir, final_dir, date_dir = None, None, None

    input_dir = abspath(input_dir)

    # We are inside `*final*`
    if 'final' in basename(input_dir):  # allow prefixes and postfixes
        final_dir = input_dir
        root_dir = dirname(final_dir)
        config_dir = join(root_dir, 'config')
        if not isdir(config_dir):
            err(f'Are you running on a bcbio output?\n'
                f'The input folder appears to be `final` ({input_dir}), '
                f'but a `config` directory can\'t be found at the same level ({config_dir})')
            raise NoConfigDirException('No config dir')

    # We are inside `config`
    elif basename(input_dir) == 'config':
        config_dir = input_dir

    # We are in a parent dir to `config` (and possibly `final`, called otherwise)
    elif isdir(join(input_dir, 'config')):
        config_dir = join(input_dir, 'config')

    # We are inside a date dir
    elif isdir(abspath(join(input_dir, pardir, pardir, 'config'))):
        final_dir = abspath(join(input_dir, pardir))
        root_dir = abspath(join(input_dir, pardir, pardir))
        config_dir = abspath(join(root_dir, 'config'))

        # if 'final' not in basename(final_dir):
        #     err(f'Are you running on a bcbio output?\n'
        #         f'Found config directory 2 level up at {config_dir}, assuming your input {input_dir} '
        #         f'is a datestamp directory. However, the parent directory is not called `*final*`')
        #     raise NoConfigDirException('No final dir')

    else:
        if not silent:
            err(f'Are you running on a bcbio output?\n'
                f'{input_dir} is not `config` or `*final*`, and '
                f'a `config` directory can\'t be found at {join(input_dir, "config")} '
                f'or {abspath(join(input_dir, pardir, "config"))}. '
                f'Make sure that you changed to a bcbio root or final directory, '
                f'or provided it as the first argument.')
        raise NoConfigDirException('No config dir')

    if not silent:
        info('Bcbio config directory: ' + config_dir)
        if final_dir:
            info('"final" directory: ' + final_dir)
            if date_dir:
                info('"datestamp" directory: ' + date_dir)

    return config_dir, final_dir, date_dir
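Hedged usage sketch (NoConfigDirException comes from the same module; the path is hypothetical):

import sys

try:
    config_dir, final_dir, date_dir = detect_bcbio_dir('/projects/bcbio_run/final')
except NoConfigDirException:
    sys.exit('Not a bcbio output directory')
else:
    print('Using config at ' + config_dir)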
Example #15
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    # return len(next(dropwhile(lambda x: x.strip().startswith('#'), open(bed_fpath))).split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
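For example, a standard 4-column BED line (`chr1<TAB>100<TAB>200<TAB>target1`) yields 4, while a file with only comment lines logs the error and yields None (the path is hypothetical):

ncols = count_bed_cols('targets.bed')
if ncols is not None and ncols < 4:
    print('BED file has no name column')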
Example #18
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = _load_yaml(fpath)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #19
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        with open(fpath) as f:
            dic = load_yaml(f)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #20
 def find_in_log(self, fname, is_critical=False, silent=True):
     options = [join(self.log_dir, fname),
                join(self.date_dir, fname)]
     for fpath in options:
         if isfile(fpath):
             return fpath
     if is_critical:
         critical('Log file not found as ' + ', '.join(options))
     elif not silent:
         err('Log file not found as ' + ', '.join(options))
Example #22
def add_user_call(run_id, prev_sample_id, edit_sample_id, snp_index):
    sample = Sample.query.filter_by(id=edit_sample_id).first()
    if not sample:
        log.err('Sample not found')
        return redirect('/' + run_id + '/' + prev_sample_id)

    fingerprint = sample.fingerprints.filter_by(index=snp_index).first()
    fingerprint.usercall = request.form['usercall']
    db.session.commit()
    return redirect('/' + run_id + '/' + prev_sample_id)
Example #23
def compare(fp1, fp2):
    try:
        res = scipy.stats.spearmanr(fp1.flatten(), fp2.flatten())
    except ValueError as e:
        log.err(str(e))
        log.err('Error calculating correlation between fingerprints, '
                'likely too small a number of mutations. Try increasing the target '
                'size or relaxing the filtering criteria, or decreasing L.')
        return None, None
    else:
        return res.correlation, res.pvalue
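A hedged sketch of the expected inputs: fingerprints as numeric arrays that scipy rank-correlates after flattening (numpy is assumed alongside the snippet's scipy; the values are hypothetical genotype codes):

import numpy as np

fp1 = np.array([[0, 1, 2], [2, 1, 0]])
fp2 = np.array([[0, 1, 2], [2, 0, 1]])
corr, pval = compare(fp1, fp2)  # Spearman rho and p-value, or (None, None) on failure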
Example #24
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
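Hedged usage (requires bedtools and gzip on PATH; the file name is hypothetical):

bed_gz = bam_to_bed_nocnf('sample.bam')  # -> 'sample.bed.gz', or None if conversion failed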
Example #25
 def run(self, fn, param_lists):
     if self.n_samples == 0:
         return []
     assert self.n_samples == len(param_lists)
     n_params = len(param_lists[0])
     for sample_i, params in enumerate(param_lists):
         if params is None:
             err('Parameter list for sample ' + str(sample_i) + ' is None')
         if len(params) != n_params:
             err('Parameter list for sample ' + str(sample_i) + ' (' + str(len(params)) +
                 ') does not match the length of the first one (' + str(n_params) + ')')
     res = self._view.view.map(fn, *([params[param_i] for params in param_lists] for param_i in range(n_params)))
     return res
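A sketch of the expected param_lists shape: one list per sample, all the same length. The map call above transposes them, so fn receives one positional argument per parameter (`view` and `process_sample` are hypothetical):

param_lists = [
    ['A.bam', 'out/A'],  # sample 0: (bam_path, out_dir)
    ['B.bam', 'out/B'],  # sample 1
]
results = view.run(process_sample, param_lists)  # process_sample(bam_path, out_dir) per sample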
Example #26
 def get_metric(self, names):
     if isinstance(names, str):
         names = [names]
     if not self.sample_info or not self.sample_info.get('metrics'):
         return None
     metrics = self.sample_info['metrics']
     val = None
     for k in metrics:
         if k.lower() in [n.lower() for n in names] and metrics[k] != 'NA':
             val = metrics[k]
     if val is None:
         err('Cannot find ' + ', '.join(names) + ' in metrics for ' + self.name)
     return val
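For illustration: lookup is case-insensitive and 'NA' counts as missing. A hedged sketch with a minimal stand-in class (assumes the excerpt's function is available at module level so it can be attached as a method, and `err` is a stderr logger):

def err(msg):  # stand-in logger
    print(msg)

class Sample:  # minimal carrier for the method above
    def __init__(self, name, sample_info):
        self.name, self.sample_info = name, sample_info

Sample.get_metric = get_metric  # attach the excerpt's function as a method

s = Sample('T1', {'metrics': {'Avg_coverage': 120, 'Usable_pct': 'NA'}})
print(s.get_metric(['avg_coverage']))  # -> 120 ('NA' values are treated as missing)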
Example #29
def server_error(error):
    log.err('Error: ' + str(error))
    log.err(traceback.format_exc())

    lines = []
    for l in traceback.format_exc().split('\n'):
        if l.strip():
            lines.append(l.replace('    ', '&nbsp;' * 4))

    return render_template(
        'error.html',
        title='Internal Server Error',
        error='Error: ' + str(error),
        traceback=lines), \
        500
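Such a handler is typically registered for HTTP 500 on the Flask app object (`app` is assumed, not shown in the excerpt):

app.register_error_handler(500, server_error)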
Example #30
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect pair suffix (_1/_2/_R1/_R2) for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
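Hedged usage sketch: pairing is driven by the _1/_2 and _R1/_R2 suffixes, Illumina `_S<n>` chunks are stripped, and dashes are normalized to underscores (file names are hypothetical):

pairs = find_fastq_pairs([
    'Tumor-1_S3_R1.fastq.gz',
    'Tumor-1_S3_R2.fastq.gz',
])
# -> {'Tumor_1': ('Tumor-1_S3_R1.fastq.gz', 'Tumor-1_S3_R2.fastq.gz')}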
Example #33
def setup_tibanna(tibanna_id=None, buckets=None):
    try:
        subprocess.check_call('tibanna --version', shell=True)
    except subprocess.CalledProcessError:
        logger.err('Error: tibanna is not installed. Please run `pip install tibanna`')
        sys.exit(1)

    if not tibanna_id:
        tibanna_id = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) 
                             for _ in range(8))
        assert not check_tibanna_id_exists(tibanna_id), 'Random tibanna ID already exists: ' + tibanna_id

    step_func_name = f'tibanna_unicorn_{tibanna_id}'
    if not check_tibanna_id_exists(tibanna_id):
        buckets_str = '' if not buckets else ('-b ' + ','.join(buckets))
        run_simple(f'tibanna deploy_unicorn -g {step_func_name} {buckets_str} --no-setenv')

    return step_func_name
Example #34
def file_reasonable_size(output_fpath, input_fpath):
    ok = file_exists_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpath.strip().startswith("<("):
        return True
    if input_fpath.endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpath) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err("Output file unexpectedly small. %.1fGb for output versus "
            "%.1fGb for the input file. This often indicates a truncated "
            "BAM file or memory errors during the run." % (out_size, orig_size))
        return False
    else:
        return True
Example #35
def file_reasonable_size(output_fpath, input_fpaths):
    ok = file_nonempty_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpaths[0].strip().startswith("<("):
        return True
    if input_fpaths[0].endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpaths[0]) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err(f'Output file unexpectedly small. {out_size:.1f}Gb for output versus '
            f'{orig_size:.1f}Gb for the input file. This often indicates a truncated '
            'BAM file or memory errors during the run.')
        return False
    else:
        return True
Example #36
def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not all(projects):
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = os.path.join(prank_out + '.best.dnd')
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for i, p in enumerate(projects)],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for i, p in enumerate(projects)],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
Example #37
    def _get_approved_genes_by_kind(approved_genes, kind):
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' +
                    db_id + ': ' + ', '.join(g.name for g in approved_genes_same_ucsc), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err(' found approved gene for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' + db_id,
                        print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with same ucsc id, or not the same chromosome for them.

            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with chrom ' +
                    db_chrom + ': ' + ', '.join(g.name for g in approved_genes_same_chrom), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info(' only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind + ') has the same chrom '
                    + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err(' ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind + ') with same chrom '
                    + db_chrom, print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info(' found approved gene symbol for ' + gene_symbol + ': ' + approved_genes[0].name + ' (as '
                    + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'
Example #38
def _proc_ensembl_gtf(inp, out, chr_order, additional_feature_list=None):
    if additional_feature_list is None:
        additional_feature_list = []

    info('additional_feature_list = ' + str(additional_feature_list))

    gene_by_name = OrderedDict()
    gene_by_id = OrderedDict()

    info('Parsing Ensembl input...')
    total_lines = 0
    total_non_coding_genes = 0

    for l in inp:
        if l and not l.startswith('#'):
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')

            # if is_local():
            #     if chrom != '21':
            #         continue

            total_lines += 1
            if total_lines % 1000 == 0:
                info(str(total_lines // 1000) + 'k lines, ' + str(len(gene_by_name)) + ' genes found')
                sys.stdout.flush()

            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # _prop_dict would be unset below; skip this record

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            gene_biotype = _rm_quotes(_prop_dict['gene_biotype'])
            gene_source = _rm_quotes(_prop_dict['gene_source'])

            # if gene_symbol == 'PTENP1':
            #     sys.stderr.write('PTENP1\n')

            if not ALL_EXONS and gene_biotype not in [
                'protein_coding',
                'nonsense_mediated_decay',
                'non_stop_decay',
                'processed_transcript',
                'polymorphic_pseudogene',
                'sense_intronic',
                'sense_overlapping',
                'antisense',

            ] and not any(b in gene_biotype for b in ['RNA', 'IG_', 'TR_']):
                total_non_coding_genes += 1
                continue

            full_feature_list = ['gene', 'CDS', 'stop_codon', 'exon'] + additional_feature_list
            if ALL_EXONS:
                full_feature_list = ['gene', 'exon']
            # sys.stderr.write('Full feature list: ' + str(full_feature_list) + '\n')
            if feature not in full_feature_list:
                continue

            start, end = int(start) - 1, int(end)

            if end <= start:
                info('Error: start >= end: ' + l)
                continue

            chrom = parse_ensembl_chrom(chrom)
            if not chrom:
                continue

            if feature == 'gene':
                # assert gene_biotype == biotype, 'Gene: gene_biotype "' + gene_biotype + '"
                # do not match biotype "' + biotype + '" for ' + gene_symbol

                gene = Gene(chrom, chr_order.get(chrom), start, end, gene_symbol, strand,
                            gene_biotype, gene_id, gene_source)

                if gene.name in gene_by_name:
                    prev_gene = gene_by_name[gene.name]

                    if gene.source != prev_gene.source:
                        err('    Duplicated gene in different databases:')
                        err('        This: ' + gene.__repr__())
                        err('        Prev: ' + prev_gene.__repr__())
                        # answer = raw_input('Which one to pick? This (1), prev (2), longest (Enter): ')
                        #
                        # if answer == '1' or answer == '' and gene.end - gene.start >
                        # prev_gene.end - prev_gene.start:
                        #     del gene_by_name[prev_gene.name]
                        #     del gene_by_id[prev_gene.db_id]
                        #
                        # else:
                        #     continue

                        if gene.source == 'ensembl' or prev_gene.source == 'havana':
                            del gene_by_name[prev_gene.name]
                            del gene_by_id[prev_gene.db_id]
                            err('        Picking up this one.')

                        if prev_gene.source == 'ensembl' or gene.source == 'havana':
                            err('        Picking up previous one.')
                            continue

                    else:
                        err('    Duplicated gene in ' + gene.source + ':')
                        err('        ' + gene.__repr__())
                        prev_gene.start = min(prev_gene.start, gene.start)
                        prev_gene.end = max(prev_gene.end, gene.end)
                        prev_gene.feature = 'Multi_Gene'
                        continue

                    err('')

                gene_by_name[gene_symbol] = gene
                gene_by_id[gene_id] = gene

            elif feature in ['CDS', 'stop_codon'] \
                    or feature == 'exon' and ('RNA' in gene_biotype or ALL_EXONS) \
                    or feature in additional_feature_list:
                assert gene_symbol in gene_by_name, 'Error: ' + feature + ' record before gene record ' + \
                        gene_symbol + ', ' + gene_id + '; gene_by_name: ' + str(gene_by_name.keys())
                gene = gene_by_name[gene_symbol]
                if gene.gene_id == gene_id:
                    assert gene_biotype == gene.biotype, feature + ': gene_biotype "' + gene_biotype + \
                         '" do not match biotype "' + gene.biotype + '" for ' + gene_symbol
                    exon = Exon(gene, start, end, gene_biotype, feature)
                    gene.exons.append(exon)

    info()
    info(
        'Processed ' +
        str(total_lines) + ' lines, ' +
        str(total_non_coding_genes) + ' non-coding genes skipped, ' +
        str(len(gene_by_name)) + ' coding genes found')
    info()
    return gene_by_name
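A mini-demo of the attribute parsing inside the loop above, on a typical Ensembl GTF attribute string:

props_line = 'gene_id "ENSG00000141510"; gene_name "TP53"; gene_biotype "protein_coding";'
_prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                  for t in props_line.split(';') if t.strip())
print(_prop_dict['gene_name'])  # -> '"TP53"'; _rm_quotes() then strips the quotes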
Example #39
def _send_line(ws, line, error=False):
    if error:
        log.err(line.rstrip())
    else:
        log.debug(line.rstrip())
    ws.send(json.dumps({'line': line.rstrip(), 'error': error}))
Example #40
def choose_canonical(genes, canonical_transcripts_ids):
    not_found_in_canon_coding_num = 0
    not_found_in_canon_coding_num_one_transcript = 0
    not_found_in_canon_rna_num = 0
    not_found_in_canon_other_num = 0
    many_canon_coding_num = 0
    many_canon_rna_num = 0
    many_canon_other_num = 0

    canon_genes = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if t.transcript_id in canonical_transcripts_ids:
                t.is_canonical = True
                _canon_tx.append(t)

        if len(_canon_tx) > 1:
            if any(t.coding for t in g.transcripts):
                many_canon_coding_num += 1
                # Checking overlapping
                for i, t1 in enumerate(_canon_tx):
                    for j in range(i + 1, len(_canon_tx)):
                        t2 = _canon_tx[j]
                        if t1.start <= t2.start < t1.end or t1.start <= t2.end < t1.end:
                            err('Transcripts ' + t1.transcript_id + ' (' + str(t1.start) + ':' + str(t1.end) + ') and ' +
                                                 t2.transcript_id + ' (' + str(t2.start) + ':' + str(t2.end) + ') ' +
                                ' in gene ' + g.name + ' ' + g.chrom + ' overlap')
            elif any(not t.coding for t in g.transcripts):
                many_canon_rna_num += 1
            else:
                many_canon_other_num += 1

        if len(_canon_tx) == 0:
            if any(t.coding for t in g.transcripts):
                not_found_in_canon_coding_num += 1
                if len(g.transcripts) == 1:
                    not_found_in_canon_coding_num_one_transcript += 1
                # longest_t = max(g.transcripts, key=Transcript.length)
                # longest_t.is_canonical = True
            elif any(not t.coding for t in g.transcripts):
                not_found_in_canon_rna_num += 1
            else:
                not_found_in_canon_other_num += 1

        g.canonical_transcripts = [t for t in g.transcripts if t.is_canonical]
        if g.canonical_transcripts:
            canon_genes.append(g)

    info('Coding genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(t.coding for t in g.canonical_transcripts))))
    info('Coding canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if t.coding)))
    info('RNA genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(not t.coding for t in g.canonical_transcripts))))
    info('RNA canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if not t.coding)))

    info()
    info('Coding genes with no canonical transcripts (picking longest out of the rest): ' + str(not_found_in_canon_coding_num))
    info('RNA genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_rna_num))
    info('Other genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_other_num))
    info('Coding genes with many canonical transcripts (picking longest): ' + str(many_canon_coding_num))
    info('RNA genes with many canonical transcripts (keeping all): ' + str(many_canon_rna_num))
    info('Other genes with many canonical transcripts (keeping all): ' + str(many_canon_other_num))

    return canon_genes
Example #41
def render_closest_comparison_page(project_names_line,
                                   sample_id,
                                   selected_idx=None,
                                   rerun_if_usercall=True):
    run = Run.find_by_project_names_line(project_names_line)
    if not Run.is_ready(run) or (run.rerun_on_usercall and rerun_if_usercall):
        return run_processing(project_names_line,
                              redirect_to=url_for(
                                  'closest_comparison_page',
                                  project_names_line=project_names_line,
                                  sample_id=sample_id))

    run = Run.find_by_project_names_line(project_names_line)
    if not run:
        log.err('Run ' + str(project_names_line) + ' not found')
        abort(
            404, {
                'message':
                'Phylogenetic comparison for ' + str(project_names_line) +
                ' is not found'
            })
    sample = Sample.query.get(sample_id)
    if not sample:
        log.err('Sample ' + sample_id + ' not found in ' +
                str(project_names_line))
        abort(
            404, {
                'message':
                'Sample ' + sample_id + ' not found in ' +
                str(project_names_line)
            })
    matching_sample = _find_closest_match(sample, run)
    if not matching_sample:
        log.err('No matching sample for ' + sample.long_name())
        abort(404, {'message': 'No matching sample for ' + sample.long_name()})
    snps_dict = defaultdict(int)
    snp_tables = []
    snp_records = []
    snps_a_by_rsid = sample.snps_from_run(run)
    snps_b_by_rsid = matching_sample.snps_from_run(run)
    ngb_link_tmpl, ngb_link = None, None
    # if is_us() or is_uk():
    #     ngb_link_tmpl = get_ngb_link_template(
    #         run.work_dir_path(), sample.name, sample.project.genome, sample.project.name,
    #         sample.project.bed_fpath, matching_sample.name, matching_sample.bam)
    for i, l in enumerate(run.locations):
        snp_a = snps_a_by_rsid[l.rsid]
        snp_b = snps_b_by_rsid[l.rsid]
        # ngb_link = get_ngb_link(run.work_dir_path(), ngb_link_tmpl, snp_a.chrom, snp_a.pos) if ngb_link_tmpl else None
        snp_records.append(
            _get_snp_record(snps_dict, snp_a, snp_b, i + 1, ngb_link=ngb_link))
        if (i + 1) % SNPS_IN_ROW == 0:
            snp_tables.append(snp_records)
            snp_records = []
    if snp_records:
        snp_tables.append(snp_records)

    snps_dict['total_score'] = sum(
        (rec['score']) for recs in snp_tables for rec in recs)
    bam_fpath_a = '/%s/bams/%s' % (run.id, sample.long_name() + '.bam')
    bam_fpath_b = '/%s/bams/%s' % (run.id,
                                   matching_sample.long_name() + '.bam')
    snps_bed = '/%s/snps_bed' % project_names_line
    sample_a = {
        'id': sample.id,
        'name': sample.name,
        'project': sample.project.name,
        'bam': bam_fpath_a,
    }
    sample_b = {
        'id': matching_sample.id,
        'name': matching_sample.name,
        'project': matching_sample.project.name,
        'bam': bam_fpath_b,
    }
    t = render_template(
        'sample.html',
        project_names_line=project_names_line,
        genome=sample.project.genome,
        sampleA=sample_a,
        sampleB=sample_b,
        snps_data=snps_dict,
        snp_tables=snp_tables,
        snps_bed=snps_bed,
        selected_idx=selected_idx or "null",
        total_snps=sum([len(snps) for snps in snp_tables]),
        snps_in_row=SNPS_IN_ROW,
    )
    return t
Example #42
    def __init__(self,
                 genome,
                 filt_cnf,
                 tricky_regions_dir,
                 transcripts_fpath,
                 reg_exp_sample=None,
                 platform=None):
        self.all_reject_counter = OrderedDefaultDict(int)
        self.all_counter = OrderedDefaultDict(int)
        self.gene_blacklist_counter = OrderedDefaultDict(int)
        self.region_blacklist_counter = OrderedDefaultDict(int)

        compendia_fpath = verify_file(filt_ref_data.compendia(genome),
                                      'compendia_ms7_hotspot')
        actionable_fpath = verify_file(filt_ref_data.actionable(genome),
                                       'actionable')
        filter_common_snp_fpath = verify_file(filt_ref_data.common_snp(genome),
                                              'filter_common_snp')
        filter_common_arti_fpath = verify_file(
            filt_ref_data.common_art(genome), 'filter_common_artifacts')
        splice_fpath = verify_file(filt_ref_data.splice(genome), 'splice')
        suppressors_fpath = verify_file(filt_ref_data.suppressors(),
                                        'suppressors')
        oncogenes_fpath = verify_file(filt_ref_data.oncogenes(), 'oncogenes')
        ruledir = verify_dir(filt_ref_data.ruledir(), 'ruledir')
        snpeffect_polymorph_fpath = verify_file(
            filt_ref_data.snpeffect_export_polymorphic(),
            'snpeffect_export_polymorphic')
        actionable_hotspot_fpath = verify_file(
            filt_ref_data.actionable_hotspot(), 'actionable_hotspot')
        specific_mutations_fpath = verify_file(
            filt_ref_data.specific_mutations(), 'specific_mutations')
        last_critical_aa_fpath = verify_file(filt_ref_data.last_critical_aa(),
                                             'last_critical_aa')
        incidentalome_dir = verify_dir(filt_ref_data.incidentalome_dir(),
                                       'incidentalome')
        comments_fpath = verify_file(filt_ref_data.ngs_reports_comments(),
                                     'ngs_reports_comments')
        if not all([
                compendia_fpath,
                actionable_fpath,
                filter_common_snp_fpath,
                filter_common_arti_fpath,
                splice_fpath,
                suppressors_fpath,
                oncogenes_fpath,
                ruledir,
                snpeffect_polymorph_fpath,
                actionable_hotspot_fpath,
                specific_mutations_fpath,
                last_critical_aa_fpath,
                incidentalome_dir,
                comments_fpath,
        ]):
            logger.err(
                'Error: some of the required files are not found or empty (see above)'
            )

        self.suppressors = parse_genes_list(adjust_path(suppressors_fpath))
        self.oncogenes = parse_genes_list(adjust_path(oncogenes_fpath))

        self.reg_exp_sample = reg_exp_sample
        self.platform = platform

        transcripts_fpath = verify_file(transcripts_fpath, silent=True)
        if transcripts_fpath:
            logger.info('Using canonical transcripts from ' +
                        transcripts_fpath)
            with open(transcripts_fpath) as f:
                self.transcripts = [tr.strip().split('.')[0] for tr in f]

        self.max_ratio = filt_cnf['max_ratio']
        self.max_sample_cnt = filt_cnf['max_sample_cnt']

        self.min_freq = filt_cnf['min_freq']  # for all variants
        self.act_min_freq = filt_cnf['act_min_freq']
        self.act_min_freq = self.act_min_freq or self.min_freq / 2
        self.germline_min_freq = filt_cnf['germline_min_freq']

        self.filt_depth = filt_cnf['filt_depth']
        self.min_vd = filt_cnf['min_vd']
        self.min_gmaf = filt_cnf['min_gmaf']

        self.keep_utr_intronic = filt_cnf['keep_utr_intronic']
        self.keep_whole_genome = filt_cnf['keep_whole_genome']
        self.keep_hla = filt_cnf['keep_hla']
        self.damage_p_value = filt_cnf.get('damage_p_value')

        logger.info('Parsing filtering data...')
        self.tp53_groups = {
            'Group 1': parse_mut_tp53(join(ruledir, 'DNE.txt')),
            'Group 2': parse_mut_tp53(join(ruledir, 'TA0-25.txt')),
            'Group 3': parse_mut_tp53(join(ruledir, 'TA25-50_SOM_10x.txt'))
        }

        self.splice_positions_by_gene = defaultdict(set)
        for l in iter_lines(splice_fpath):
            pos, g = l.split('\t')
            self.splice_positions_by_gene[g].add(pos)

        self.last_critical_aa_pos_by_gene = dict()
        for l in iter_lines(last_critical_aa_fpath):
            g, aa_pos, _ = l.split('\t')
            self.last_critical_aa_pos_by_gene[g] = int(aa_pos)

        self.filter_snp = set()
        for l in iter_lines(filter_common_snp_fpath):
            fields = l.split('\t')
            self.filter_snp.add('-'.join(fields[1:5]))

        self.snpeff_snp = set()
        self.snpeff_snp_rsids = set()
        for l in iter_lines(snpeffect_polymorph_fpath):
            fields = l.split('\t')
            snpeff_aachg = fields[2]
            snpeff_rsid = fields[5]
            if len(fields) > 11 and fields[11]:
                snpeff_gene = fields[11]
                self.snpeff_snp.add('-'.join([snpeff_gene, snpeff_aachg]))
            elif snpeff_rsid != '-':
                self.snpeff_snp_rsids.add(snpeff_rsid)

        self.filter_artifacts = set()
        self.filter_rules_by_gene = defaultdict(list)
        for l in iter_lines(filter_common_arti_fpath):
            fields = l.split('\t')
            if fields[5] == 'rule':
                gene, chrom, start, end, action, _, _, _, note = fields[:9]
                rule = Rule(gene,
                            chrom=chrom,
                            start=int(start),
                            end=int(end),
                            action=action,
                            note=note)
                self.filter_rules_by_gene[gene].append(rule)
            else:
                gene, chrom, start, ref, alt = fields[:5]
                self.filter_artifacts.add('-'.join([chrom, start, ref, alt]))

        self.actionable_hotspot_by_gene = defaultdict(dict)
        self.common_snps_by_gene = defaultdict(set)
        with open(actionable_hotspot_fpath) as f:
            for l in f:
                l = l.replace('\n', '')
                if not l or l.startswith('##'):
                    continue
                fields = l.split('\t')
                gene = fields[0]
                prot_change = fields[1]
                if gene.startswith('#'):  # VUS, No special treatment for now
                    gene = gene[1:]
                elif gene.startswith('^'):
                    gene = gene[1:]
                    self.common_snps_by_gene[gene].add(prot_change)
                else:
                    is_somatic = fields[2] == 'somatic'
                    self.actionable_hotspot_by_gene[gene][
                        prot_change] = 'somatic' if is_somatic else 'germline'

        self.ngs_reports_comments = defaultdict(dict)
        with open(comments_fpath) as f:
            for r in csv.DictReader(
                (row for row in f if not row.startswith('#')), delimiter='\t'):
                gene = r['Gene']
                prot_change = r['AA_Change']
                if gene.startswith('^'):
                    gene = gene[1:]  # remove leading ^ character, e.g. ^EGFR -> EGFR
                    is_somatic = 'somatic' in r['Note']
                    self.actionable_hotspot_by_gene[gene][
                        prot_change] = 'somatic' if is_somatic else 'germline'
                else:
                    self.ngs_reports_comments[gene][prot_change] = r['Note']

        self.act_somatic = dict()
        self.act_germline = set()
        self.rules = defaultdict(list)
        for l in iter_lines(actionable_fpath):
            fields = l.split('\t')

            if fields[7] == 'germline':
                key = '-'.join(fields[1:5])
                self.act_germline.add(key)

            elif fields[7] == 'somatic':
                change = fields[8].strip()
                if fields[6] == 'rule':
                    if fields[4] == '*' and len(fields[3]) == 1:
                        key = '-'.join(fields[1:4])
                        self.act_somatic[key] = change
                    else:
                        indel_type = ''
                        if 'indel' in fields[5]: indel_type = 'indel'
                        elif 'ins' in fields[5]: indel_type = 'ins'
                        elif 'del' in fields[5]: indel_type = 'del'
                        rule = Rule(gene=fields[0],
                                    chrom=fields[1],
                                    start=int(fields[2]),
                                    end=int(fields[3]),
                                    length=int(fields[4]),
                                    required_inframe='inframe' in fields[5],
                                    indel_type=indel_type,
                                    change=change)
                        self.rules[rule.gene].append(rule)
                    # elif fields[5] == inframe_del:
                    #     self.rules[inframe_del].setdefault(fields[0], []).append([fields[1]] + [int (f) for f in fields[2:5]])
                    # elif fields[5] == inframe_ins:
                    #     self.rules[inframe_ins].setdefault(fields[0], []).append([fields[1]] + [int (f) for f in fields[2:5]])

                else:
                    key = '-'.join(fields[1:5])
                    self.act_somatic[key] = change

        self.hotspot_nucleotides = set()
        self.hotspot_proteins = set()
        for l in iter_lines(compendia_fpath):
            fields = l.split('\t')
            if fields[5].startswith('g.'):
                continue
            self.hotspot_nucleotides.add('-'.join(fields[1:5]))
            if not fields[6]:
                continue
            self.hotspot_proteins.add('-'.join([fields[0], fields[6]]))

        logger.info('Parsing gene blacklists...')
        anno_cfg = get_anno_config()
        self.gene_blacklists_by_reason = parse_gene_blacklists(
            anno_cfg['blacklist']['genes'], incidentalome_dir)
        for r in self.gene_blacklists_by_reason.keys():
            self.gene_blacklist_counter[r] = 0
        self.gene_blacklist_counter['hardfilter'] = 0
        # self.gene_to_soft_filter = list(iter_lines(join(incidentalome_dir, 'soft_filter.txt')))

        # self.region_blacklists_by_reason = dict()
        # if tricky_regions_dir:
        #     info('Parsing region blacklists...')
        #     self.region_blacklists_by_reason = load_tricky_regions(anno_cfg['blacklist']['regions'], tricky_regions_dir)
        #     for r in self.region_blacklists_by_reason.keys():
        #         self.region_blacklist_counter[r] = 0

        logger.info('Parsing actionable rules and specific mutations...')
        self.tier_by_specific_mutations, self.tier_by_type_by_region_by_gene, self.sensitizations_by_gene\
            = parse_specific_mutations(specific_mutations_fpath)

        if not all([
                self.rules, self.splice_positions_by_gene, self.act_somatic,
                self.act_germline, self.actionable_hotspot_by_gene
        ]):
            if not self.rules:
                logger.err('No rules, cannot proceed')
            if not self.splice_positions_by_gene:
                logger.err('No splice_positions_by_gene, cannot proceed')
            if not self.act_somatic:
                logger.err('No act_somatic, cannot proceed')
            if not self.act_germline:
                logger.err('No act_germline, cannot proceed')
            if not self.actionable_hotspot_by_gene:
                logger.err('No actionable_hotspots, cannot proceed')

        self.status = None
        self.reason_by_status = None

        self.output_f = None
        self.fm_output_f = None
        self.rejected_output_f = None
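
A minimal sketch of how the loaded tables could be queried afterwards; `filt` stands for an initialized instance of the class above, the variant values are hypothetical, and the key format mirrors the '-'.join(fields[1:5]) construction used during parsing:

# Hypothetical usage; `filt` is an initialized instance of the class above.
chrom, pos, ref, alt = 'chr7', '140453136', 'A', 'T'
key = '-'.join([chrom, pos, ref, alt])  # same format as '-'.join(fields[1:5])
if key in filt.act_somatic:
    print('actionable somatic change: ' + filt.act_somatic[key])
elif key in filt.act_germline:
    print('actionable germline variant')
elif key in filt.hotspot_nucleotides:
    print('compendia hotspot')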
Example #43
0
def main(output_dir=None,
         normal_bam=None,
         tumor_bam=None,
         snv_vcf=None,
         normal_name=None,
         tumor_name=None,
         sample=None,
         genome=None,
         genomes_dir=None,
         gridss_ref_dir=None,
         ref_fa=None,
         threads=None,
         jvmheap=None):

    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss', 'scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is currently supported for GRIDSS'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise
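
For reference, a hypothetical invocation of this wrapper; every path and value below is a placeholder:

# Hypothetical call; all paths are placeholders.
main(output_dir='gridss',
     normal_bam='data/sample-normal-ready.bam',
     tumor_bam='data/sample-tumor-ready.bam',
     snv_vcf='data/sample-somatic.vcf.gz',
     genome='GRCh37',  # the only build this wrapper accepts
     threads=8,
     jvmheap='28g')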
Example #44
0
def _approve(gene_by_name, synonyms_fpath):
    approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym = \
        read_approved_genes(synonyms_fpath)

    not_approved_gene_names = list()
    gene_after_approving_by_name = OrderedDict()
    total_approved = 0
    total_not_approved = 0
    j = 0
    for g in gene_by_name.values():
        if len(g.exons) == 0:
            continue

        gene_after_approving_by_name[g.name] = g
        if is_approved_symbol(g.name, approved_gene_by_name):
            total_approved += 1
        else:
            not_approved_gene_names.append(g.name)
            total_not_approved += 1

        j += 1
        if j % 1000 == 0:
            info('processed ' + str(j // 1000) + 'k genes...')

    info('-----')
    info('Total: ' + str(j))
    if approved_gene_by_name:
        info('Total approved: ' + str(total_approved))
        info('Total not approved: ' + str(total_not_approved))
    info()
    info('Saving genes...')

    gene_features = 0
    features_counter = defaultdict(int)
    biotypes_counter = defaultdict(int)
    no_exon_gene_num = 0

    filtered_gene_after_approving_by_name = OrderedDict()
    for g in gene_after_approving_by_name.values():
        if len(g.exons) == 0:
            no_exon_gene_num += 1
        else:
            filtered_gene_after_approving_by_name[g.name] = g

            gene_features += 1
            features_counter[g.feature] += 1
            biotypes_counter[g.biotype] += 1

            for e in g.exons:
                features_counter[e.feature] += 1

                if e.feature == 'exon': e.feature = 'Exon'
                elif e.feature == 'stop_codon': e.feature = 'CDS'
                else: e.feature = e.feature[0].upper() + e.feature[1:]

    info('Skipped {} genes with no sub-features.'.format(no_exon_gene_num))
    info('Approved {} genes, including:'.format(gene_features))
    info('    Gene: {}'.format(features_counter['Gene']))
    info('    Multi_Gene: {}'.format(features_counter['Multi_Gene']))
    info('')

    info('Out of total: {} protein coding genes, {} ncRNA genes, including:'.format(
        biotypes_counter['protein_coding'], sum(biotypes_counter.values()) - biotypes_counter['protein_coding']))
    for bt, cnt in biotypes_counter.items():
        if bt != 'protein_coding':
            err('    ' + bt + ': ' + str(cnt))

    info()
    if ALL_EXONS:
        info('Found {} exons.'.format(features_counter['exon']))
    else:
        info('Also found {} CDS, {} stop codons, and {} ncRNA exons.'.format(
            features_counter['CDS'], features_counter['stop_codon'], features_counter['exon']))

    return filtered_gene_after_approving_by_name, not_approved_gene_names
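
A short hypothetical driver for this function; `gene_by_name` would come from an upstream gene-model parser, and the synonyms file path is a placeholder:

# Hypothetical usage; gene_by_name maps symbol -> gene object with .exons.
approved_genes, rejected_names = _approve(gene_by_name, 'HGNC_gene_synonyms.txt')
info('Could not approve {} symbols'.format(len(rejected_names)))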
Example #45
0
def get_approved_gene_symbol(approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym,
                             gene_symbol, db_id='', db_chrom='', indent=''):
    if gene_symbol in approved_gene_by_name:
        if _check_gene_symbol(approved_gene_by_name[gene_symbol], gene_symbol, db_id, db_chrom):
            return approved_gene_by_name[gene_symbol].name, None

    info(indent + 'Gene name ' + gene_symbol + ' is not approved, searching for an approved version... ',
        ending='', print_date=False)

    def _get_approved_genes_by_kind(approved_genes, kind):
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' +
                    db_id + ': ' + ', '.join(g.name for g in approved_genes_same_ucsc), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err(' found approved gene for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' + db_id,
                        print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with same ucsc id, or not the same chromosome for them.

            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with chrom ' +
                    db_chrom + ': ' + ', '.join(g.name for g in approved_genes_same_chrom), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info(' only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind + ') has the same chrom '
                    + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err(' ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind + ') with same chrom ' +
                    db_chrom, print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info(' found approved gene symbol for ' + gene_symbol + ': ' + approved_genes[0].name + ' (as '
                    + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'

    res = _get_approved_genes_by_kind(approved_gnames_by_prev_gname.get(gene_symbol), 'prev')
    if res == 'AMBIGUOUS':
        return None, 'AMBIGUOUS\tAS PREV'
    elif res == 'NOT FOUND':
        res = _get_approved_genes_by_kind(approved_gnames_by_synonym.get(gene_symbol), 'synonym')
        if res == 'AMBIGUOUS':
            return None, res + '\tAS SYNONYM'
        if res == 'NOT FOUND':
            err(' not found.', print_date=False)
            return None, res
        else:
            info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as synonym): ' + res, print_date=False)
            return res, None
    else:
        info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as prev): ' + res, print_date=False)
        return res, None
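
A sketch of the lookup order this function implements (exact symbol first, then previous names, then synonyms); the three dicts come from read_approved_genes(), and the symbol and ids below are hypothetical:

# Hypothetical lookup; 'MLL2' is an outdated symbol that may resolve
# through the previous-name table.
name, status = get_approved_gene_symbol(
    approved_gene_by_name, approved_gnames_by_prev_gname,
    approved_gnames_by_synonym, 'MLL2',
    db_id='uc001rwz.4', db_chrom='chr12')
if name is None:
    err('Failed to resolve MLL2: ' + str(status))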