Example #1
    def _proc_file(inp_f, out_f, ctx=None):
        max_bunch_size = 1000 * 1000
        written_lines = 0
        bunch = []

        for i, line in enumerate(inp_f):
            clean_line = line.replace('\n', '')
            if clean_line:
                if ctx:
                    new_l = proc_line_fun(clean_line, i, ctx)
                else:
                    new_l = proc_line_fun(clean_line, i)
                if new_l is not None:
                    bunch.append(new_l + '\n')
                    written_lines += 1
            else:
                bunch.append(line)
                written_lines += 1

            if len(bunch) >= max_bunch_size:
                out_f.writelines(bunch)
                debug('Written lines: ' + str(written_lines))
                bunch = []

        out_f.writelines(bunch)
        debug('Written lines: ' + str(written_lines))
Example #2
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
Example #3
def run_prank(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    safe_mkdir(work_dirpath)
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    cmdl = prank_bin + ' -d=' + merged_fasta_fpath + ' -o=' + prank_out + ' -showtree'
    log.debug('Starting prank ' + cmdl)
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # lines = []
    # prev_time = time.time()
    for stdout_line in iter(proc.stdout.readline, ''):
        print(stdout_line.rstrip())
        # lines.append(stdout_line)
        cur_time = time.time()
        # if cur_time - prev_time > 2:
        emit('running',
            json.dumps({
                'finished': False,
                'lines': [stdout_line.rstrip()],
            })
        )
        # lines = []
    emit('running',
        json.dumps({
            'finished': True,
            'lines': [],
        })
    )
Example #4
def requanitify_pizzly(pizzly_ref_fa, fusions_fasta, work_dir, fastq):
    """ Returns dict fusion-fasta-id -> {length  eff_length  est_counts   tpm}
    """
    trx_with_fusions = join(work_dir, 'transcripts_with_fusions.fasta.gz')
    kidx = join(work_dir, 'transcripts_with_fusions.kidx')

    if not isfile(trx_with_fusions):
        run_simple(
            f"cat {pizzly_ref_fa} {fusions_fasta} | gzip -c > {trx_with_fusions}"
        )

    if not isfile(kidx):
        run_simple(f"kallisto index -k31 -i {kidx} {trx_with_fusions}")

    abundance = join(work_dir, 'abundance.tsv')
    if not isfile(abundance):
        run_simple(f"kallisto quant -i {kidx} -o {work_dir} {' '.join(fastq)}")

    logger.debug(f'Reading expression from {abundance}')
    expr_by_fusion = dict()
    with open(abundance) as f:
        header = f.readline().strip().split('\t')
        for row in csv.DictReader(f, delimiter='\t', fieldnames=header):
            expr_by_fusion[row['target_id']] = row
    return expr_by_fusion
Example #5
def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: .fai or .fa is accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))

    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths
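A minimal usage sketch (hypothetical; the .fai path below is illustrative and not part of the source):

# Hypothetical call: read chromosome lengths from a FASTA index
chrom_lengths = get_chrom_lengths(fai_fpath='/refdata/hg19/hg19.fa.fai')
for chrom, length in chrom_lengths[:3]:
    print(chrom, length)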
Example #6
def get_all_features(genome, high_confidence=False, features=None, gene_names=None, only_canonical=False):
    _canon_filt = get_only_canonical_filter(genome) if only_canonical else None

    ori_genome = genome
    genome = genome.replace('GRCh37', 'hg19')
    genome = genome.replace('GRCh38', 'hg38')

    bed = _get_ensembl_file('ensembl.bed', genome)
    def _filter(x):
        if high_confidence:
            if x[BedCols.HUGO] in ['', '.', None]:
                return False
        if features:
            if x[BedCols.FEATURE] not in features:
                return False
        if gene_names:
            if x[BedCols.GENE] not in gene_names:
                return False
        if _canon_filt:
            if not _canon_filt(x):
                return False
        return True
    debug('Filtering BEDTool for: HUGO annotation, specific features, specific genes, canonical')
    bed = bed.filter(_filter)
    if ori_genome.startswith('GRCh'):
        def fix_chr(r):
            r.chrom = r.chrom.replace('chrM', 'MT').replace('chr', '')
            return r
        bed = bed.each(fix_chr)
    return bed
Example #7
def extract_features(output_file, genome, only_canonical, high_confidence, coding_only,
                     feature_types):
    """ For debug purposes
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    features_bed.saveas(output_file)
    debug(f'Saved features to {output_file}')
Example #8
def convert_file(work_dir, input_fpath, convert_file_fn, suffix=None, output_fpath=None,
                 check_result=True, overwrite=False, reuse=True, ctx=None):
    assert output_fpath or suffix, str(output_fpath) + ' ' + str(suffix)
    output_fpath = output_fpath or intermediate_fname(work_dir, input_fpath, suf=suffix)
    if output_fpath.endswith('.gz'):
        debug('output_fpath is .gz, but writing to uncompressed.')
        output_fpath = splitext(output_fpath)[0]
    
    if not overwrite:
        if can_reuse(output_fpath, cmp_f=input_fpath):
            debug('Reusing ' + output_fpath)
            return output_fpath
        if can_reuse(output_fpath + '.gz', cmp_f=input_fpath):
            debug('Reusing ' + output_fpath + '.gz')
            return output_fpath
    
    if islink(output_fpath):
        os.unlink(output_fpath)

    debug('Writing to ' + output_fpath)
    with file_transaction(work_dir, output_fpath) as tx_fpath:
        with open_gzipsafe(input_fpath) as inp_f, open(tx_fpath, 'w') as out_f:
            if ctx:
                convert_file_fn(inp_f, out_f, ctx)
            else:
                convert_file_fn(inp_f, out_f)

    if suffix or output_fpath:
        debug('Saved to ' + output_fpath)

    verify_file(output_fpath, is_critical=check_result)
    return output_fpath
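Note that the _proc_file helper in Example #1 matches the convert_file_fn callback signature expected here. A minimal caller sketch (hypothetical; strip_comments and the file names are illustrative only, not part of the source module):

# Hypothetical callback taking (inp_f, out_f), as convert_file expects
def strip_comments(inp_f, out_f):
    for line in inp_f:
        if not line.startswith('#'):
            out_f.write(line)

out_fpath = convert_file(work_dir, 'regions.bed', strip_comments, suffix='nocomments')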
Example #9
def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))
    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))

        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
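A sketch of the caching pattern this enables (hypothetical; preprocess_bed is illustrative only):

# Re-run the expensive step only when the input md5 changed
if not check_md5(work_dir, bed_fpath, 'bed'):
    preprocess_bed(bed_fpath, work_dir)
else:
    debug('BED unchanged since last run, reusing previous results')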
Example #10
def run_analysis_socket_handler(project_names_line):
    log.debug('Received request to start analysis for ' + project_names_line)
    ws = request.environ.get('wsgi.websocket', None)
    if not ws:
        raise RuntimeError('Environment lacks WSGI WebSocket support')

    def _run_cmd(cmdl):
        log.debug(cmdl)
        proc = subprocess.Popen(cmdl.split(),
                                stderr=subprocess.STDOUT,
                                stdout=subprocess.PIPE,
                                env=os.environ)
        for stdout_line in iter(proc.stdout.readline, None):
            if not stdout_line:
                break
            if not six.PY2:
                stdout_line = stdout_line.decode()
            if '#(' not in stdout_line.strip():
                _send_line(ws, stdout_line)
        log.debug('Exit from the subprocess')

    manage_py = abspath(join(dirname(__file__), '..', 'manage.py'))
    _run_cmd(sys.executable + ' ' + manage_py + ' analyse_projects ' +
             project_names_line)
    run = Run.find_by_project_names_line(project_names_line)
    if not run:
        _send_line(ws,
                   'Run for projects ' +
                   project_names_line +
                   ' cannot be found. Has genotyping failed?',
                   error=True)

    ws.send(json.dumps({'finished': True}))
    return ''
Example #12
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()),
            output_fpath=tx)

    return output_bed_fpath
Example #14
def set_up_log(log_dir, log_fname):
    log_fpath = join(log_dir, log_fname)
    logger.set_log_path(log_fpath, save_previous=True)

    debug('Logging to ' + log_fpath)
    debug()
    return log_fpath
Example #18
def get_parallel_view(n_samples, parallel_cfg):
    if parallel_cfg.scheduler and parallel_cfg.threads > 1:
        debug('Starting' + (' test' if not is_cluster() else '') + ' cluster (scheduler: ' + parallel_cfg.scheduler + ', queue: ' + parallel_cfg.queue + ') '
              'using ' + str(parallel_cfg.num_jobs(n_samples)) + ' nodes, ' + str(parallel_cfg.cores_per_job(n_samples)) + ' threads per each sample')
        return ClusterView(n_samples, parallel_cfg)
    else:
        debug('Running locally using ' + str(parallel_cfg.num_jobs(n_samples)) + ' thread(s)')
        return ThreadedView(n_samples, parallel_cfg)
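A usage sketch pairing this with the view.run() method shown in Example #50 (hypothetical; process_sample and the stop() cleanup call are assumptions, not confirmed API):

# Hypothetical driver: fan samples out over the parallel view
view = get_parallel_view(len(samples), parallel_cfg)
try:
    results = view.run(process_sample, [[s, work_dir] for s in samples])
finally:
    view.stop()  # assumed cleanup method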
Example #19
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Example #20
def get_merged_cds(genome):
    """
    Returns all CDS merged, used:
    - for TargQC general reports CDS coverage statistics for WGS
    - for Seq2C CNV calling when no capture BED available
    """
    bed = get_all_features(genome)
    debug('Filtering BEDTool for high confidence CDS and stop codons')
    return bed\
        .filter(lambda r: r.fields[BedCols.FEATURE] in ['CDS', 'stop_codon'])\
        .filter(high_confidence_filter)\
        .merge()
Example #21
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath
Example #22
def get_parallel_view(n_samples, parallel_cfg):
    if parallel_cfg.scheduler and parallel_cfg.threads > 1:
        debug('Starting' + (' test' if not is_cluster() else '') +
              ' cluster (scheduler: ' + parallel_cfg.scheduler + ', queue: ' +
              parallel_cfg.queue + ') '
              'using ' + str(parallel_cfg.num_jobs(n_samples)) + ' nodes, ' +
              str(parallel_cfg.cores_per_job(n_samples)) +
              ' threads per each sample')
        return ClusterView(n_samples, parallel_cfg)
    else:
        debug('Running locally using ' +
              str(parallel_cfg.num_jobs(n_samples)) + ' thread(s)')
        return ThreadedView(n_samples, parallel_cfg)
Example #23
    def __init__(self, input_dir=None, silent=False, include_samples=None, exclude_samples=None,
                 genome_build=None, **kwargs):
        BaseProject.__init__(self, input_dir=input_dir, **kwargs)
        self.genome_build = genome_build

        debug(f'Parsing project {input_dir}')
        self.batch_by_name = DragenProject.find_batches(self.dir, silent=silent,
            include_samples=include_samples, exclude_samples=exclude_samples, parent_project=self)

        if len(self.batch_by_name) == 1:
            self.project_name = list(self.batch_by_name.values())[0].name
        else:
            self.project_name = basename(input_dir)
Example #24
 def _run_cmd(cmdl):
     log.debug(cmdl)
     proc = subprocess.Popen(cmdl.split(),
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE,
                             env=os.environ)
     for stdout_line in iter(proc.stdout.readline, None):
         if not stdout_line:
             break
         if not six.PY2:
             stdout_line = stdout_line.decode()
         if '#(' not in stdout_line.strip():
             _send_line(ws, stdout_line)
     log.debug('Exit from the subprocess')
Example #25
def safe_symlink_to(fpath, dst_dirpath, rel=False):
    if rel:
        fpath = os.path.relpath(fpath, dst_dirpath)

    dst = join(dst_dirpath, basename(fpath))
    if not exists(dst):
        try:
            if os.lstat(dst):  # broken symlink
                os.remove(dst)
        except OSError:
            pass
        debug('Symlink ' + fpath + ' -> ' + dst)
        os.symlink(fpath, dst)
    return dst
Example #26
def bam_to_bed(bam_fpath, to_gzip=True):
    debug(
        'Converting the BAM to BED to save some memory.'
    )  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz'
                                                   if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Example #27
def can_reuse(fpath, cmp_f, silent=False):
    """Check if a file `fpath` exists, is non-empty and is more recent than `cmp_f`
    """
    do_reuse = os.environ.get('REUSE', '1')
    if do_reuse == '0':
        return False
    if not fpath or not isfile(fpath):
        return False
    elif verify_file(fpath, cmp_f=cmp_f, silent=True):
        if not silent:
            debug('Reusing ' + fpath)
        return True
    else:
        return False
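A typical guard built on can_reuse (hypothetical; build_output is illustrative only):

# Skip regeneration when the output is newer than its input
if can_reuse(output_fpath, cmp_f=input_fpath):
    debug('Output is up to date')
else:
    build_output(input_fpath, output_fpath)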
Example #28
def get_or_create_run(projects, parall_view=None):
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))
    run = Run.find_by_projects(projects)

    if run and run.rerun_on_usercall:
        log.info()
        log.info('Rebuilding tree on usercall')
        build_tree(run)
        run.rerun_on_usercall = False
        db.session.commit()
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' +
                  ', '.join(p.name for p in projects))
        db.session.delete(run)
        db.session.commit()
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) +
                  ' with ID ' + str(run.id))
    else:
        log.debug('Creating new run for projects ' +
                  ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run
Example #30
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Example #32
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        import pybedtools
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = pybedtools.BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and not any(
            x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Example #33
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(
        adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'),
                          'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')),
                           proc_name + '.log')

    return output_dir, work_dir, log_fpath
Example #34
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)', param='genome')

    if short:
        if extended:        raise click.BadParameter('--short and --extended can\'t be set both', param='extended')
        if output_features: raise click.BadParameter('--short and --output-features can\'t be set both', param='output_features')
    elif output_features or extended:
        extended = True
        short    = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]', param='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True, description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if not work_dir:
        debug(f'Removing work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
Example #35
def main():
    options = [
        (['-g', '--genome'],
         dict(
             dest='genome',
             help='Genome build. Accepted values: ' +
             ', '.join(ebl.SUPPORTED_GENOMES),
         )),
        (['-c', '--canonical'],
         dict(
             dest='canonical',
             action='store_true',
             help='Use canonical only',
         )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical(
            'Error: please, specify genome build name with -g (e.g. `-g hg19`)'
        )
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' +
                        ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(
        join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
Example #36
File: gtf.py Project: pdiakumis/NGS_Utils
def get_gtf_db(gtf, in_memory=False):
    """
    create a gffutils DB
    """
    db_file = gtf + '.db'
    if gtf.endswith('.gz'):
        db_file = gtf[:-3] + '.db'
    if file_exists(db_file):
        return gffutils.FeatureDB(db_file)
    db_file = ':memory:' if in_memory else db_file
    if in_memory or not file_exists(db_file):
        debug('GTF database does not exist, creating...')
        infer_extent = guess_infer_extent(gtf)
        db = gffutils.create_db(gtf, dbfn=db_file,
                                infer_gene_extent=infer_extent)
        return db
    else:
        return gffutils.FeatureDB(db_file)
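A usage sketch (hypothetical GTF path; iteration uses the standard gffutils FeatureDB API):

# Build or load the database, then walk transcript features
db = get_gtf_db('ref-transcripts.gtf.gz')
for tx in db.features_of_type('transcript'):
    print(tx.id, tx.seqid, tx.start, tx.end)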
Example #37
    def calc_genomic_bp_pos(self):
        genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(
            self.trx, self.bp_offset)
        if genomic_coord is None:
            logger.critical(f'  Error: could not convert transcript {self.trx.id} '
                            f'offset {self.bp_offset} to genomic coordinate')
            return False

        if genomic_coord == -1:
            logger.debug(
                f'  Fusion in takes the entire transcript {self.trx.id} '
                f'(genomic_coord={genomic_coord}, bp_offset={self.bp_offset}). '
                f'That\'s suspicious, so we are skipping it.')
            return False

        self.bp_genomic_pos = genomic_coord
        self.bp_is_in_intron = is_in_intron
        return True
Example #38
def run_processing(project_names_line, redirect_to, email=None):
    pnames = project_names_line.split('--')

    log.debug(f'Received request to start analysis for {project_names_line}')
    run_id = hashlib.sha256(str(project_names_line).encode()).hexdigest()
    run_log = join(DATA_DIR, f'run_{run_id}.log')
    if isfile(run_log):
        msg = f'''<p>Run for projects {project_names_line} already started. Please, wait until it finished. 
                  <br>
                  <p>Follow the log at:</p>
                  <pre>{run_log}</pre>
                  <p>And reload the page when it\'s finished.</p>'''
    else:
        manage_py = abspath(join(dirname(__file__), '..', 'manage.py'))
        vardict = which('vardict')
        assert vardict, 'vardict is not in PATH. Are you running from "clearup" environment?'
        back_url = f"http://{clearup.HOST_IP}:{clearup.PORT}{redirect_to}"
        cmdl = f'{sys.executable} {manage_py} analyse_projects {project_names_line} --back_url={back_url}'
        if email:
            cmdl += f' --email={email}'
        log.debug(cmdl)
        process = subprocess.Popen(cmdl,
                                   stderr=subprocess.STDOUT,
                                   stdout=open(run_log, 'w'),
                                   env=os.environ,
                                   close_fds=True,
                                   shell=True)

        msg = f'''<p>Starting analysis with a command line:</p>
                  <pre>{cmdl}</pre>
                  <p>Process is running under ID={process.pid}. Follow the log at:</p>
                  <pre>{run_log}</pre>
                  <p>And reload the page when it\'s finished.</p>'''

    if email:
        msg += f'<br>When the run finished, you will be notified by an email sent to {email}'

    return render_template('submitted.html',
                           projects=pnames,
                           title='Comparing projects ' + ', '.join(pnames),
                           project_names_line=project_names_line,
                           redirect_to=redirect_to,
                           message=msg)
Example #39
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath
Example #40
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
Example #41
def _sex_from_x_snps(vcf_file):
    log.debug('Calling sex from ' + vcf_file)
    het_calls_num = 0
    hom_calls_num = 0
    for rec in VCF(vcf_file):
        if rec.CHROM == 'chrX':
            if rec.num_het > 0:
                het_calls_num += 1
            if rec.num_hom > 0:
                hom_calls_num += 1

    if het_calls_num + hom_calls_num > 10:
        if het_calls_num > 1.5 * hom_calls_num:
            return 'F'
        elif het_calls_num < 0.5 * hom_calls_num:
            return 'M'
        else:
            log.debug(
                'het/hom ratio on chrX is ' +
                str(het_calls_num / hom_calls_num) +
                ' - between 1.5 and 0.5, not confident enough to call sex.')
    else:
        log.debug('Total chrX calls number is ' +
                  str(het_calls_num + hom_calls_num) +
                  ' - less than 10, not confident enough to call sex.')
    return None
Example #42
def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    safe_mkdir(work_dirpath)
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = os.path.join(prank_out + '.best.dnd')
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for i, p in enumerate(projects)],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for i, p in enumerate(projects)],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
Example #43
File: bcbio.py Project: vladsaveliev/Utils
    def find_bam(self, silent=False):
        name = self.get_name_for_files()

        to_try = [
            '-ready.cram',
            '-ready.bam',
            '-sort.bam',
        ]
        for ext in to_try:
            fpath = adjust_path(join(self.dirpath, name + ext))
            if verify_file(fpath):
                return fpath

        input_file = self.sample_info['files']
        if not isinstance(input_file, str):
            input_file = input_file[0]
        if isinstance(input_file, str) and input_file.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not input_file.startswith('/'):
                input_file = abspath(join(self.bcbio_project.work_dir, input_file))
            if verify_file(input_file):
                debug('Using BAM file from input YAML ' + input_file)
                return input_file
            else:
                debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
Example #44
    def find_bam(self, silent=False):
        name = self.get_name_for_files()

        to_try = [
            '-ready.bam',
            '-ready.cram',
            '-sort.bam',
        ]
        for ext in to_try:
            fpath = adjust_path(join(self.dirpath, name + ext))
            if verify_file(fpath):
                return fpath

        input_file = self.sample_info['files']
        if not isinstance(input_file, str):
            input_file = input_file[0]
        if isinstance(input_file, str) and input_file.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not input_file.startswith('/'):
                input_file = abspath(join(self.parent_project.work_dir, input_file))
            if verify_file(input_file):
                debug('Using BAM file from input YAML ' + input_file)
                return input_file
            else:
                debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
Example #45
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
Example #46
File: bcbio.py Project: vladsaveliev/Utils
 def find_raw_vcf(self, silent=False, caller=None):
     caller = caller or self.bcbio_project.somatic_caller
     vcf_fpath = None
     if self.batch and self.phenotype != 'normal':
         vcf_fpath = self.bcbio_project.find_vcf_file(self.batch.name, silent=silent, caller=caller)
     if not vcf_fpath:  # in sample dir?
         if not silent:
             debug('-')
             debug('Not found VCF in the datestamp dir, looking at the sample-level dir')
             debug('-')
         vcf_fpath = self.bcbio_project.find_vcf_file_from_sample_dir(
             self, silent=silent or self.phenotype == 'normal', caller=caller)
     return vcf_fpath
Example #47
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
Example #48
 def __init__(self, n_samples, parallel_cfg):
     BaseView.__init__(self, n_samples, parallel_cfg)
     self._view = CV(**parallel_cfg.get_cluster_params(n_samples))
     debug('Starting cluster with ' + str(self.num_jobs) + ' open nodes, ' + str(self.cores_per_job) + ' cores per node')
Example #49
File: sex.py Project: vladsaveliev/Utils
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Example #50
 def run(self, fn, param_lists):
     debug('Starting multithreaded function ' + str(fn))
     assert self.n_samples == len(param_lists)
     return self._view(delayed(fn)(*params) for params in param_lists)
Example #51
File: bcbio.py Project: vladsaveliev/Utils
    def load_from_sample_info(sample_info, bcbio_project, exclude_samples=None,
                              include_samples=None, extra_batches=None, silent=False):
        # Get sample and batch names and exclude/include based on exclude_samples and include_samples
        description = str(sample_info['description']).replace('.', '_')

        batch_names = sample_info.get('metadata', dict()).get('batch')
        if isinstance(batch_names, int) or isinstance(batch_names, float):
            batch_names = str(batch_names)
        if isinstance(batch_names, str):
            batch_names = [batch_names]
        batch_names = [b.replace('.', '_') for b in batch_names if b]

        if exclude_samples:
            # Sample name
            if description in exclude_samples:
                if not silent: info(f'Skipping sample {description}')
                return None
            # Batch names
            if batch_names:
                filtered_batch_names = [b for b in batch_names if b not in exclude_samples]
                if not filtered_batch_names:
                    if not silent: info(f'Skipping sample {description} with batch info {", ".join(batch_names)}')
                    return None
                batch_names = filtered_batch_names

        if include_samples:
            # Sample name
            if description in include_samples:
                if not silent: info(f'Using sample {description} and all samples sharing batches {batch_names}')
            else:
                # Batch names
                if batch_names:
                    incl_batch_names = [b for b in batch_names if b in include_samples]
                    if incl_batch_names:
                        if not silent: info(f'Using sample {description} with batch info {", ".join(batch_names)}')
                    extr_batch_names = [b for b in batch_names if extra_batches and b in extra_batches]
                    if extr_batch_names and not incl_batch_names:
                        if not silent: info(f'Using sample {description} as it shares batches {extr_batch_names} with included samples')
                    incl_batch_names += extr_batch_names

                    if incl_batch_names:
                        batch_names = incl_batch_names
                    else:
                        return None

        # Creating BcbioSample object
        s = BcbioSample(bcbio_project)
        s.sample_info = sample_info
        if 'description_original' in sample_info:
            s.old_name = str(sample_info['description_original']).replace('.', '_')

        # Setting phenotype and batches
        s.phenotype = sample_info.get('metadata', dict()).get('phenotype', 'tumor')
        if not batch_names:
            batch_names = [s.get_name_for_files() + '-batch']
        if len(batch_names) > 1 and s.phenotype != 'normal':
            critical('Multiple batches for non-normal ' + s.phenotype + ' sample ' + s.name + ': ' + ', '.join(batch_names))
        s.batch_names = batch_names

        # Setting genome build based reference paths
        s.genome_build = sample_info['genome_build']
        s.variant_regions_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('variant_regions'))
        s.sv_regions_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('sv_regions')) or s.variant_regions_bed
        s.coverage_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('coverage')) or s.sv_regions_bed
        if s.coverage_bed and not isfile(s.coverage_bed):
            if not silent:
                debug('coverage bed ' + str(s.coverage_bed) + ' not found. Looking relatively to genomes "basedir"')
            try:
                import az
            except ImportError:
                pass
            else:
                genome_cfg = az.get_refdata(s.genome_build)
                ref_basedir = genome_cfg.get('basedir')
                if not ref_basedir:
                    critical('coverage bed ' + str(s.coverage_bed) + ' not found and "basedir" not provided in system config')
                s.coverage_bed = join(ref_basedir, 'coverage', 'prioritize', s.coverage_bed) + '.bed'

        s.is_rnaseq = 'rna' in sample_info['analysis'].lower()
        s.min_allele_fraction = (1.0/100) * float(sample_info['algorithm'].get('min_allele_fraction', 1.0))
        if s.variant_regions_bed is None:
            s.coverage_interval = 'genome'
        else:
            s.coverage_interval = 'regional'
        s.is_wgs = s.coverage_interval == 'genome'

        if s._set_name_and_paths(
            name=description,
            variantcallers_data=sample_info['algorithm'].get('variantcaller'),
            ensemble='ensemble' in sample_info['algorithm'],
            silent=silent):
            return s
        else:
            return None
Example #52
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
        # genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None
    
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ba.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(
                f'Cannot parse the reference BED file - unexpected number of fields '
                f'({len(inters_fields_list)} in {inters_fields_list}, less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ba.BedCols.cols]

        overlap_fields[:len(intersection_fields[ori_col_num:])] = intersection_fields[ori_col_num:]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ba.BedCols.GENE] if not high_confidence else overlap_fields[ba.BedCols.HUGO]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ba.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
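
The overlaps_by_tx_by_gene_by_loc structure built above is keyed by region, then gene, then transcript ID, with each leaf collecting (overlap_fields, overlap_size) pairs. A minimal sketch of the same nesting using plain collections.defaultdict (the region, gene and transcript values are made up):

from collections import defaultdict

# region -> gene -> transcript ID -> list of (overlap_fields, overlap_size)
overlaps = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
reg = ('chr1', 100, 500, ())  # (chrom, start, end, extra columns)
overlaps[reg]['GENE1']['ENST00000000001'].append((['chr1', '100', '400', 'GENE1'], 300))
overlaps[reg]['GENE1']['ENST00000000002'].append((['chr1', '150', '500', 'GENE1'], 350))

for reg, by_gene in overlaps.items():
    for gene, by_tx in by_gene.items():
        for tx_id, hits in by_tx.items():
            total_overlap = sum(size for _, size in hits)
            print(reg[0], reg[1], reg[2], gene, tx_id, total_overlap)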
Example #53
File: bcbio.py Project: vladsaveliev/Utils
    def find_vcf_file(self, batch_name, silent=False, caller=None):
        caller = caller or self.somatic_caller
        vcf_fname = batch_name + '-' + caller + '.vcf'
        annot_vcf_fname = batch_name + '-' + caller + '-annotated.vcf'

        vcf_annot_fpath_gz = adjust_path(join(self.date_dir, annot_vcf_fname + '.gz'))  # in datestamp
        var_raw_vcf_annot_fpath_gz = adjust_path(join(self.raw_var_dir, annot_vcf_fname + '.gz'))  # in datestamp/var/raw

        vcf_fpath_gz = adjust_path(join(self.date_dir, vcf_fname + '.gz'))  # in datestamp
        var_vcf_fpath_gz = adjust_path(join(self.var_dir, vcf_fname + '.gz'))  # in datestamp/var
        var_raw_vcf_fpath_gz = adjust_path(join(self.raw_var_dir, vcf_fname + '.gz'))  # in datestamp/var/raw

        vcf_fpath = adjust_path(join(self.date_dir, vcf_fname))  # in datestamp
        var_vcf_fpath = adjust_path(join(self.var_dir, vcf_fname))  # in datestamp/var
        var_raw_vcf_fpath = adjust_path(join(self.raw_var_dir, vcf_fname))  # in datestamp/var/raw

        if isfile(vcf_annot_fpath_gz):
            verify_file(vcf_annot_fpath_gz, is_critical=True)
            if not silent: info('Found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)
            return vcf_annot_fpath_gz
        else:
            debug('Not found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)

        if isfile(var_raw_vcf_annot_fpath_gz):
            verify_file(var_raw_vcf_annot_fpath_gz, is_critical=True)
            if not silent: info('Found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)
            return var_raw_vcf_annot_fpath_gz
        else:
            debug('Not found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp dir ' + vcf_fpath_gz)
            return vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp dir ' + vcf_fpath_gz)

        if isfile(var_raw_vcf_fpath_gz):
            verify_file(var_raw_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)
            return var_raw_vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)

        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp dir ' + vcf_fpath)
            return vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp dir ' + vcf_fpath)

        if isfile(var_raw_vcf_fpath):
            verify_file(var_raw_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)
            return var_raw_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)

        if isfile(var_vcf_fpath_gz):
            verify_file(var_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)
            return var_vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)

        if isfile(var_vcf_fpath):
            verify_file(var_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)
            return var_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)

        if not silent:
            warn('Warning: no VCF found for batch ' + batch_name + ' (' + caller + '), neither gzipped nor '
                'uncompressed, in the datestamp, var or var/raw directories.')
        return None
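
The lookup above is a priority-ordered existence check over candidate paths. A standalone sketch of the same pattern in the same priority order (the helper name and directory arguments are illustrative, not part of the project's API):

from os.path import isfile, join

def find_first_existing_vcf(date_dir, var_dir, raw_var_dir, batch_name, caller):
    # Same priority as above: annotated before plain, datestamp and datestamp/var/raw
    # before datestamp/var, and gzipped before uncompressed within each group.
    base = batch_name + '-' + caller
    candidates = [
        join(date_dir, base + '-annotated.vcf.gz'),
        join(raw_var_dir, base + '-annotated.vcf.gz'),
        join(date_dir, base + '.vcf.gz'),
        join(raw_var_dir, base + '.vcf.gz'),
        join(date_dir, base + '.vcf'),
        join(raw_var_dir, base + '.vcf'),
        join(var_dir, base + '.vcf.gz'),
        join(var_dir, base + '.vcf'),
    ]
    return next((p for p in candidates if isfile(p)), None)

Keeping the candidates in a single list makes the priority order explicit and avoids the repeated if/else blocks.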
Example #54
File: bcbio.py Project: vladsaveliev/Utils
    @staticmethod
    def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
        caller = caller or sample.bcbio_project.somatic_caller
        vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'

        sample_var_dirpath = join(sample.dirpath, 'var')
        vcf_fpath_gz = adjust_path(join(sample.dirpath, vcf_fname + '.gz'))  # in sample dir
        var_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, vcf_fname + '.gz'))  # in var
        var_raw_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname + '.gz'))  # in var/raw
        vcf_fpath = adjust_path(join(sample.dirpath, vcf_fname))  # in sample dir
        var_vcf_fpath = adjust_path(join(sample_var_dirpath, vcf_fname))  # in var
        var_raw_vcf_fpath = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname))  # in var/raw

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF ' + vcf_fpath_gz)
            return vcf_fpath_gz
        else:
            debug('Not found VCF ' + vcf_fpath_gz)

        if isfile(var_vcf_fpath_gz):
            verify_file(var_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the var/ dir ' + var_vcf_fpath_gz)
            return var_vcf_fpath_gz
        else:
            debug('Not found VCF in the var/ dir ' + var_vcf_fpath_gz)

        if isfile(var_raw_vcf_fpath_gz):
            verify_file(var_raw_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)
            return var_raw_vcf_fpath_gz
        else:
            debug('Not found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)

        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF ' + vcf_fpath)
            return vcf_fpath
        else:
            debug('Not found uncompressed VCF ' + vcf_fpath)

        if isfile(var_vcf_fpath):
            verify_file(var_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the var/ dir ' + var_vcf_fpath)
            return var_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the var/ dir ' + var_vcf_fpath)

        if isfile(var_raw_vcf_fpath):
            verify_file(var_raw_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the var/raw/ dir ' + var_raw_vcf_fpath)
            return var_raw_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the var/raw/ dir ' + var_raw_vcf_fpath)

        if not silent:
            warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), neither gzipped nor uncompressed, '
                'inside or outside the var directory. Phenotype is ' + str(sample.phenotype))
        return None
Example #55
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
                                       x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
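
A hedged usage sketch of annotate() as defined above (the input and output paths and the genome build are placeholders; keyword arguments follow the signature at the top of this example):

work_dir = safe_mkdir('annotate_work')
annotate(
    'targets.bed', 'targets.anno.bed', work_dir,
    genome='hg19',
    only_canonical=True,   # keep only canonical transcripts
    extended=True,         # write per-transcript overlap columns plus a commented header
)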