def get_vcf_readers(mutations_by_experiment, cur_group_num):
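    """Return two dicts mapping experiment -> vcf.Reader: one over the raw
    annotated VCFs and one over the filtered VCFs. Only experiments in the
    given group are included (when cur_group_num is set); the 'vardict'
    caller is preferred, with 'vardict-java' as the fallback."""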
    vcf_readers, filt_vcf_readers = dict(), dict()
    for e, muts in mutations_by_experiment.items():
        if not cur_group_num or get_group_num(e.key) == cur_group_num:
            variant_caller = 'vardict' if 'vardict' in e.sample.variantcallers else 'vardict-java'
            vcf_fpath = e.sample.vcf_by_callername.get(variant_caller)
            if vcf_fpath:
                vcf_readers[e] = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))
                filt_vcf_fpath = e.sample.find_filt_vcf_by_callername(variant_caller)
                if filt_vcf_fpath:
                    filt_vcf_readers[e] = vcf.Reader(open_gzipsafe(filt_vcf_fpath, 'r'))
    return vcf_readers, filt_vcf_readers
def get_transcripts_with_exons_from_features(features_file, cur_chrom=None):
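    """Collect exon-like features ('Exon', 'CDS', 'UTR') from a tab-separated
    features file into a dict keyed by (transcript_id, chrom). The column
    layout is assumed to be this project's features format: chrom/start/stop
    in columns 0-2, feature type in column 6, transcript id in column 8."""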
    transcripts = defaultdict(list)
    with open_gzipsafe(adjust_path(features_file)) as in_f:
        for line in in_f:
            if line.startswith('#'):
                continue
            fields = line.strip('\n').split('\t')

            chrom = fields[0]
            if cur_chrom and chrom != cur_chrom:
                continue

            feature_type = fields[6]
            if feature_type not in ['Exon', 'CDS', 'UTR']:
                continue

            start = int(fields[1])
            stop = int(fields[2])
            transcript_id = fields[8]

            exon = {
                'transcript_id': transcript_id,
                'chrom': chrom,
                'start': start,
                'stop': stop
            }
            transcripts[(transcript_id, chrom)].append(exon)
    return transcripts
def _get_gene_transcripts_id(cnf):
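    """Parse the genome's all_transcripts GTF and return two dicts:
    gene_id -> gene_symbol and transcript_id -> gene_symbol."""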
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for genome ' + cnf.genome.name + ' was not found! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for genome ' + cnf.genome.name + ' at ' + cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcript and gene IDs from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip the malformed line instead of hitting an undefined _prop_dict below

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            #gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol
    return genes_dict, transcripts_dict
Example #4
def main():
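    """Extract per-allele CAF (allele frequency) values from multi-allelic
    dbSNP records and write them out as chrom, pos, rsid, ref, alt, caf
    tab-separated lines."""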
    dbsnp_fpath, out_fpath = get_args()

    info('-' * 70)
    info('Reading ' + dbsnp_fpath + ', writing to ' + out_fpath)

    with open_gzipsafe(dbsnp_fpath) as dbsnp, open(out_fpath, 'w') as out:
        for l in dbsnp:
            if l.startswith('#'):
                continue
            fs = l.replace('\n', '').split('\t')
            assert len(fs) == 8, str(fs)

            chrom, pos, rsid, ref, alt, _, _, inf = fs
            alts = alt.split(',')
            if len(alts) > 1:
                caf = next(
                    (kv.split('=')[1]
                     for kv in inf.split(';') if kv.split('=')[0] == 'CAF'),
                    None)
                if caf:
                    cafs = caf.replace('[', '').replace(']', '').split(',')[1:]
                    assert len(cafs) == len(alts), l
                    for alt, caf in zip(alts, cafs):
                        if caf != '.':
                            l = '\t'.join([chrom, pos, rsid, ref, alt, caf
                                           ]) + '\n'
                            out.write(l)

    info()
    info('Saved to ' + out_fpath)
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
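    """Verify that vcf_fpath exists, is non-empty, starts with a
    ##fileformat=VCF header, and is parseable by PyVCF. Returns the path
    on success (including header-only VCFs) and None otherwise."""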
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except Exception:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical: raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            next(reader)  # just check that a single record can be parsed
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except Exception as e:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug(type(e).__name__ + ' parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
    finally:
        vcf.close()
def vcf_is_empty(cnf, vcf_fpath):
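    """Return True if the VCF at vcf_fpath contains no records."""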
    vcf = open_gzipsafe(vcf_fpath)
    reader = vcf_parser.Reader(vcf)
    result = next(reader, None) is None  # stop at the first record; no need to scan the whole file
    vcf.close()
    return result
def read_sample_names_from_vcf(vcf_fpath):
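    """Return the sample names from the #CHROM header line of a VCF
    (the columns after FORMAT), or [] if the file has no sample columns."""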
    with open_gzipsafe(vcf_fpath) as f:
        basic_fields = next(
            (l.strip()[1:].split() for l in f
             if l.strip().startswith('#CHROM')), None)
    if not basic_fields:
        critical('Error: no VCF header in ' + vcf_fpath)
    if len(basic_fields) < 9:
        return []
    return basic_fields[9:]
Example #8
def run_fastq(cnf,
              sample_name,
              l_r_fpath,
              r_r_fpath,
              output_dirpath,
              downsample_to=1e7):
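    """Optionally downsample a pair of fastq files, concatenate the reads,
    and run FastQC on the combined file. Returns the path to the FastQC
    report directory."""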
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    l_fpath, r_fpath = l_r_fpath, r_r_fpath
    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        # use the downsampled files below, not the original inputs
        l_fpath, r_fpath = downsample(cnf,
                                      sample_name,
                                      l_r_fpath,
                                      r_r_fpath,
                                      downsample_to,
                                      output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_fpath).read())
        out.write(open_gzipsafe(r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        **locals())
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
def check_file_changed(cnf, new, in_work):
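    """Turn off "reuse_intermediate" in cnf if the working copy in_work is
    missing, or if the input file new differs from it by name or md5."""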
    if not file_exists(in_work):
        cnf['reuse_intermediate'] = False

    if cnf.get('reuse_intermediate'):
        if (basename(in_work) != basename(new) or
            md5_for_file(open(in_work, 'rb')) !=
            md5_for_file(open_gzipsafe(new, 'rb'))):

            info('Input file %s changed, setting "reuse_intermediate" '
                'to False.' % str(new))
            cnf['reuse_intermediate'] = False
def _get_qual_threshold(input_fpath):
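    """Scan the VCF header for a FILTER line of the form
    ##FILTER=<ID=qN,Description="Mean Base Quality Below N"> and return
    the threshold N as an int, or None if no such line is present."""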
    qual_threshold = None
    q_filter_regex = re.compile(r'##FILTER=<ID=q(\d+),Description="Mean Base Quality Below \d+">')
    with open_gzipsafe(input_fpath) as f:
        for l in f:
            if not l.startswith('##'):
                break
            m = q_filter_regex.match(l)
            if m:
                qual_threshold = int(m.group(1))
                break
    return qual_threshold
Example #11
def _get_subs_and_indel_stats(vcf_fpath, chr_lengths, plot_scale):
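    """Collect plotting data from a VCF: per-chromosome variant counts
    binned by plot_scale, a REF->ALT substitution matrix for SNPs, and a
    list of indel lengths."""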
    reader = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))

    variants_distribution = dict()
    for chr_name, chr_length in chr_lengths:
        variants_distribution[chr_name] = [0] * max(1, chr_length // plot_scale)
    variants_distribution['OTHER'] = 0

    substitutions = OrderedDict()
    nucleotides = ['A', 'C', 'G', 'T']

    def _add_nuc(nuc):
        # give every existing row a column for the new nucleotide first,
        # so lookups for novel alleles never hit a missing key
        for nuc2 in substitutions:
            substitutions[nuc2].setdefault(nuc, 0)
        substitutions[nuc] = OrderedDict()
        for nuc2 in nucleotides:
            if nuc != nuc2:
                substitutions[nuc][nuc2] = 0

    for nuc in nucleotides:
        _add_nuc(nuc)

    indel_lengths = []
    for rec in reader:
        # for variants distribution plot
        if rec.CHROM not in variants_distribution:
            variants_distribution['OTHER'] += 1
        else:
            region_id = min((rec.POS - 1) // plot_scale, len(variants_distribution[rec.CHROM]) - 1)
            variants_distribution[rec.CHROM][region_id] += 1
        # for substitution and indel plots
        for alt in rec.ALT:
            if rec.is_snp:
                if rec.REF not in substitutions:
                    nucleotides.append(rec.REF)
                    _add_nuc(rec.REF)
                if alt.sequence not in substitutions:
                    nucleotides.append(alt.sequence)
                    _add_nuc(alt.sequence)
                substitutions[rec.REF][str(alt)] += 1
            elif rec.is_indel:
                if alt is None:
                    indel_lengths.append(-1)
                else:
                    indel_lengths.append(len(alt) - len(rec.REF))

    # the last region in each chromosome is not exactly equal to plot_scale
    for chr_name, chr_length in chr_lengths:
        last_region_length = chr_length % plot_scale + (0 if chr_length < plot_scale else plot_scale)
        variants_distribution[chr_name][-1] = int(variants_distribution[chr_name][-1] * plot_scale /
                                                  float(last_region_length))
    return variants_distribution, substitutions, indel_lengths
def parse_variants(vcf_fpath, only_pass=True):
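    """Parse a SnpEff-annotated VCF into a dict of chrom -> list of variant
    dicts (pos, ref, alt, transcripts), one per alt allele, keeping only
    coding annotations whose Feature_ID starts with 'NM' (RefSeq mRNA)."""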
    variants_by_chrom = defaultdict(list)
    ann_field_names = []  # guard against VCFs that lack the ##INFO=<ID=ANN header
    with open_gzipsafe(vcf_fpath) as vcf:
        for line in vcf:
            line = line.strip('\n')
            if line.startswith('##INFO=<ID=ANN'):
                ann_field_names = line.split('Format: ')[-1].strip('">').split('|')
                ann_field_names = [f.strip() for f in ann_field_names]
                ann_field_names[0] = ann_field_names[0].split('\'')[1]
            if line.startswith('#'):
                continue

            fields = line.split('\t')
            # check the FILTER column rather than substring-matching the whole line
            if only_pass and fields[6] != 'PASS':
                continue
            info_field = dict((x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[7]))
            annotation_array = info_field['ANN'].split(',') if 'ANN' in info_field else []
            all_annotations = [dict(zip(ann_field_names, x.split('|'))) for x in annotation_array if len(ann_field_names) == len(x.split('|'))]
            coding_annotations = [ann for ann in all_annotations if ann['Feature_ID'].startswith('NM')]

            chrom = fields[0]
            alt_alleles = fields[4].split(',')

            # build a separate variant dict for each alt allele;
            # reusing one dict would append the same object repeatedly
            for alt_allele in alt_alleles:
                annotations = [ann for ann in coding_annotations if ann['Allele'] == alt_allele]

                variant = dict()
                variant['chrom'] = chrom
                variant['pos'], variant['ref'], variant['alt'] = get_minimal_representation(fields[1], fields[3], alt_allele)
                variant['transcripts'] = set()
                for annotation in annotations:
                    transcript = annotation['Feature_ID'].split('.')[0]
                    variant['transcripts'].add(transcript)

                variant['transcripts'] = list(variant['transcripts'])
                variants_by_chrom[chrom].append(variant)
    return variants_by_chrom
Example #13
def main():
    if len(sys.argv) < 4:
        info('The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.')
        # info('When the gene name is found in HGNC, it gets replaced with an approved name.')
        # info('If the gene is not characterized (like LOC729737), this symbol is just kept as is.')
        info('')
        info('Usage:')
        info('    ' + __file__ + ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path + '] [additional_feature_list]')
        info('')
        info('     where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:')
        info('     #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome   Ensembl Gene ID   UCSC ID(supplied by UCSC)')
        info('     OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43     ENSG00000121410   uc002qsg.3')
        info('     ...')
        info('')
        info('     or DB is an Ensembl GTF, e.g. ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz:')
        info('     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";')
        info('     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";')
        info('     ...')
        info('')
        info('     or DB is a RefSeq GFF3, e.g. ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz:')
        info('     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA')
        info('     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true')
        info('     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('     ...')
        info('')
        info('     or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:')
        info('     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol')
        info('     uc001aaa.3            chr1                  +                      11873                   14409                 3                         11873,12612,13220,         12227,12721,14409,        DDX11L1')
        info('     ...')
        info('')
        info('     Writes to Exons.bed')
        info('')
        info('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols')
        sys.exit(1)

    genome_name = sys.argv[1]
    seq_fpath = hg19_seq_fpath if genome_name == 'hg19' else hg38_seq_fpath
    canonical_transcripts_fpath = canonical_hg19_transcripts_fpath if genome_name == 'hg19' else canonical_hg38_transcripts_fpath
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2], is_critical=True)
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided: ' + synonyms_fpath)
    else:
        info('No synonyms file provided, skipping HGNC approval')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(l.strip().split('.')[0] for l in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        l = inp.readline()
        # the format of the *input* database determines which parser to use
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath,
                                                       chr_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath,
                                                       chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) +
             ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

        # with open('serialized_genes.txt', 'w') as f:
        #     for g in gene_by_name.values():
        #         f.write(str(g) + '\t' + str(g.db_id) + '\n')
        #         for e in g.exons:
        #             f.write('\t' + str(e) + '\n')

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)
    ]

    coding_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'protein_coding' for t in g.transcripts)
    ]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding'
    ]
    mirna_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
    ]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA'
    ]
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA'
               for t in g.transcripts) and any(t.biotype == 'protein_coding'
                                               for t in g.transcripts)
    ]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) +
         ' genes with both coding and miRNA transcripts')

    info()
    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n   ' + output_fpath + '\n   ' +
         canon_output_fpath)
Example #14
def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants,
                    mutations, vcf2txt_res_fpath):
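    """Write the filtered VCFs for a sample: records of known mutations are
    re-marked as PASS, and everything else is annotated with the
    vcf2txt/vardict2mut filter that rejected it. Returns the uncompressed
    filtered VCF path."""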
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
         ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' +
             var_sample.pass_filt_vcf_fpath + ' exist; reusing.')

    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))

        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
             file_transaction(work_dir, ungz) as filt_tx, \
             file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath +
                     ', writing to ' + ungz + ' and ' +
                     var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write(
                                '##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n'
                            )
                            filt_f.write(
                                '##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n'
                            )
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val +
                                                 ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' +
             var_sample.pass_filt_vcf_fpath)

        if False:
            info()
            info(var_sample.name +
                 ((', ' + caller_name) if caller_name else '') +
                 ': writing filtered TSVs')
            # Converting to TSV - saving .anno.filt.tsv
            if 'tsv_fields' in cnf.annotation and cnf.tsv:
                tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
                if not tmp_tsv_fpath:
                    err('TSV conversion didn\'t work')
                else:
                    if isfile(var_sample.filt_tsv_fpath):
                        os.remove(var_sample.filt_tsv_fpath)
                    shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

                info(var_sample.name +
                     ((', ' + caller_name) if caller_name else '') +
                     ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz
Example #15
def downsample(cnf,
               sample_name,
               fastq_L_fpath,
               fastq_R_fpath,
               N,
               output_dir,
               suffix=None,
               quick=False):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    quick=True will just grab the first N reads rather than do a true
    downsampling
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = None
    if fastq_R_fpath:
        r_out_fpath = join(output_dir,
                           add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(l_out_fpath, silent=True) \
            and (not r_out_fpath or verify_file(r_out_fpath, silent=True)):
        info(l_out_fpath + ((' and ' + r_out_fpath) if r_out_fpath else '') +
             ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) // 4
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' +
                 str(LIMIT) + ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) +
                 ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else l_out_fpath

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1, tx_out_f2 = tx_out_files, None
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if tx_out_f2:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) +
                     ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) +
                     ' records, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) +
             ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
         ((' and ' + r_out_fpath) if r_out_fpath else '') + ', total ' +
         str(written_records) + ' reads written')
    return l_out_fpath, r_out_fpath
def read_samples_info_and_split(common_cnf, options, inputs):
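    """Build an OrderedDict of sample name -> sample config from command-line
    options and/or the 'details' section of the run info config, copying
    each input VCF into its work directory first."""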
    #TODO: _set_up_dirs(cnf) for each sample

    info('')
    info('Processing input details...')

    details = None
    for key in inputs:
        if options.get(key):
            common_cnf[key] = adjust_path(options[key])
            info('Using ' + common_cnf[key])
            details = [common_cnf]
    if not details:
        details = common_cnf.get('details')
    if not details:
        critical('Please, provide input ' + ', '.join(inputs) +
                 ' in command line or in run info yaml config.')

    all_samples = OrderedDict()

    for one_item_cnf in details:
        if 'vcf' not in one_item_cnf:
            critical('ERROR: A section in details does not contain the field "vcf".')
        one_item_cnf['vcf'] = adjust_path(one_item_cnf['vcf'])
        verify_file(one_item_cnf['vcf'], 'Input file', is_critical=True)

        join_parent_conf(one_item_cnf, common_cnf)

        work_vcf = join(one_item_cnf['work_dir'], basename(one_item_cnf['vcf']))
        check_file_changed(one_item_cnf, one_item_cnf['vcf'], work_vcf)
        if not one_item_cnf.get('reuse_intermediate'):
            with open_gzipsafe(one_item_cnf['vcf']) as inp, open_gzipsafe(work_vcf, 'w') as out:
                out.write(inp.read())
        one_item_cnf['vcf'] = work_vcf

        vcf_header_samples = read_sample_names_from_vcf(one_item_cnf['vcf'])

        # MULTIPLE SAMPLES
        if ('samples' in one_item_cnf or one_item_cnf.get('split_samples')) and len(vcf_header_samples) > 0:
            sample_cnfs = _verify_sample_info(one_item_cnf, vcf_header_samples)

            for header_sample_name in vcf_header_samples:
                if header_sample_name not in sample_cnfs:
                    sample_cnfs[header_sample_name] = one_item_cnf.copy()

                if header_sample_name in all_samples:
                    critical('ERROR: duplicated sample name: ' + header_sample_name)

                cnf = all_samples[header_sample_name] = sample_cnfs[header_sample_name]
                cnf['name'] = header_sample_name
                if cnf.get('keep_intermediate'):
                    cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

                # cnf['vcf'] = extract_sample(cnf, one_item_cnf['vcf'], cnf['name'])
                info()

        # SINGLE SAMPLE
        else:
            cnf = one_item_cnf

            if 'bam' in cnf:
                cnf['bam'] = adjust_path(cnf['bam'])
                verify_bam(cnf['bam'], is_critical=True)

            cnf['name'] = splitext_plus(basename(cnf['vcf']))[0]

            if cnf.get('keep_intermediate'):
                cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

            cnf['vcf'] = work_vcf
            all_samples[cnf['name']] = cnf

    if not all_samples:
        info('No samples.')
    else:
        info('Using samples: ' + ', '.join(all_samples) + '.')

    return all_samples
Example #17
def make_report(cnf, vcf_fpath, sample):
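    """Count variant statistics (SNPs, indels, novel/dbSNP/COSMIC membership,
    het/hom and Ti/Tv ratios) over the PASS records of a VCF and return them
    as a SampleReport."""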
    set_db_versions(cnf)
    step_greetings('Quality control reports')

    total_with_rejected = 0
    total = 0
    snps = 0
    inss = 0
    dels = 0
    dbsnps = 0
    cosmics = 0
    novels = 0
    hets = 0
    homs = 0
    transitions = 0
    transversions = 0

    with open_gzipsafe(vcf_fpath) as f:
        reader = vcf_parser.Reader(f)
        for rec in (vcf_processing.Record(rec, vcf_fpath, i)
                    for i, rec in enumerate(reader)):
            total_with_rejected += 1

            if not rec.FILTER or rec.FILTER == 'PASS':
                if rec.FILTER:
                    warn('Warn: ' + rec.get_variant() + ' FILTER=' +
                         str(rec.FILTER))

                total += 1

                if rec.is_snp:
                    snps += 1
                    if rec.is_transition:
                        transitions += 1
                    elif len(rec.ALT) == 1:
                        transversions += 1
                elif rec.is_indel:
                    if rec.is_deletion:
                        dels += 1
                    elif len(rec.ALT) == 1:
                        inss += 1

                if not rec.ID:
                    novels += 1
                else:
                    ids = rec.ID
                    if isinstance(ids, basestring):
                        ids = [ids]
                    if any(_id.startswith('COS') for _id in ids):
                        cosmics += 1
                    if any(_id.startswith('rs') for _id in ids):
                        dbsnps += 1

                call = rec.samples[0]
                if call.called:
                    if call.gt_type == 1:
                        hets += 1
                    elif call.gt_type == 2:
                        homs += 1

    report = SampleReport(sample, metric_storage=metric_storage)
    report.add_record('Total variants', total)
    report.add_record('SNPs', snps)
    report.add_record('Insertions', inss)
    report.add_record('Deletions', dels)
    report.add_record('Novel', novels)
    report.add_record('Novel, %', 1.0 * novels / total if total else None)
    report.add_record('In dbSNP', dbsnps)
    report.add_record('In dbSNP, %', 1.0 * dbsnps / total if total else None)
    report.add_record('In Cosmic', cosmics)
    report.add_record('In Cosmic, %', 1.0 * cosmics / total if total else None)
    report.add_record('Het/hom', float(hets) / homs if homs != 0 else None)
    report.add_record(
        'Ti/tv',
        float(transitions) / transversions if transversions != 0 else None)
    report.add_record('Total with rejected', total_with_rejected)

    return report
def convert_vardict_txts_to_bcbio_vcfs(cnf,
                                       bs,
                                       sample,
                                       output_dir=None,
                                       pass_only=False):
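    """Rebuild bcbio-style filtered VCFs (all calls and PASS-only) from
    vardict.txt PASS/REJECT mutation calls, then bgzip and tabix them.
    Returns the two gzipped paths (the first is None when pass_only)."""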
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(
        output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(
            output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath +
                                                     '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath +
             '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(
        cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
        file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')
                    ]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

        # close the writers while still inside the transaction, so all
        # records are flushed before file_transaction finalizes the files
        if vcf_writer:
            vcf_writer.close()
        vcf_pass_writer.close()

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' +
             output_gzipped_vcf_fpath)
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' +
         output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath