示例#1
0
 def __init__(self, args):
     """Build ``self.samples`` from the database named by ``args.db``.

     One entry is taken from ``gq.idx_to_sample_object`` for every index in
     ``range(len(subjects))``, so the list follows index order.
     """
     gq = GeminiQuery(args.db)
     subjects = get_subjects(args)
     # walk the indexes in ascending order so samples line up with genotypes
     ordered_samples = []
     for position in range(len(subjects)):
         ordered_samples.append(gq.idx_to_sample_object[position])
     self.samples = ordered_samples
示例#2
0
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    Runs a query over somatic structural variants joined to ``gene_summary``
    and passes candidates to ``report_fusion``:

    * a row whose ``sub_type`` is not 'complex' is reported on its own;
    * a 'complex' row with an ``sv_mate_id`` is reported together with the
      previous such row when both share the same ``sv_event_id``.

    ``args.min_qual`` and ``args.evidence_type``, when set, append extra
    conditions to the WHERE clause.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # optional WHERE-clause fragments derived from the command line args
    qual_clause = " AND qual >= %s" % args.min_qual if args.min_qual else ""
    evidence_clause = (" AND sv_evidence_type = '%s'" % args.evidence_type
                       if args.evidence_type else "")

    query = """SELECT variants.chrom, start, end,
                      ref, alt,
                      qual,
                      is_somatic, somatic_score,
                      type, sub_type, variants.gene, 
                      sv_strand, sv_length,
                      sv_cipos_start_left,
                      sv_cipos_start_right,
                      sv_cipos_end_left,
                      sv_cipos_end_right,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type,
                      sv_is_precise,
                      gene_summary.strand,
                      gene_summary.transcript_min_start,
                      gene_summary.transcript_max_end,
                      gene_summary.in_cosmic_census
               FROM variants, gene_summary
               WHERE is_somatic = 1   
               AND   type = 'sv'
               AND   variants.gene is not NULL
               AND   variants.chrom = gene_summary.chrom
               AND   variants.gene = gene_summary.gene
               %s
               %s
               ORDER BY sv_event_id
            """ % (qual_clause, evidence_clause)

    # most recent mated 'complex' row; rows arrive ordered by sv_event_id
    last_mated = None
    gq.run(query)
    for row in gq:
        if row['sub_type'] != 'complex':
            # single-line variants (DEL, DUP, INV)
            report_fusion([row], subjects_dict, args)
            continue

        if not row['sv_mate_id']:
            # 'complex' row without a mate: nothing to pair it with
            continue

        # multi-line variants (BND): pair with the previous end of the event
        if last_mated and row['sv_event_id'] == last_mated['sv_event_id']:
            report_fusion([last_mated, row], subjects_dict, args)
        last_mated = row
示例#3
0
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    The query selects somatic 'sv' variants that have a gene, joined to
    ``gene_summary`` on chromosome and gene, ordered by ``sv_event_id``.
    Rows whose ``sub_type`` is not 'complex' go to ``report_fusion`` alone;
    'complex' rows that carry an ``sv_mate_id`` are reported as a pair with
    the preceding such row when their ``sv_event_id`` values match.

    ``args.min_qual`` / ``args.evidence_type`` optionally narrow the query.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # translate the command line filters into SQL fragments (empty if unset)
    min_qual_sql = ""
    if args.min_qual:
        min_qual_sql = " AND qual >= %s" % args.min_qual
    evidence_sql = ""
    if args.evidence_type:
        evidence_sql = " AND sv_evidence_type = '%s'" % args.evidence_type

    query = """SELECT variants.chrom, start, end,
                      ref, alt,
                      qual,
                      is_somatic, somatic_score,
                      type, sub_type, variants.gene, 
                      sv_strand, sv_length,
                      sv_cipos_start_left,
                      sv_cipos_start_right,
                      sv_cipos_end_left,
                      sv_cipos_end_right,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type,
                      sv_is_precise,
                      gene_summary.strand,
                      gene_summary.transcript_min_start,
                      gene_summary.transcript_max_end,
                      gene_summary.in_cosmic_census
               FROM variants, gene_summary
               WHERE is_somatic = 1   
               AND   type = 'sv'
               AND   variants.gene is not NULL
               AND   variants.chrom = gene_summary.chrom
               AND   variants.gene = gene_summary.gene
               %s
               %s
               ORDER BY sv_event_id
            """ % (min_qual_sql, evidence_sql)

    previous_end = None
    gq.run(query)
    for row in gq:
        is_complex = row['sub_type'] == 'complex'
        if not is_complex:
            # single-line variants (DEL, DUP, INV) are reported by themselves
            report_fusion([row], subjects_dict, args)
        elif row['sv_mate_id']:
            # multi-line variants (BND): report once both ends of the same
            # event have been seen back to back
            same_event = (previous_end and
                          row['sv_event_id'] == previous_end['sv_event_id'])
            if same_event:
                report_fusion([previous_end, row], subjects_dict, args)
            previous_end = row
示例#4
0
def run_query(args):
    """
    Run the user's query and print the resulting rows.

    With ``args.use_header`` the header is printed first (when there is
    one).  Without ``args.dgidb`` each row is printed as-is.  With
    ``args.dgidb`` the result set is consumed once to collect the genes,
    ``query_dgidb`` is called with them, and the query is then executed a
    second time so every row can be printed with its DGIdb entry appended.

    Both executions go through the same helper so they are configured
    identically.  Previously the rerun passed ``kwargs`` (the bcolz
    ``variant_id_getter``) to ``run()`` instead of to the ``GeminiQuery``
    constructor, unlike the first execution.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        subjects = []
    kwargs = {}
    if args.bcolz:
        import gemini_bcolz
        kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _execute():
        # Build a fresh GeminiQuery and run the user's query on it.
        query_runner = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                               **kwargs)
        query_runner.run(args.query,
                         args.gt_filter,
                         args.show_variant_samples,
                         args.sample_delim,
                         predicates,
                         genotypes_needed,
                         gene_needed,
                         args.show_families,
                         subjects=subjects)
        return query_runner

    gq = _execute()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
    else:
        # collect a list of all the genes that need to be queried
        # from DGIdb
        genes = defaultdict()
        for row in gq:
            genes[row['gene']] = True

        # collect info from DGIdb
        dgidb_info = query_dgidb(genes)

        # rerun the query (the cursor is now consumed)
        gq = _execute()

        # report the query results with DGIdb info added at the end.
        for row in gq:
            print(str(row) + "\t" + str(dgidb_info[row['gene']]))
示例#5
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygotes.

    Rows are consumed in the order the query yields them.  Heterozygous
    sites are accumulated per sample and gene; whenever the gene of the
    current row differs from the previous row's gene, the accumulated sites
    are passed through ``find_valid_het_pairs`` and ``filter_candidates``
    and the accumulator is reset.  The last accumulated group is processed
    after the loop.
    """
    def fresh_accumulator():
        # sample -> gene -> list of het sites
        return collections.defaultdict(
            lambda: collections.defaultdict(list))

    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(create_query(args))

    hets_by_sample = fresh_accumulator()
    last_gene = None
    n_reported = 0

    # output header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    for row in gq:
        types = row['gt_types']
        bases = row['gts']
        phases = row['gt_phases']
        this_gene = row['gene']

        # a new gene started: flush what was gathered for the previous one
        if last_gene is not None and this_gene != last_gene:
            het_pairs = find_valid_het_pairs(args, hets_by_sample)
            n_reported = filter_candidates(args, het_pairs,
                                           subjects_dict, n_reported)
            hets_by_sample = fresh_accumulator()

        site = Site(row)
        for idx, gt_type in enumerate(types):
            if gt_type != HET:
                continue

            sample = idx_to_sample[idx]
            sample_site = copy(site)
            sample_site.phased = phases[idx]

            # unphased genotypes only count when phasing is ignored
            if not (sample_site.phased or args.ignore_phasing):
                continue

            sample_site.gt = bases[idx]
            hets_by_sample[sample][site.row['gene']].append(sample_site)

        last_gene = this_gene

    # flush the final group of sites
    het_pairs = find_valid_het_pairs(args, hets_by_sample)
    n_reported = filter_candidates(args, het_pairs,
                                   subjects_dict, n_reported)
示例#6
0
    def __init__(self, args):
        """Group subject keys by their value of the carrier-summary attribute.

        ``self.column_types`` holds the distinct values of the attribute
        named by ``args.carrier_summary`` across all subjects, and
        ``self.column_counters`` maps each of those values to the set of
        subject keys that have it.  A ``None`` entry starts out as an empty
        set and is replaced if ``None`` is itself one of the observed values.
        """
        subjects = get_subjects(args)
        self.carrier_summary = args.carrier_summary

        observed_values = set(getattr(subject, self.carrier_summary)
                              for subject in subjects.values())
        self.column_types = list(observed_values)

        self.column_counters = {None: set()}
        for column_value in self.column_types:
            members = set()
            for key, subject in subjects.items():
                if getattr(subject, self.carrier_summary) == column_value:
                    members.add(key)
            self.column_counters[column_value] = members
示例#7
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygotes.

    Heterozygous sites are gathered per sample and gene while iterating the
    query results.  Each time the row's gene changes, the gathered sites go
    through ``find_valid_het_pairs`` and ``filter_candidates`` and the
    collection starts over; whatever remains after the last row is processed
    the same way.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    # run the query applying any genotype filters provided by the user.
    gq.run(create_query(args))

    # sample -> gene -> list of het sites for the gene being collected
    pending = collections.defaultdict(lambda: collections.defaultdict(list))
    gene_of_previous_row = None
    running_count = 0

    # output header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    for row in gq:
        gt_types, gt_bases, gt_phases = (row['gt_types'], row['gts'],
                                         row['gt_phases'])
        gene_of_this_row = row['gene']

        gene_changed = (gene_of_previous_row is not None and
                        gene_of_this_row != gene_of_previous_row)
        if gene_changed:
            # finish the previous gene before collecting the next one
            running_count = filter_candidates(
                args, find_valid_het_pairs(args, pending),
                subjects_dict, running_count)
            pending = collections.defaultdict(
                lambda: collections.defaultdict(list))

        site = Site(row)
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]
                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]

                # keep the site when it is phased or phasing is ignored
                if sample_site.phased or args.ignore_phasing:
                    sample_site.gt = gt_bases[idx]
                    pending[sample][site.row['gene']].append(sample_site)

        gene_of_previous_row = gene_of_this_row

    # whatever is still pending belongs to the final gene
    running_count = filter_candidates(
        args, find_valid_het_pairs(args, pending),
        subjects_dict, running_count)
示例#8
0
def amend_sample(args):
    """Update rows of the ``samples`` table from the file ``args.sample``.

    For every loaded subject whose key also appears in the loaded PED data,
    an ``update samples set ... where sample_id=...`` statement is built
    from the PED field names and the quoted PED values, and executed inside
    a single ``database_transaction``.
    """
    loaded_subjects = get_subjects(args)
    ped_dict = load_ped_file(args.sample)
    header = get_ped_fields(args.sample)
    update_template = "update samples set {0} where sample_id={1}"

    with database_transaction(args.db) as c:
        for subject_key, subject in loaded_subjects.items():
            # subjects missing from the PED data are left untouched
            if subject_key not in ped_dict:
                continue

            quoted_values = map(quote_string, ped_dict[subject_key])
            assignments = []
            for column, value in zip(header, quoted_values):
                assignments.append(str(column) + "=" + str(value))

            statement = update_template.format(",".join(assignments),
                                               subject.sample_id)
            c.execute(statement)
示例#9
0
    def __init__(self, args):
        """Group subject keys by their value of the carrier-summary attribute.

        ``self.column_types`` lists the distinct non-``None`` values of the
        attribute named by ``args.carrier_summary``; ``None`` is treated as
        unknown.  ``self.column_counters`` maps each listed value to the set
        of subject keys carrying it, plus an empty set under ``None``.
        """
        subjects = get_subjects(args)
        self.carrier_summary = args.carrier_summary

        # every distinct value found in the column, minus the unknowns
        distinct_values = set(getattr(subject, self.carrier_summary)
                              for subject in subjects.values())
        self.column_types = [value for value in distinct_values
                             if value is not None]

        self.column_counters = {None: set()}
        for column_value in self.column_types:
            carriers = set()
            for key, subject in subjects.items():
                if getattr(subject, self.carrier_summary) == column_value:
                    carriers.add(key)
            self.column_counters[column_value] = carriers
示例#10
0
    def __init__(self, args):
        """Index subjects by the attribute selected with ``args.carrier_summary``.

        After construction ``self.column_types`` contains each distinct
        value of that attribute except ``None`` (which stands for unknown),
        and ``self.column_counters`` holds, per value, the set of subject
        keys having it.  The ``None`` key is present with an empty set.
        """
        subjects = get_subjects(args)
        self.carrier_summary = args.carrier_summary

        seen_values = set(getattr(entry, self.carrier_summary)
                          for entry in subjects.values())
        known_values = []
        for value in seen_values:
            # None means "unknown" and gets no column of its own
            if value is not None:
                known_values.append(value)
        self.column_types = known_values

        self.column_counters = {None: set()}
        for column_value in self.column_types:
            self.column_counters[column_value] = set(
                key for key, entry in subjects.items()
                if getattr(entry, self.carrier_summary) == column_value)
示例#11
0
def amend_sample(args):
    """Rewrite ``samples`` rows using the PED data in ``args.sample``.

    Each loaded subject that has an entry in the PED data gets one
    ``update samples`` statement: the PED field names are paired with the
    quoted PED values to form the ``set`` clause, and the row is selected by
    the subject's ``sample_id``.  All statements run in one
    ``database_transaction``.
    """
    loaded_subjects = get_subjects(args)
    ped_dict = load_ped_file(args.sample)
    header = get_ped_fields(args.sample)

    with database_transaction(args.db) as c:
        for name, subject in loaded_subjects.items():
            if name in ped_dict:
                quoted = map(quote_string, ped_dict[name])
                set_clause = ",".join(
                    str(field) + "=" + str(value)
                    for field, value in zip(header, quoted))
                c.execute(
                    "update samples set {0} where sample_id={1}".format(
                        set_clause, subject.sample_id))
示例#12
0
    def get_compound_hets(self):
        """
        Report candidate compound heterozygotes.

        Query rows are grouped by consecutive ``gene`` value.  For each
        group, the heterozygous sites of every sample are collected (phased
        ones only, unless ``args.ignore_phasing`` is set) and handed to
        ``find_valid_het_pairs`` and then ``filter_candidates`` together
        with the per-family genotype labels and columns.
        """
        args = self.args
        gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
        idx_to_sample = gq.idx_to_sample
        self.subjects_dict = subjects.get_subjects(args)

        # run the query applying any genotype filters provided by the user.
        gq.run(self.create_query())

        # genotype labels / columns of each family, keyed by family id
        family_gt_labels = {}
        family_gt_cols = {}
        for fam in subjects.get_families(args.db, args.families):
            family_gt_labels[fam.family_id] = fam.get_genotype_labels()
            family_gt_cols[fam.family_id] = fam.get_genotype_columns()

        # output header
        print(self.get_header(gq.header, is_comp_het=True))

        for _gene, gene_rows in groupby(gq, itemgetter("gene")):
            # sample -> gene -> list of het sites, rebuilt for every group
            hets = collections.defaultdict(
                lambda: collections.defaultdict(list))

            for row in gene_rows:
                gt_types = row["gt_types"]
                gt_bases = row["gts"]
                gt_phases = row["gt_phases"]
                site = Site(row)

                for idx, gt_type in enumerate(gt_types):
                    if gt_type == HET:
                        sample = idx_to_sample[idx]
                        sample_site = copy(site)
                        sample_site.phased = gt_phases[idx]

                        # unphased sites only count when phasing is ignored
                        if sample_site.phased or args.ignore_phasing:
                            sample_site.gt = gt_bases[idx]
                            hets[sample][site.row["gene"]].append(sample_site)

            # process the sites gathered for this group of rows
            het_pairs = self.find_valid_het_pairs(hets)
            self.filter_candidates(het_pairs, family_gt_labels, family_gt_cols)
示例#13
0
def run_query(args):
    """
    Run the user's query and print the resulting rows.

    The header is printed first when ``args.use_header`` is set and a header
    exists.  Without ``args.dgidb`` the rows are printed directly.  With
    ``args.dgidb`` the rows are first consumed to collect their genes, the
    genes are passed to ``query_dgidb``, and the query is executed again so
    each row can be printed followed by a tab and its DGIdb entry.

    The second execution is set up exactly like the first.  The original
    code handed ``kwargs`` (the bcolz ``variant_id_getter``) to ``run()`` on
    the rerun but to the ``GeminiQuery`` constructor on the first run.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        subjects = []
    kwargs = {}
    if args.bcolz:
        import gemini_bcolz
        kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _new_result_set():
        # Create a GeminiQuery and execute the user's query on it.
        runner = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                         **kwargs)
        runner.run(args.query, args.gt_filter, args.show_variant_samples,
                   args.sample_delim, predicates, genotypes_needed,
                   gene_needed, args.show_families, subjects=subjects)
        return runner

    gq = _new_result_set()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
    else:
        # collect a list of all the genes that need to be queried
        # from DGIdb
        genes = defaultdict()
        for row in gq:
            genes[row['gene']] = True

        # collect info from DGIdb
        dgidb_info = query_dgidb(genes)

        # rerun the query (the cursor is now consumed)
        gq = _new_result_set()

        # report the query results with DGIdb info added at the end.
        for row in gq:
            print(str(row) + "\t" + str(dgidb_info[row['gene']]))
示例#14
0
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    The query returns mated 'complex' structural variant rows ordered by
    ``sv_event_id``.  Consecutive rows with the same event id are gathered
    into one group; a group is passed to ``report_fusion`` when it has
    exactly two rows and both have the same ``sv_strand``.

    Every row is added to its event's group, including the first row of each
    new event, and the final group is evaluated after the loop.  The earlier
    logic discarded the row that triggered an event change and never looked
    at the last event.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    ##### COLBY, change "WHERE is_somatic is NULL"  to "WHERE is_somatic = 1" 
    query = """SELECT chrom, start, end, 
                      is_somatic, somatic_score,
                      type, sub_type, gene, 
                      sv_strand, sv_length,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type
               FROM variants
               WHERE is_somatic is NULL    
               AND   sv_mate_id is not NULL
               AND   type = 'sv'
               AND   sub_type = 'complex'
               AND   gene is not NULL
               ORDER BY sv_event_id
            """

    def _report_if_candidate(events):
        # did both ends of the sv meet all the query criteria
        # and are both ends on the same strand?
        if len(events) == 2 and \
                events[0]['sv_strand'] == events[1]['sv_strand']:
            report_fusion(events)

    prev = None
    events = []
    gq.run(query)
    for row in gq:
        curr = row['sv_event_id']
        # the SV event id changed: we are done with the previous candidate
        if curr != prev and prev is not None:
            _report_if_candidate(events)
            events = []
        # the current row always belongs to the event now being collected
        events.append(row)
        prev = curr

    # the last event has no following row to trigger its evaluation
    _report_if_candidate(events)
示例#15
0
def all_samples_predicate(args):
    """Return the predicate built by ``select_subjects_predicate`` over all
    subjects.

    Per the original description, the predicate returns True if, for a
    variant, the only samples that have the variant have a given phenotype.
    """
    return select_subjects_predicate(get_subjects(args).values(), args)
示例#16
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every site at which the sample is
    heterozygous.  Samples that are not affected are skipped when
    ``args.only_affected`` is set, and unphased genotypes are skipped unless
    ``args.ignore_phasing`` is set.

    Step 2 visits each unordered pair of sites collected for a sample/gene
    and prints both rows of the pair when the alternate alleles are on
    different haplotypes, or unconditionally when phasing is ignored.  Pairs
    involving a site with multiple alternate alleles are skipped with a
    warning on stderr when phasing is considered.

    Compared with the previous revision: the duplicate ``row['gts']`` lookup
    is gone, the genotype is no longer split when phasing is ignored (the
    result was never read), and the report condition is simplified to its
    equivalent form.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    if args.columns is not None:
        custom_columns = _add_necessary_columns(args, str(args.columns))
        query = "SELECT " + custom_columns + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " AND " + args.filter

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # sample -> gene -> list of candidate het sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']

        site = Site(row)

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]

                if args.only_affected and not subjects_dict[sample].affected:
                    continue

                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]

                # require phased genotypes
                if not sample_site.phased and not args.ignore_phasing:
                    continue

                sample_site.gt = gt_bases[idx]
                # add the site to the list of candidates
                # for this sample/gene
                comp_hets[sample][site.row['gene']].append(sample_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # step 2.  now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    comp_het_id = 1
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]

            # we only care about combinations, not permutations
            # (e.g. only need site1,site2, not site1,site2 _and site2,site1)
            # so each site is paired only with the sites listed after it.
            for idx, site1 in enumerate(candidates):
                for site2 in candidates[idx + 1:]:

                    # it is only a true compound heterozygote IFF
                    # the alternates are on opposite haplotypes.
                    if not args.ignore_phasing:
                        # expand the phased genotypes for this sample
                        # at each site into their composite
                        # alleles.  e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        if "," in str(site1.row['alt']) or \
                           "," in str(site2.row['alt']):
                            sys.stderr.write("WARNING: Skipping candidate for sample"
                                             " %s b/c variants with mult. alt."
                                             " alleles are not yet supported. The sites are:"
                                             " %s and %s.\n" % (sample, site1, site2))
                            continue

                        # the haplotype on which the alternate allele was
                        # observed for this sample at each candidate site.
                        # e.g., if ALT=G and alleles_site1=['A', 'G']
                        # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])

                    # report if the user doesn't care about phasing, or the
                    # alt alleles are on different haplotypes.  The haplotype
                    # indexes only exist when phasing is considered, so the
                    # ignore_phasing test must come first.
                    if args.ignore_phasing or alt_hap_1 != alt_hap_2:
                        print("\t".join([str(subjects_dict[sample].family_id),
                                         sample,
                                         str(comp_het_id),
                                         str(site1.row)]))
                        print("\t".join([str(subjects_dict[sample].family_id),
                                         sample,
                                         str(comp_het_id),
                                         str(site2.row)]))

                    # the id advances for every evaluated pair, reported or not
                    comp_het_id += 1
示例#17
0
 def __init__(self, args):
     """Populate ``self.samples`` for the database given by ``args.db``.

     The list holds ``gq.idx_to_sample_object[i]`` for ``i`` running from 0
     up to (but not including) the number of subjects.
     """
     gq = GeminiQuery(args.db)
     subjects = get_subjects(args)
     # get samples in order of genotypes
     self.samples = []
     sample_index = 0
     sample_total = len(subjects)
     collected = []
     while sample_index < sample_total:
         collected.append(gq.idx_to_sample_object[sample_index])
         sample_index += 1
     self.samples = collected
示例#18
0
def all_samples_predicate(args):
    """Build a predicate over every subject returned by ``get_subjects``.

    The subjects' values are handed to ``select_subjects_predicate``; per the
    original description the resulting predicate returns True if, for a
    variant, the only samples that have the variant have a given phenotype.
    """
    every_subject = get_subjects(args).values()
    predicate = select_subjects_predicate(every_subject, args)
    return predicate
示例#19
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every site at which the sample is
    heterozygous.  Samples that are not affected are skipped when
    ``args.only_affected`` is set, and unphased genotypes are skipped unless
    ``args.ignore_phasing`` is set.

    Step 2 pairs the sites collected for each sample/gene and prints both
    rows of a pair when the alternate alleles are on different haplotypes,
    or unconditionally when phasing is ignored.

    Each pair is visited once.  The earlier nested loop visited every pair
    in both orders, so the same compound het was printed twice under two
    different ids.  The haplotype lookup is also skipped when phasing is
    ignored, because its result is not used there and ``list.index`` could
    raise for a genotype that does not contain the ALT string.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(gq.c)

    if args.columns is not None:
        custom_columns = _add_necessary_columns(args, str(args.columns))
        query = "SELECT " + custom_columns + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " AND " + args.filter

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # sample -> gene -> list of candidate het sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']

        site = Site(row)

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]

                if args.only_affected and not subjects_dict[sample].affected:
                    continue

                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]

                # require phased genotypes
                if not sample_site.phased and not args.ignore_phasing:
                    continue

                sample_site.gt = gt_bases[idx]
                # add the site to the list of candidates
                # for this sample/gene
                comp_hets[sample][site.row['gene']].append(sample_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # step 2.  now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    comp_het_id = 1
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]

            # pair each site only with the sites listed after it, so a
            # comp_het is never reported again in the opposite order.
            for idx, site1 in enumerate(candidates):
                for site2 in candidates[idx + 1:]:
                    if site1 == site2:
                        continue

                    if not args.ignore_phasing:
                        # expand the phased genotypes for this sample
                        # at each site into their composite
                        # alleles.  e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        # the haplotype on which the alternate allele was
                        # observed for this sample at each candidate site.
                        # e.g., if ALT=G and alleles_site1=['A', 'G']
                        # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])

                    # it is only a true compound heterozygote IFF
                    # the alternates are on opposite haplotypes, unless the
                    # user chose to ignore phasing.  The haplotype indexes
                    # only exist when phasing is considered, so the
                    # ignore_phasing test must come first.
                    if args.ignore_phasing or alt_hap_1 != alt_hap_2:
                        print("\t".join([str(subjects_dict[sample].family_id),
                                         sample,
                                         str(comp_het_id),
                                         str(site1.row)]))
                        print("\t".join([str(subjects_dict[sample].family_id),
                                         sample,
                                         str(comp_het_id),
                                         str(site2.row)]))

                    # the id advances for every evaluated pair, reported or not
                    comp_het_id += 1
示例#20
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygous mutations.

    The work is done in two passes:

      1. Query the variants table and, for every sample, collect the
         sites at which that sample is heterozygous, grouped by gene.
      2. For every sample/gene, look at each unordered pair of candidate
         sites and print the pair (one line per site, both lines sharing
         a comp_het_id) when the alternate alleles sit on opposite
         haplotypes -- or unconditionally when args.ignore_phasing is set.

    Attributes read from ``args``: db, columns, filter, only_affected and
    ignore_phasing.

    Results are written to stdout as tab-delimited text preceded by a
    header line.  Pairs involving a site with multiple alternate alleles
    are skipped with a warning on stderr when phasing is considered.
    Nothing is returned.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(gq.c)

    if args.columns is not None:
        custom_columns = _add_necessary_columns(args, str(args.columns))
        query = "SELECT " + custom_columns + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants " + \
                " WHERE (is_exonic = 1 or impact_severity != 'LOW') "

    # add any non-genotype column limits to the where clause.
    # The user's expression is parenthesized because AND binds tighter
    # than OR in SQL: without the parentheses a filter such as
    # "a = 1 OR b = 2" would bypass the exonic / impact restriction above.
    if args.filter:
        query += " AND (" + args.filter + ")"

    gq.run(query)

    # step 1.  comp_hets[sample][gene] -> list of per-sample Site copies
    # at which the sample is heterozygous.
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    for row in gq:
        gt_types = row['gt_types']
        gt_bases = row['gts']
        gt_phases = row['gt_phases']

        site = Site(row)

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type != HET:
                continue

            sample = idx_to_sample[idx]

            if args.only_affected and not subjects_dict[sample].affected:
                continue

            # each sample gets its own shallow copy so that the phase
            # and genotype recorded below are specific to that sample.
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]

            # require phased genotypes
            if not sample_site.phased and not args.ignore_phasing:
                continue

            sample_site.gt = gt_bases[idx]
            # add the site to the list of candidates
            # for this sample/gene
            comp_hets[sample][site.row['gene']].append(sample_site)

    # header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))

    # step 2.  now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    comp_het_id = 1
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]

            # we only care about combinations, not permutations
            # (e.g. only need site1,site2, not site1,site2 _and_
            # site2,site1), so each unordered pair is visited exactly
            # once: N * (N - 1) / 2 pairs for N candidate sites.
            for idx, site1 in enumerate(candidates):
                for site2 in candidates[idx + 1:]:

                    if args.ignore_phasing:
                        # the user doesn't care about phasing, so every
                        # pair of het. sites in the gene is reported.
                        report_pair = True
                    else:
                        if "," in str(site1.row['alt']) or \
                           "," in str(site2.row['alt']):
                            sys.stderr.write("WARNING: Skipping candidate for sample"
                                             " %s b/c variants with mult. alt."
                                             " alleles are not yet supported. The sites are:"
                                             " %s and %s.\n" % (sample, site1, site2))
                            # NOTE: a skipped pair does not consume a
                            # comp_het_id.
                            continue

                        # expand the (phased) genotypes for this sample
                        # at each site into their composite alleles.
                        # e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        # find the haplotype on which the alternate
                        # allele was observed for this sample at each
                        # candidate het. site.
                        # e.g., if ALT=G and alleles_site1=['A', 'G']
                        # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                        alt_hap_1 = alleles_site1.index(site1.row['alt'])
                        alt_hap_2 = alleles_site2.index(site2.row['alt'])

                        # it is only a true compound heterozygote IFF
                        # the alternates are on opposite haplotypes.
                        report_pair = alt_hap_1 != alt_hap_2

                    if report_pair:
                        family_id = str(subjects_dict[sample].family_id)
                        for reported_site in (site1, site2):
                            print("\t".join([family_id,
                                             sample,
                                             str(comp_het_id),
                                             str(reported_site.row)]))

                    # the id advances for every pair examined, whether
                    # or not it was reported, so reported ids may have
                    # gaps.
                    comp_het_id += 1