Пример #1
0
def MatchVcfPed(vcfs, peds):
    """Pair each VCF path with the pedigree path that shares its base name.

    Base names are computed with ``utils.GetBaseName``. Every match is
    echoed to stdout as it is found, and the full list of pairs is printed
    once matching is finished.

    Args:
        vcfs: Iterable of variant-file paths.
        peds: Iterable of pedigree-file paths.

    Returns:
        A list of ``(vcf, ped)`` tuples in the order of ``vcfs``. A vcf is
        paired with the first entry of ``peds`` that has the same base
        name; vcfs without a matching pedigree are left out.
    """
    # Index the pedigrees once instead of rescanning them for every vcf.
    # setdefault keeps the first pedigree seen for a base name, which is
    # the same "first match wins" rule the nested scan with `break` gave.
    ped_by_base = {}
    for ped in peds:
        ped_by_base.setdefault(utils.GetBaseName(ped), ped)

    res = []
    for vcf in vcfs:
        base = utils.GetBaseName(vcf)
        if base in ped_by_base:
            # Both printed names are equal by construction of the match.
            print("%s %s" % (base, base))
            res.append((vcf, ped_by_base[base]))

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("Finded vcf-ped pairs:")
    for v, p in res:
        print("%s %s" % (v, p))
    print('\n')
    return res
Пример #2
0
def OutputSHMsForVGenes(shm_df, output_config, min_num_aligned=10):
    """Aggregate SHMs per gene and emit one plot and one text file per gene.

    Entries of ``shm_df`` are grouped by ``segment`` and by the base name
    of ``gene_name`` (via ``utils.GetBaseName``). For every group with at
    least ``min_num_aligned`` entries, the pooled SHMs are handed to
    ``OutputGeneSHMPlot``, the plot path is registered through
    ``output_config.AddSHMFileForSegment``, and the value returned by the
    plot step is passed to ``OutputGeneSHMsToTxt``.

    Args:
        shm_df: Container that yields keys exposing ``segment``,
            ``gene_name`` and ``gene_len`` when iterated, and returns an
            iterable of SHMs for ``shm_df[key]``.
        output_config: Object providing ``GetSHMDirBySegment``,
            ``AddSHMFileForSegment`` and ``Log``.
        min_num_aligned: Groups with fewer entries than this are skipped.
            Defaults to 10, the previously hard-coded threshold.
    """
    # segment -> gene base name -> per-gene statistics.
    # All statistics live under the (segment, gene) pair, so a gene name
    # that shows up under two segments no longer resets the length and
    # entry count already collected for the other segment.
    genes_by_segment = dict()
    for it in shm_df:
        gene_name = utils.GetBaseName(it.gene_name)
        segment_genes = genes_by_segment.setdefault(it.segment, dict())
        stats = segment_genes.setdefault(
            gene_name, {'shms': [], 'gene_len': 0, 'num_aligned': 0})
        stats['shms'].extend(shm_df[it])
        stats['gene_len'] = max(stats['gene_len'], it.gene_len)
        stats['num_aligned'] += 1

    for segment in genes_by_segment:
        segment_genes = genes_by_segment[segment]
        for gene_name in segment_genes:
            stats = segment_genes[gene_name]
            if stats['num_aligned'] < min_num_aligned:
                continue
            # The text output shares this path, with '.txt' appended.
            output_fname = os.path.join(
                output_config.GetSHMDirBySegment(segment), gene_name)
            nucl_pos_dict = OutputGeneSHMPlot(stats['shms'], gene_name,
                                              stats['gene_len'],
                                              stats['num_aligned'],
                                              output_fname,
                                              output_config.Log())
            output_config.AddSHMFileForSegment(segment, output_fname)
            OutputGeneSHMsToTxt(nucl_pos_dict, stats['num_aligned'],
                                output_fname + '.txt')
Пример #3
0
 def _CreateVJDicts(self):
     """Count V, J and (V, J) gene usage over the rows of ``self.vj_df``.

     Reads the ``'V_hit'`` and ``'J_hit'`` columns, reduces each value with
     ``utils.GetBaseName`` and fills:

     * ``self.vj_dict`` -- ``(base_v, base_j)`` -> number of rows
     * ``self.v_dict`` -- ``base_v`` -> number of rows
     * ``self.j_dict`` -- ``base_j`` -> number of rows
     * ``self.sorted_vs`` / ``self.sorted_js`` -- sorted keys of the
       V and J dictionaries
     """
     self.vj_dict = dict()
     self.v_dict = dict()
     self.j_dict = dict()
     # Walk the two columns in lockstep instead of indexing them with
     # 0..n-1: each column is fetched once, and the loop no longer relies
     # on the row labels being consecutive integers starting at zero.
     # The columns are only touched when there are rows, as before.
     if len(self.vj_df) > 0:
         hit_pairs = zip(self.vj_df['V_hit'], self.vj_df['J_hit'])
     else:
         hit_pairs = []
     for v_hit, j_hit in hit_pairs:
         base_v = utils.GetBaseName(v_hit)
         base_j = utils.GetBaseName(j_hit)
         vj_pair = (base_v, base_j)
         self.vj_dict[vj_pair] = self.vj_dict.get(vj_pair, 0) + 1
         self.v_dict[base_v] = self.v_dict.get(base_v, 0) + 1
         self.j_dict[base_j] = self.j_dict.get(base_j, 0) + 1
     self.sorted_vs = sorted(self.v_dict)
     self.sorted_js = sorted(self.j_dict)
Пример #4
0
def OutputVJGenesMutability(shm_df, output_config):
    """Collect per-gene mutability values and pass them on for output.

    For every key of ``shm_df`` the mutability is the number of items in
    ``shm_df[key]`` divided by ``key.gene_len``. Values are grouped by the
    base name of ``key.gene_name``; keys for which ``is_variable()`` is
    true go to the V group, all others to the J group. Each group is then
    handed to ``OutputGeneMutability`` together with the matching
    ``output_config`` attribute, a one-letter label and the logger.

    Args:
        shm_df: Container that yields keys exposing ``is_variable()``,
            ``gene_name`` and ``gene_len`` when iterated, and returns a
            sized collection for ``shm_df[key]``.
        output_config: Object providing ``v_mutability``, ``j_mutability``
            and ``Log()``.
    """
    mutability_by_v_gene = {}
    mutability_by_j_gene = {}
    for entry in shm_df:
        if entry.is_variable():
            target = mutability_by_v_gene
        else:
            target = mutability_by_j_gene
        base_name = utils.GetBaseName(entry.gene_name)
        gene_values = target.setdefault(base_name, [])
        # float() keeps this a true division under Python 2 as well.
        gene_values.append(float(len(shm_df[entry])) / entry.gene_len)

    OutputGeneMutability(mutability_by_v_gene, output_config.v_mutability,
                         'V', output_config.Log())
    OutputGeneMutability(mutability_by_j_gene, output_config.j_mutability,
                         'J', output_config.Log())
Пример #5
0
def _SplitVcfByModel(vcf, Ped, out_names):
    """Stream one variant table into the four per-model output files.

    Args:
        vcf: Path of the tab-separated input table. Its header must contain
            the columns 'Annotation', 'GeneticModels' and 'FORMAT'.
        Ped: Pedigree object forwarded to the selection helpers.
        out_names: Output paths for the AR, AD, XL and CH files, in that
            order.

    Raises:
        ValueError: If one of the required header columns is missing.
    """
    titles = ('Autsomal Recessive Variants\n',
              'Autsomal Dominant Variants\n',
              'X-linked Variants\n',
              'Compound Heterozygote\n')
    outputs = []
    fin = None
    try:
        # NOTE: binary modes are kept from the original (Python 2) code;
        # under Python 3 writing str requires text mode instead.
        for name, title in zip(out_names, titles):
            handle = open(name, 'wb')
            outputs.append(handle)
            handle.write(title)
        fout_AR, fout_AD, fout_XL, fout_CH = outputs

        fin = open(vcf, 'rb')
        header = fin.readline()
        for handle in outputs:
            handle.write(header)
        headerList = header.strip().split('\t')
        RecordLen = len(headerList)
        Gene_idx = headerList.index('Annotation')
        Model_idx = headerList.index('GeneticModels')
        Format_idx = headerList.index('FORMAT')

        # Consecutive records with the same 'Annotation' value are buffered
        # and examined together by LookCompoundHeter.
        CH_buffer = []
        LastGene = None
        for l in fin:
            llist = l.strip().split('\t')
            Gene = llist[Gene_idx]
            # Only the part after the last ':' of the column is inspected.
            Model = llist[Model_idx].split(':')[-1]
            if 'AR' in Model and selectAR(llist, headerList, Ped, Model,
                                          Format_idx):
                fout_AR.write(l)
            if 'AD' in Model and selectAD_DN(llist, headerList, Ped, Model,
                                             Format_idx):
                fout_AD.write(l)
            if 'X' in Model:
                fout_XL.write(l)

            if Gene != LastGene:
                # A new gene starts: flush the finished group first.
                if LastGene is not None:
                    LookCompoundHeter(CH_buffer, fout_CH, RecordLen,
                                      Format_idx, headerList, Ped)
                CH_buffer = [l]
                LastGene = Gene
            else:
                CH_buffer.append(l)
        # Flush the last group (called even when the table has no records,
        # as before).
        LookCompoundHeter(CH_buffer, fout_CH, RecordLen, Format_idx,
                          headerList, Ped)
    finally:
        # Close every handle that was opened, including the input, even
        # when parsing or one of the selection helpers raises.
        if fin is not None:
            fin.close()
        for handle in outputs:
            handle.close()


def SplitInheritencePattern(InpDir, VCF, PED):
    """Split every matched variant table in a directory by inheritance model.

    The '.tsv' and '.ped' files found in ``InpDir`` are paired with
    ``MatchVcfPed``. For each pair, four files named
    ``<base>_AR.tsv``, ``<base>_AD.tsv``, ``<base>_XL.tsv`` and
    ``<base>_CH.tsv`` are written to the current working directory, then
    moved into ``InpDir/<base>`` together with copies of the input table
    and the pedigree file. ``<base>`` is ``utils.GetBaseName`` of the table.

    Args:
        InpDir: Directory searched for input files; result directories are
            created inside it.
        VCF: Currently unused; kept for interface compatibility.
        PED: Currently unused; kept for interface compatibility.
    """
    vcfs = utils.get_files(InpDir, '.tsv')
    peds = utils.get_files(InpDir, '.ped')
    for vcf, ped in MatchVcfPed(vcfs, peds):
        Ped = Pedigree(ped)
        print("Processing vcf: %s\tpedigree: %s" % (vcf, ped))
        base = utils.GetBaseName(vcf)
        out_names = [base + suffix
                     for suffix in ('_AR.tsv', '_AD.tsv', '_XL.tsv',
                                    '_CH.tsv')]
        _SplitVcfByModel(vcf, Ped, out_names)

        new_dir = os.path.join(InpDir, base)
        print(new_dir)
        if not os.path.exists(new_dir):
            os.mkdir(new_dir)
        shutil.copy(vcf, new_dir)
        shutil.copy(ped, new_dir)
        for dest in out_names:
            # Remove a result left by an earlier run so the move below
            # does not hit an existing file.
            stale = os.path.join(new_dir, dest)
            if os.path.exists(stale):
                os.remove(stale)
            shutil.move(dest, new_dir)