示例#1
0
def group_clone_VJ_cdr3(dico_same_VJ, dicoSeq, Clone_threshold):
    """Split every VJ group into sub-groups of sequences with similar CDR3.

    Sequences of a VJ group are processed in order. Each sequence is compared
    with the CDR3 strings that already key a sub-group of that VJ group:

    * equal lengths: similarity is ``1 - hamming / len``;
    * different lengths: only compared when the gene name stored at index 1
      of the sequence record (text before ``*``) ends with ``"6"``
      (presumably the J6 gene, confirm with the caller); similarity is then
      ``1 - levenshtein / max(len)``.

    Keys whose similarity reaches ``Clone_threshold`` are candidates. If there
    is none, the sequence goes into the sub-group keyed by its own CDR3.
    Otherwise it joins the candidate with the highest global protein
    alignment score (first candidate wins ties).

    Args:
        dico_same_VJ: mapping ``VJ_ID -> iterable of sequence identifiers``.
            Identifiers are right-stripped before being looked up in
            ``dicoSeq`` but are stored in the result as given.
        dicoSeq: mapping ``identifier -> record`` where ``record[2]`` is the
            CDR3 string and ``record[1]`` a gene name of the form
            ``"<gene>*<allele>"``.
        Clone_threshold: minimum similarity for two CDR3 to be considered
            part of the same sub-group.

    Returns:
        dict: ``{VJ_ID: {representative CDR3: [sequence identifiers]}}``.
    """
    VJ_ID_diff_CDR3 = {}

    for VJ_ID, members in dico_same_VJ.items():
        groups = {}
        VJ_ID_diff_CDR3[VJ_ID] = groups

        for seq in members:
            record = dicoSeq[seq.rstrip()]
            CDR3_seq = record[2]

            # Representatives similar enough to this CDR3. Each distance is
            # computed once, and the Hamming distance only ever sees strings
            # of equal length.
            candidates = []
            for representative in groups:
                if len(CDR3_seq) == len(representative):
                    distance = hamming_distance(CDR3_seq, representative)
                    similarity = 1 - distance / float(len(representative))
                elif record[1].split("*")[0].endswith("6"):
                    longest = max(len(CDR3_seq), len(representative))
                    distance = levenshtein_distance(CDR3_seq, representative)
                    similarity = 1 - distance / float(longest)
                else:
                    continue
                if similarity >= Clone_threshold:
                    candidates.append(representative)

            if not candidates:
                # New sub-group; setdefault avoids discarding a bucket that
                # already exists under the same CDR3 (possible when the
                # threshold is above 1).
                groups.setdefault(CDR3_seq, []).append(seq)
                continue

            # Several representatives may qualify: keep the one that aligns
            # best with the query (gap open penalty of 25).
            query = skbio.Protein(CDR3_seq, metadata={'id': "CDR3_seq"})
            alignment_scores = {}
            for representative in candidates:
                target = skbio.Protein(representative, metadata={'id': "key"})
                result = skbio.alignment.global_pairwise_align_protein(
                    query, target, 25)
                alignment_scores[representative] = float(result[1])
            best_match = max(alignment_scores, key=alignment_scores.get)
            groups[best_match].append(seq)

    return VJ_ID_diff_CDR3
示例#2
0
def trim(OGid):
    """Trim the alignment identified by ``OGid`` and write it as Stockholm.

    The alignment is read from ``../align_fastas1/out/{OGid}.mfa`` or, when
    that file is missing, from ``../align_fastas2-2/out/{OGid}.mfa``. The
    results of ``trim_conserved`` and ``trim_insertions`` (parametrised by the
    module-level ``tp`` and ``matrix``) are merged, columns that end up
    containing only gaps get their original symbols back, and the result is
    written to ``out/{OGid}.sto`` with an ``RF`` positional annotation.
    """

    def gap_profile(records):
        # Boolean (sequence x column) matrix of gap symbols and the number of
        # gaps in every column; `records` holds (header, symbols) pairs.
        is_gap = np.zeros((len(records), len(records[0][1])), dtype=bool)
        for row_idx, (_, symbols) in enumerate(records):
            for col_idx, symbol in enumerate(symbols):
                if symbol == '-':
                    is_gap[row_idx, col_idx] = True
        return is_gap, is_gap.sum(axis=0)

    # 0 Load MSA
    try:
        records = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        records = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # 1 Calculate shared variables
    gaps_array, scores = gap_profile(records)
    original = skbio.TabularMSA([
        skbio.Protein(seq, metadata={'description': header})
        for header, seq in records
    ])

    # 2 Get trims (segments and columns)
    conserved_syms = trim_conserved(
        original, scores, gaps_array,
        tp['con_frac'], tp['con_window'], tp['con_minlen'], tp['con_rate'], tp['con_minsig'],
    )
    insertion_syms, _ = trim_insertions(
        original, scores, gaps_array,
        tp['gap_num'], tp['gap_rate'], tp['gap_minsig'],
        tp['nongap_frac'], tp['nongap_minlen'],
        tp['gp_sigma'], tp['gd_window'], tp['indel1_rate'], tp['indel2_rate'],
        tp['weights'], tp['threshold'],
        matrix,
    )

    # 3 Combine trims (segments and columns) to yield final alignment
    trimmed = []
    for record, kept1, kept2 in zip(original, conserved_syms, insertion_syms):
        # The two versions only disagree where one of them was turned into a gap
        merged = [a if a == b else '-' for a, b in zip(kept1, kept2)]
        trimmed.append((record.metadata['description'], merged))

    # 4 Restore gap only columns
    _, gap_counts = gap_profile(trimmed)
    width = len(trimmed[0][1])
    rf = ['x'] * width  # Metadata for marking consensus columns in profile HMM
    labels = ndimage.label(gap_counts == len(trimmed))[0]
    for (region,) in ndimage.find_objects(labels):
        rf[region] = ['.'] * (region.stop - region.start)
        for row_idx, (_, symbols) in enumerate(trimmed):
            symbols[region] = list(str(original[row_idx, region]))

    # 5 Write to file
    final = skbio.TabularMSA(
        [skbio.Protein(''.join(symbols), metadata={'description': header})
         for header, symbols in trimmed],
        positional_metadata={'RF': rf},
    )
    final.write(f'out/{OGid}.sto', 'stockholm')
示例#3
0
def _series_to_fasta_format(ff, data, sequence_type="DNA"):
    with ff.open() as f:
        for id_, seq in data.iteritems():
            if sequence_type == "protein":
                sequence = skbio.Protein(seq, metadata={'id': id_})
            elif sequence_type == "DNA":
                sequence = skbio.DNA(seq, metadata={'id': id_})
            elif sequence_type == "RNA":
                sequence = skbio.RNA(seq, metadata={'id': id_})
            else:
                raise NotImplementedError(
                    "pd.Series can only be converted to DNA or "
                    "protein FASTA format.")
            skbio.io.write(sequence, format='fasta', into=f)
示例#4
0
# Build one row per segment returned by get_segments for every region of
# columns whose count of non-gap symbols is at most tp['gap_num'].
rows = []
for OGid in OGids:
    # The alignment is in one of two directories; try the first, then the second
    try:
        msa = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # Boolean (sequence x column) matrix flagging gap symbols
    gaps_array = np.zeros((len(msa), len(msa[0][1])), dtype=bool)
    for i, (_, seq) in enumerate(msa):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)  # Number of gaps per column
    msa = skbio.TabularMSA([skbio.Protein(seq, metadata={'description': header}) for header, seq in msa])

    # Label contiguous runs of qualifying columns and take their slices
    mask = ndimage.label(len(msa) - scores <= tp['gap_num'])[0]
    regions = [found[0] for found in ndimage.find_objects(mask)]
    for region in regions:
        for segment in get_segments(msa, region, matrix):
            row = dict(
                OGid=OGid,
                start=segment['region'].start,
                stop=segment['region'].stop,
                index=segment['index'],
                length=sum(s.stop - s.start for s in segment['slices']),
            )
            rows.append(row)