def alignSequences(ref_seq, query_seq, mis_allow):
    """Find the reference (minus its last 6 bases) in the query on either strand.

    The trailing 6 characters of ``ref_seq`` are dropped before aligning (the
    original note describes this as removing the PAM site). The trimmed
    reference is locally aligned against ``query_seq`` and against
    ``reverseComplement(query_seq)`` with gap penalties of -100.

    A strand is accepted only if its match count is strictly greater than the
    other strand's AND at least ``len(trimmed reference) - mis_allow``.

    Returns a 6-item list ``[sequence, mismatches, length, strand, start, end]``
    where ``start``/``end`` are the aligned query bounds padded out to the
    untrimmed reference length, or six empty strings when no strand qualifies
    (ties between strands also yield the empty result).
    """
    core = ref_seq[:-6]
    full_length = len(core) + 6
    min_matches = len(core) - mis_allow

    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_penalty=-100,
        gap_extension_penalty=-100,
        prefer_gap_runs=True,
    )
    fwd = aligner.align(core, query_seq)
    rev = aligner.align(core, reverseComplement(query_seq))

    # Pick the strand with strictly more matches; a tie means no call.
    if fwd.matches > rev.matches:
        best, strand = fwd, "+"
    elif rev.matches > fwd.matches:
        best, strand = rev, "-"
    else:
        return ["", "", "", "", "", ""]

    if best.matches < min_matches:
        return ["", "", "", "", "", ""]

    # Extend the aligned window so it spans the whole (untrimmed) reference.
    start = best.q_pos - best.r_pos
    end = best.q_end + (full_length - best.r_end)
    return [best.query[start:end], full_length - best.matches - 6, end - start, strand, start, end]
Exemplo n.º 2
0
def _get_mhc_pep(none_tcr, fasta):
    """Sort the non-TCR chains of a FASTA file into peptides and MHC chains.

    Every record whose chain letter (second ``:``-separated field of the
    header) is in ``none_tcr`` is aligned against each entry of the
    module-level ``mhc_fastas`` mapping. Based on the best score:

    * best score below 50 and sequence length <= 40 -> peptide;
    * best score below 50 and longer -> ignored;
    * best reference named ``"B2M"`` -> second-chain list, and class 1;
    * reference name (before ``*``) containing both ``D`` and ``B`` ->
      second-chain list;
    * anything else -> first-chain list.

    :return: ``(peptides, mhcas, mhcbs, mhc_class)`` where ``mhc_class`` is 1
             when a B2M chain was seen and 2 otherwise.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))

    peptides, mhcas, mhcbs = [], [], []
    saw_b2m = False

    # Keep only the chains the caller flagged as not belonging to the TCR.
    proteins = {}
    with open(fasta) as f:
        for name, seq in _read_fasta(f):
            letter = name.split(":")[1]
            if letter in none_tcr:
                proteins[letter] = seq

    for chain, hit_protein in proteins.items():
        refs = list(mhc_fastas)
        scores = [aligner.align(mhc_fastas[ref], hit_protein).score for ref in refs]

        max_score = max(scores)
        ref_name = refs[scores.index(max_score)]

        # Low alignment score: only short chains are kept, as peptides.
        if max_score < 50:
            if len(hit_protein) <= 40:
                peptides.append(chain)
            continue

        if ref_name == "B2M":
            mhcbs.append(chain)
            saw_b2m = True
            continue

        gene = ref_name.split("*")[0]
        # "D" marks class II and "B" the beta chain.
        if "D" in gene and "B" in gene:
            mhcbs.append(chain)
        else:
            mhcas.append(chain)

    mhc_class = 1 if saw_b2m else 2
    return peptides, mhcas, mhcbs, mhc_class
 def clculatedelta(s,i,j):
     """Score the change from re-pairing neighbours around positions i and j.

     Four local alignments are computed (and dumped to stdout), in this order:
     ``s[i-1]`` vs ``s[j]``, ``s[i]`` vs ``s[j+1]``, ``s[i-1]`` vs ``s[i]`` and
     ``s[j]`` vs ``s[j+1]``.

     Returns ``(delt_f, dc)`` where ``delt_f`` is the sum of the first two
     scores minus the sum of the last two, and ``dc`` adds 1 for each of the
     last two scores above 30 and subtracts 1 for each of the first two
     scores above 30.
     """
     import swalign
     aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

     def _dump_and_score(left, right):
         aln = aligner.align(left, right)
         aln.dump()
         return aln.score

     new_left = _dump_and_score(s[i-1], s[j])
     new_right = _dump_and_score(s[i], s[j+1])
     old_left = _dump_and_score(s[i-1], s[i])
     old_right = _dump_and_score(s[j], s[j+1])

     delt_f = (new_left + new_right) - (old_left + old_right)

     dc = 0
     for score in (old_left, old_right):
         if score > 30:
             dc += 1
     for score in (new_left, new_right):
         if score > 30:
             dc -= 1

     return delt_f, dc
def alignSequences(ref_seq, query_seq):
    """Find ``ref_seq`` in ``query_seq`` on either strand, logging to stderr.

    The reference is locally aligned against the query and against
    ``reverseComplement(query_seq)`` with gap penalties of -100. Both input
    sequences and both match counts are written to stderr.

    A strand is accepted only if its match count is strictly greater than the
    other strand's AND at least ``len(ref_seq) - 20``.

    Returns ``[sequence, ref_length - matches - 1, length, strand, start, end]``
    with the query window padded out to the full reference length, or six
    empty strings when no strand qualifies (ties included).
    """
    sys.stderr.write(ref_seq + "\t" + query_seq + "\n")

    ref_length = len(ref_seq)
    min_matches = ref_length - 20
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_penalty=-100,
        gap_extension_penalty=-100,
        prefer_gap_runs=True,
    )
    fwd = aligner.align(ref_seq, query_seq)
    rev = aligner.align(ref_seq, reverseComplement(query_seq))
    sys.stderr.write("fwdmatch: " + str(fwd.matches) + "\n")
    sys.stderr.write("revmatch: " + str(rev.matches) + "\n")

    # The winning strand must have strictly more matches than the other.
    if fwd.matches > rev.matches:
        best, strand = fwd, "+"
    elif rev.matches > fwd.matches:
        best, strand = rev, "-"
    else:
        return ["", "", "", "", "", ""]

    if best.matches < min_matches:
        return ["", "", "", "", "", ""]

    # Pad the aligned window so it covers the whole reference.
    start = best.q_pos - best.r_pos
    end = best.q_end + (ref_length - best.r_end)
    return [best.query[start:end], ref_length - best.matches - 1, end - start, strand, start, end]
Exemplo n.º 5
0
def getAlign(ref_seq, query_seq):
    """Locally align ``query_seq`` to ``ref_seq`` and return the alignment.

    Uses a nucleotide scoring matrix of +1 per match and -5 per mismatch with
    the aligner's default gap settings.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -5))
    return aligner.align(ref_seq, query_seq)
Exemplo n.º 6
0
def do_swalign(seq1, seq2, match=2, mismatch=-1, gap_penalty=-2, gap_extension_decay=0.5):
    """
    Locally align seq2 against seq1 with swalign and return the alignment.

    The match/mismatch scores build the nucleotide scoring matrix; the gap
    penalty and gap extension decay are forwarded to the aligner.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(match, mismatch),
        gap_penalty=gap_penalty,
        gap_extension_decay=gap_extension_decay,
    )
    return aligner.align(seq1, seq2)
Exemplo n.º 7
0
def smith_water_align(seq1, seq2):
    """
    Run a Smith-Waterman local alignment of the two input sequences.

    Scoring comes from the module-level MATCH, MISMATCH, GAP_PENALTY and
    GAP_EXTEND_PENALTY settings.
    :param seq1: first sequence (passed to the aligner as the reference)
    :param seq2: second sequence (passed to the aligner as the query)
    :return: the alignment object produced by swalign (not just its score)
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(MATCH, MISMATCH),
        GAP_PENALTY,
        GAP_EXTEND_PENALTY,
    )
    return aligner.align(seq1, seq2)
Exemplo n.º 8
0
def align(s1, s2, out):
    """Return the identity of the local alignment of ``s2`` (query) to ``s1`` (ref).

    Scoring is +2 per match and -1 per mismatch with default gap settings.
    ``out`` is accepted for interface compatibility but is not used.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    return aligner.align(s1, s2).identity
Exemplo n.º 9
0
def swFactory():
    """Build the Smith-Waterman aligner used by this module.

    Scoring is +2 per match and -1 per mismatch; the gap penalty and gap
    extension penalty are both -1, gap extension decay is 0.0, and the
    aligner is configured as non-verbose, local (not global) and without
    full-query mode.
    """
    extra_options = {
        "gap_extension_decay": 0.0,
        "verbose": False,
        "globalalign": False,
        "full_query": False,
    }
    matrix = swalign.NucleotideScoringMatrix(2, -1)
    # Gap penalty and gap extension penalty are passed positionally.
    return swalign.LocalAlignment(matrix, -1, -1, **extra_options)
def init_pop_score(chromosomes):
    """Compute one fitness value per chromosome.

    For each chromosome, every pair of adjacent elements is locally aligned
    (+1 match, -3 mismatch); each alignment is dumped to stdout and the
    scores are summed. The list of sums is printed and returned.
    """
    pop_fitness = []
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    for chromosome in chromosomes:
        total = 0
        for left, right in zip(chromosome, chromosome[1:]):
            aln = aligner.align(left, right)
            aln.dump()
            total += aln.score
        pop_fitness.append(total)
    print(pop_fitness)
    return pop_fitness
Exemplo n.º 11
0
def sw_one(query,refseq):
    """Align ``query`` to ``refseq`` and return the midpoints of the hit.

    Scoring is +5 per match, -4 per mismatch, gap penalty -30 and gap
    extension penalty -1.

    Returns ``(middle_q, middle_r)``: the start of the aligned region on the
    query and on the reference respectively, each advanced by half of the
    aligned *query* span.
    """
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(5, -4),
        gap_penalty=-30,
        gap_extension_penalty=-1,
    )
    hit = aligner.align(refseq, query)

    # NOTE(review): the reference midpoint also uses the query span length,
    # not r_end - r_pos -- confirm this is intended when gaps are present.
    half_span = 0.5 * (hit.q_end - hit.q_pos)
    return hit.q_pos + half_span, hit.r_pos + half_span
Exemplo n.º 12
0
def assemble_seq(readid2seq, junc_seq, tmp_file_path):
    """Assemble reads with ``fml-asm`` and return the longest contig past the junction.

    The reads in ``readid2seq`` (read id -> sequence) are written, sorted by
    id, to ``<tmp_file_path>.tmp3.assemble_input.fa``. ``fml-asm`` is run on
    that file with its stdout captured in
    ``<tmp_file_path>.tmp3.assemble_output.fq``. Every sequence line of the
    output (line 2 of each 4-line record) is then aligned against
    ``junc_seq`` and against its reverse complement:

    * forward hit with score >= 35: the candidate is the assembled sequence
      after the end of the alignment;
    * reverse-complement hit with score >= 35: the candidate is the reverse
      complement of the assembled sequence before the alignment start.

    The longest candidate is returned ("" when there is none). The two
    temporary files are left on disk.

    If ``fml-asm`` returns a non-zero code, a message is written to stderr
    and the process exits with status 1.
    """
    sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))

    input_fa = tmp_file_path + ".tmp3.assemble_input.fa"
    output_fq = tmp_file_path + ".tmp3.assemble_output.fq"

    # Context managers guarantee the handles are closed even if writing or
    # launching the assembler raises.
    with open(input_fa, 'w') as hout:
        for tid in sorted(readid2seq):
            hout.write('>' + tid + '\n')
            hout.write(readid2seq[tid] + '\n')

    with open(output_fq, 'w') as hout:
        sret = subprocess.call(["fml-asm", input_fa], stdout=hout)

    if sret != 0:
        sys.stderr.write("fml-asm error, error code: " + str(sret) + "\n")
        # Exit with a failure status so callers/shells can detect the error.
        sys.exit(1)

    junc_seq_rc = my_seq.reverse_complement(junc_seq)

    temp_contig = ""
    with open(output_fq, 'r') as hin:
        for line_num, line in enumerate(hin, 1):
            # Only the second line of each 4-line FASTQ record is a sequence.
            if line_num % 4 != 2:
                continue
            tseq = line.rstrip('\n')

            aln_1 = sw.align(tseq, junc_seq)
            if aln_1.score >= 35:
                ttcontig = tseq[aln_1.r_end:]
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

            aln_2 = sw.align(tseq, junc_seq_rc)
            if aln_2.score >= 35:
                ttcontig = my_seq.reverse_complement(tseq[:aln_2.r_pos])
                if len(ttcontig) > len(temp_contig):
                    temp_contig = ttcontig

    return temp_contig
Exemplo n.º 13
0
def RRGA_score(init_center):
    """Return the summed alignment score of adjacent elements of ``init_center``.

    ``init_center`` is printed first. Each consecutive pair is locally
    aligned (+1 match, -3 mismatch), the alignment is dumped to stdout and
    its score is added to the total.
    """
    print(init_center)
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    init_cen_fitness = 0
    for idx in range(len(init_center) - 1):
        aln = aligner.align(init_center[idx], init_center[idx + 1])
        aln.dump()
        init_cen_fitness += aln.score
    return init_cen_fitness
Exemplo n.º 14
0
def calculate_missing_start(old, new):
    """
    Return where the new IMGT-numbered sequence starts within the old one.

    All "." characters are removed from ``new``, which is then locally
    aligned (+2 match, -1 mismatch) against ``old``; the reference start
    position of that alignment is the offset, i.e. the leading part of the
    sequence that was lost in the HMM.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))
    ungapped = new.replace(".", "")
    return aligner.align(old, ungapped).r_pos
Exemplo n.º 15
0
def fastq_trim(fastq,
               linker_5=None,
               linker_3=None,
               out=sys.stdout,
               pct_identity=0.8,
               min_trim=4,
               min_len=25,
               verbose=False,
               quiet=False,
               failed_out=None):
    '''
    Trim linkers from every read of a FASTQ reader and write the survivors.

    fastq - the FASTQ reader object (must provide is_colorspace and fetch())
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the percentage of matches that must be present in the alignment to strip away linkers
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - passed through to seq_trim (reads it rejects are counted as removed)
    verbose - passed through to seq_trim
    quiet - passed to fastq.fetch(); also suppresses the summary on stderr
    failed_out - an output for failed reads

    Each read is handed to seq_trim(); a falsy result means the read is
    dropped (and written to failed_out when given), otherwise the returned
    (sequence, quality) pair is written to out. Unless quiet, the number of
    trimmed and removed reads is reported on stderr.
    '''

    sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1), -1)
    removed = 0
    trimmed = 0
    is_colorspace = fastq.is_colorspace  # preload to keep reader happy.
    for read in fastq.fetch(quiet=quiet):
        retval = seq_trim(read.name, read.seq, read.qual, linker_5, linker_3,
                          is_colorspace, sw, pct_identity, min_trim, min_len,
                          verbose)
        if not retval:
            if failed_out:
                read.write(failed_out)
            removed += 1
            continue

        n_seq, n_qual = retval

        # Compare lengths: the previous check compared an int with the
        # quality value itself, so every kept read was counted as trimmed.
        if len(read.qual) != len(n_qual):
            trimmed += 1

        read.clone(seq=n_seq, qual=n_qual).write(out)

    if not quiet:
        sys.stderr.write('Trimmed: %s\n' % trimmed)
        sys.stderr.write('Removed: %s (len)\n' % removed)
Exemplo n.º 16
0
def last_fit_score(sequence):
    """Return ``(fitness_value, contg)`` for an ordered list of sequences.

    ``sequence`` and its length are printed first. Each consecutive pair is
    locally aligned (+1 match, -3 mismatch) and dumped to stdout.
    ``fitness_value`` is the sum of the scores; ``contg`` starts at 0 and is
    decremented for every score above 30 and incremented otherwise.
    """
    print(sequence, len(sequence))
    import swalign
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))

    fitness_value = 0
    contg = 0
    for idx in range(len(sequence) - 1):
        aln = aligner.align(sequence[idx], sequence[idx + 1])
        aln.dump()
        pair_score = aln.score
        contg += -1 if pair_score > 30 else 1
        fitness_value += pair_score
    return fitness_value, contg
def swalign_df(ref, query):
    '''
    Align query to ref and return the result as a labelled pandas Series.

    The Series is indexed by:
    ref, query, r_pos, r_end, q_pos, q_end, score, matches, mismatches, identity, cigar
    where cigar is the alignment's cigar passed through cigar_to_align().
    Scoring is +2 match / -1 mismatch with a gap extension penalty of -2 and
    prefer_gap_runs disabled.
    '''
    aligner = swalign.LocalAlignment(
        swalign.NucleotideScoringMatrix(2, -1),
        gap_extension_penalty=-2,
        prefer_gap_runs=False,
    )
    hit = aligner.align(ref, query)

    labels = ['ref', 'query', 'r_pos', 'r_end', 'q_pos', 'q_end', 'score',
              'matches', 'mismatches', 'identity', 'cigar']
    values = [
        ref,
        query,
        hit.r_pos,
        hit.r_end,
        hit.q_pos,
        hit.q_end,
        hit.score,
        hit.matches,
        hit.mismatches,
        hit.identity,
        cigar_to_align(hit.cigar),
    ]
    return pd.Series(values, index=labels)
def main(options):
    """Filter the input table and write FASTA records plus C->T variants.

    * When ``options.assign_score_threshold`` is set, the score threshold is
      read from the "Score threshold" row of the tab-separated
      ``options.stats`` file; otherwise it is 0.0.
    * Rows of the tab-separated ``options.input`` are skipped when column 6
      is "NA", shorter than ``options.length``, or column 2 is below the
      threshold. With ``options.filter_ambiguous`` column 7 must also equal
      "Unique".
    * For each configured adapter the read is aligned against it and the
      score is divided by the read length; the read is kept only when both
      normalised scores are below their respective thresholds.
    * Each kept read is written as ``>id:N`` followed by one record per "C"
      in the read with that single "C" replaced by "T" (``>id:<position>``).
    * Without ``filter_ambiguous`` and with ``options.verbose``, the number
      of reads dropped for adapter similarity is reported through syserr.
    """
    score_thersh = 0.0
    if options.assign_score_threshold:
        with open(options.stats) as s:
            stats = {row[0]: row[1] for row in csv.reader(s, delimiter="\t")}
            score_thersh = float(stats["Score threshold"])

    if options.five_prime_adapter or options.three_prime_adapter:
        sw = swalign.LocalAlignment(
            swalign.NucleotideScoringMatrix(match=1, mismatch=-1),
            gap_penalty=-6,
            gap_extension_penalty=-4)

    with open(options.input) as f, open(options.output, 'w') as out:
        unique_only = options.filter_ambiguous
        counter = 0
        counter_adapt = 0
        for row in csv.reader(f, delimiter='\t'):
            read = row[6]
            if read == "NA":
                continue

            keep = len(read) >= options.length and float(row[2]) >= score_thersh
            if keep and unique_only:
                keep = row[7] == "Unique"
            if not keep:
                continue
            counter += 1

            # Adapter similarity, normalised by the read length.
            normed_five = 0.0
            if options.five_prime_adapter is not None:
                normed_five = sw.align(options.five_prime_adapter,
                                       read).score / float(len(read))
            normed_three = 0.0
            if options.three_prime_adapter is not None:
                normed_three = sw.align(options.three_prime_adapter,
                                        read).score / float(len(read))

            if not (normed_three < options.three_prime_adapter_threshold
                    and normed_five < options.five_prime_adapter_threshold):
                continue
            counter_adapt += 1

            out.write(">%s:N\n%s\n" % (row[0], read))
            # One extra record per "C", with that single base turned into "T".
            for hit in re.finditer("C", read):
                out.write(">%s\n%s\n" %
                          (row[0] + ":" + str(hit.start()),
                           read[:hit.start()] + "T" + read[hit.end():]))

        if not unique_only and options.verbose:
            syserr(
                "%i seqs was removed out of %s because they were similar to the adapter\n"
                % (counter - counter_adapt, counter))
Exemplo n.º 19
0
def annot_sbinsert(infile):
    """Annotate break-point records with motif and alignment information.

    Reads the tab-separated file ``./break/<infile>`` (its first line is
    skipped) and writes two tab-separated reports, both starting with the
    same header line, into ``./break/``:

    * ``<stem>.ann.fil.txt``        -- records with reads >= 3 and
      sw_match_ratio > 0.9;
    * ``<stem>.ann.fil.strict.txt`` -- the same, additionally requiring
      22 <= soft-clip length <= 30 and a confirmed motif ('+').

    Input columns used (0-based): 0 chromosome, 1 position, 2 soft-clip
    length, 3 read direction, 4 read count, 6 sequence, 7 sb direction.

    For every record the 8 bases ending at the soft-clip length are looked
    up in ``seq_dict``; when the sequence continues so that the motif ends in
    ...CAACTG, the break position and the split point of the sequence are
    shifted by the corresponding offset. The sequence is also aligned against
    the 27-base reference string and the number of matches divided by the
    soft-clip length is reported as the match ratio.
    """
    # Scoring for the aligner: choose your own values here; 2 and -1 are common.
    match = 2
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)

    # Output names are derived from the input file's stem.
    file_path = pathlib.Path(infile)
    name = file_path.stem
    outfile_full = name+'.ann.fil.txt'
    outfile = name+'.ann.fil.strict.txt'

    # Reference: 5'- GTGTATGTAAACTTCCGACTTCAACTG ---TA
    # Each key is an 8-base window near the end of that reference; the value
    # is how far (in bases, negative) the window ends before the final G.
    seq_dict = {'CGACTTCA': -4,'GACTTCAA': -3,'ACTTCAAC': -2,'CTTCAACT': -1,'TTCAACTG': 0}

    with open(f'./break/{infile}', 'r') as hin, open(f'./break/{outfile}', 'w') as h1out, open(f'./break/{outfile_full}', 'w') as h2out:
        # Skip the first line of the input.
        next(hin)
        header = '\t'.join(['chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases'])
        h1out.write(header+'\n')
        h2out.write(header+'\n')

        for line in hin:

            F = line.rstrip('\n').split('\t')
            # example: seq ='GTGTATGTAAACTTCCGACTTCAACTGTAATTCTCTGAATGG'
            chr = F[0]
            position = F[1]
            read_direction = F[3]
            reads = F[4]
            seq = F[6]
            sb_length = int(F[2])
            sb_direction = F[7]
            # The 8 bases immediately before the recorded break point.
            break_motif = seq[sb_length-8:sb_length]
            # j becomes 1 once the full motif is confirmed in the sequence.
            j = 0
            # check sb motif in nearby break position.
            if break_motif in seq_dict.keys():
                sb_motif = '+'
                adj = seq_dict.get(break_motif)
                # Confirm that the bases after the window complete the motif
                # (...CAACTG); the next two bases are reported as genome2base.
                if adj == -4 and seq[sb_length-8:sb_length+4] == 'CGACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+4:sb_length+6]
                elif adj == -3 and seq[sb_length-8:sb_length+3] == 'GACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+3:sb_length+5]
                elif adj == -2 and seq[sb_length-8:sb_length+2] == 'ACTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+2:sb_length+4]
                elif adj == -1 and seq[sb_length-8:sb_length+1] == 'CTTCAACTG':
                    j = 1
                    genome2base = seq[sb_length+1:sb_length+3]
                elif adj == 0 and seq[sb_length-8:sb_length] == 'TTCAACTG':
                    j = 1
                    genome2base = seq[sb_length:sb_length+2]
                if j == 1:
                    # Split the sequence at the shifted break point
                    # (the unshifted form would be seq[0:sb_length] + '|' + seq[sb_length:]).
                    adj_seq = seq[0:sb_length+int(adj)*-1] + '|' + seq[sb_length+int(adj)*-1:]
                    # NOTE(review): if read_direction is neither '-' nor '+',
                    # adj_position is not assigned here and keeps whatever
                    # value a previous record left (or is undefined on the
                    # first record) -- confirm the input only has '+'/'-'.
                    if read_direction == '-':
                        adj_position = int(position) + int(adj)
                    if read_direction == '+':
                        adj_position = int(position) + int(adj)*-1
                else:
                    # Window matched but the motif is not completed: no shift.
                    adj_position = int(position)
                    sb_motif = '-'
                    adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]
                    genome2base = seq[sb_length:sb_length+2]
            else:
                # No motif window: adj_position = original position
                adj_position = int(position)
                sb_motif = '-'
                genome2base = seq[sb_length:sb_length+2]
                adj_seq = seq[0:sb_length] + '|' + seq[sb_length:]

            # Align the reference string against the whole sequence.
            alignment = sw.align(seq, 'GTGTATGTAAACTTCCGACTTCAACTG')
            sw_match = alignment.matches
            #sw_cigar = alignment.cigar
            # ratio: number of aligned matches / soft-clipping length
            sw_ratio = round(float(sw_match/sb_length),2)

            # Field order matches the header written above:
            # 'chr','position(0-start)','ori_position','family_size','sb_direction','adj_seq','soft_clip_len','sw_match','sw_match_ratio','sb-seq|','|genome2bases'
            rec = str(chr)+'\t'+str(adj_position)+'\t'+str(position)+'\t'+str(reads)+'\t'+str(sb_direction)+'\t'+str(adj_seq)+'\t'+ \
                    str(sb_length)+'\t'+str(sw_match)+'\t'+str(sw_ratio)+'\t'+ \
                    str(sb_motif)+'\t'+str(genome2base)+'\n'
                    # filtering: reads>=3, sb_length =22~30 and sw_match_ratio>0.9 PASS
            # Full report: read-count and match-ratio filters only.
            if int(reads)>=3 and float(sw_ratio)>0.9:
                h2out.write(rec)
            # Strict report: additionally requires the length window and a confirmed motif.
            if int(reads)>=3 and float(sw_ratio)>0.9 and int(sb_length)>=22 and int(sb_length)<=30 and str(sb_motif)=='+':
                h1out.write(rec)
            else: continue
def scores(sequence1,matrix1):
    """Fill ``matrix1`` with pairwise local-alignment scores and return it.

    Every unordered pair ``(i, j)`` with ``i < j`` of ``sequence1`` is aligned
    (+1 match, -3 mismatch), dumped to stdout, and its score is stored
    symmetrically at ``matrix1[i][j]`` and ``matrix1[j][i]``. Diagonal entries
    are left untouched. ``matrix1`` is modified in place.
    """
    aligner = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(1, -3))
    total = len(sequence1)
    for row in range(total):
        for col in range(row + 1, total):
            aln = aligner.align(sequence1[row], sequence1[col])
            aln.dump()
            pair_score = aln.score
            matrix1[row][col] = pair_score
            matrix1[col][row] = pair_score
    return matrix1


#def fitness_chromose(chromes_array):
#     
#     match = 2
#     mismatch = -1
##     fitness_scor=list();
#     fitness_scor=[]
#     fit_scor_chroms=[]
##     fit_scor_chroms=list();
#     fitness_scor='';
#     fit_scor_chroms='';
#     for kk in range(len(chromes_array)):
#         for k in range(len(chromes_array[kk])-1):
#             scoring = swalign.NucleotideScoringMatrix(match,mismatch)
#             sw = swalign.LocalAlignment(scoring)
#             alignment = sw.align(chromes_array[k],chromes_array[k+1]);
#             alignment.dump();
#             var = alignment.score;
##             fitness_scor.append (var)
#             import numpy as np
#             fitness_scor=np.array([var])
#         fitness_scor=sum(fitness_scor)
##             print(fitness_scor,'varrrrrrrrr')
##             fitness_scor.append(var)
#             #dddddddddddd
#         fit_scor_chroms=np.array([ fitness_scor])
##         fit_scor_chroms.append([fitness_scor])
#         fitness_scor=[];
#     print(fit_scor_chroms,'uzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz')
#     return fit_scor_chroms
#          










##*****************************************************************###
#import numpy as np
#import copy; 
#def center(center1,chromosomes):
##center=['TTAA','CCGA','CTTA','AAAT','TTCG','GGCA','AAATC'];
#    rand_order=np.random.randint(4,size=(len(center1)+1));
##print(rand_order,'random')
#    len_pop=2
##    chromosomes=list();
#    chrom=list();
#    chrom=copy.deepcopy(center1);     # center use swaping order to generate chromosomes
##print(chrom,('init chrom'))
#    for k in range(len_pop):
#        for z in range(len(center1)):
#            print(rand_order,'random order')
#            print(type(rand_order),'typerandom order')
#            print(z,'L')
#            ind=rand_order[0,z]
#            print(ind,'ind')
#            #if(ind!=len(center1)):
#            print(type(chrom),'uzma')
#            s=chrom[ind]
#            print(chrom,'chromee')
#            print(ind,'ind')
#            print(rand_order,'order')
#            chrom[ind]=chrom[ind+1]           # swap order to generate chromosomes
#            chrom[ind+1]=s
##            else:
##              s=chrom[len(center)]
##              print(chrom,'chromee')
##              print(ind,'ind')
##              chrom[ind]=chrom[1]           # swap order to generate chromosomes
##              chrom[1]=s  
#    #print(chrom,('updat chorm'))
#        chromosomes.append(chrom);
#    #print(chromosom,('append chorm'));
#   # print(chrom,(' charm'))
#        chrom=[];
#        chrom=copy.deepcopy(center1);
##    print(center,('checking chrom'));
##    print(chrom,('new center'));
#        rand_order='';
#        rand_order=np.random.randint(5, size=(1,len(center1)+1))
#    #print(rand_order)
#    return chromosomes
Exemplo n.º 21
0
# Collect the line that follows every FASTA header line (">...").
for line in f:
    if re.match('^>', line):
        b.append(next(f))
f.close()

#MANIPULATED VARIABLE, CAN CHANGE TO YOUR PREFERENCES
match = 1
mismatch = -3
gap = -1

# Build the aligner once; it does not change between pairs.
scoring = swalign.NucleotideScoringMatrix(match, mismatch)
sw = swalign.LocalAlignment(
    scoring, gap)  #CAN ADD MORE VARIABLE. REFER SWALIGN FILES

# The context manager makes sure the matrix file is flushed and closed.
with open(
        'Matrix_swalign_{0}_match={1}_mismatch={2}.txt'.format(
            filename, match, mismatch), 'w+') as fw:
    for i in range(len(b)):
        cells = []
        for j in range(len(b)):
            if i == j:
                cells.append("0,")
            else:
                a = sw.align(b[i], b[j])
                # Record the alignment score. dump() only prints the
                # alignment and returns None, which previously filled the
                # matrix with "None,".
                s = a.score
                cells.append("{0},".format(s))
        # Same layout as before: cells separated by one space, followed by
        # " \n" at the end of every row.
        fw.write(" ".join(cells) + " \n")
Exemplo n.º 22
0
__author__ = 'michael'
'''
F**K SO STUPEEED
'''

import os
import sys
from collections import defaultdict
from ast import literal_eval
import string
import swalign
import zipfile
# Shared aligner: default nucleotide scoring matrix, configured with
# globalalign=True and a gap penalty of -5.
SCORING = swalign.NucleotideScoringMatrix()
ALIGNER = swalign.LocalAlignment(SCORING, globalalign=True, gap_penalty=-5)
# Set of the 26 ASCII upper-case letters.
UPPERCASE = set(string.ascii_uppercase)
# NOTE(review): presumably the fraction of agreeing observations required to
# accept a call -- its use is not visible here, confirm against callers.
AGREEMENT_THRESHOLD = .99
# NOTE(review): presumably the weight given to a prior -- use not visible here.
PRIOR_WEIGHT = 3


def read_reference(ref_fn):
    """Read a reference file and return ``(genome_name, chrom_name, ref)``.

    The first line, stripped and without its first character, is the genome
    name; the second line, stripped and without its first four characters,
    is the chromosome name; all remaining lines are stripped and
    concatenated into the reference sequence.
    """
    with open(ref_fn, 'r') as handle:
        genome_line = handle.readline()
        chrom_line = handle.readline()
        sequence = ''.join(map(str.strip, handle))
    return genome_line.strip()[1:], chrom_line.strip()[4:], sequence


def process_line(line):
    """
    :param line:
    :return:
Exemplo n.º 23
0
def align_filter(ref, query, mode, fusion_name=''):
    """
    Aligns query to reference CDS sequence using the Smith-Waterman algorithm. Returns None if the
    alignment is clipped at the fusion boundary, if the reported percentage of matches is
    below 99, or if the alignment bounds cannot be parsed from the alignment dump.

    :param str ref: In-frame reference transcript
    :param str query: Query transcript
    :param str mode: 'donor' or 'acceptor'
    :param str fusion_name: Label used in debug log messages
    :return: Alignment features (qstart, qstop, rstart, rstop, insertions, deletions);
             the start coordinates are zero-based
    :rtype: namedtuple
    """
    alignment_stats = collections.namedtuple(
        'AlignStats', 'qstart, qstop, rstart, rstop, insertions, deletions')

    bounds_regex = re.compile(
        r'Query\s*:\s*(?P<qstart>\d*)\s*\w*\s*(?P<qstop>\d*)\s*[\|\s]*\s*Ref\s*:\s*(?P<rstart>\d*)\s*\w*\s*(?P<rstop>\d*)'
    )
    match_regex = re.compile(r'Matches: \d+\s\((?P<percent>\d*)')

    match = 5
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)
    alignment = sw.align(ref, query)

    # Count inserted and deleted bases from the CIGAR operations
    insertions = 0
    deletions = 0
    for op, num in alignment.cigar:
        if op == 'I':
            insertions += num
        elif op == 'D':
            deletions += num

    # The statistics are only available as text, so capture the dump
    buffer = StringIO()
    alignment.dump(out=buffer)
    dump = buffer.getvalue()
    buffer.close()

    # If it's not a near perfect match, then the quality of the assembly may not be good
    m = match_regex.search(dump)
    if m:
        percent = int(m.group('percent'))
        if percent < 99:
            logging.debug('%s: low percent matching %d', fusion_name, percent)
            return None

    # If the fusion transcript passes these filters, then grab the bounds of the alignment
    s = bounds_regex.search(dump)
    if not s:
        return None

    qstart = int(s.group('qstart')) - 1  # Make zero-based
    qstop = int(s.group('qstop'))

    # If the end of the fusion transcript doesn't align, then skip this transcript
    if mode == 'donor' and qstop != len(query):
        logging.debug('%s: donor alignment does not include end of sequence',
                      fusion_name)
        return None

    if mode == 'acceptor' and qstart != 0:
        logging.debug(
            '%s: acceptor alignment does not include start of sequence',
            fusion_name)
        return None

    rstart = int(s.group('rstart')) - 1  # Make zero-based
    # Read the reference end from its own group (it used to re-read 'rstart').
    rstop = int(s.group('rstop'))

    return alignment_stats(qstart, qstop, rstart, rstop, insertions,
                           deletions)
Exemplo n.º 24
0
import swalign
import subprocess
import sys

from Bio.SeqIO import convert as bio_convert

# Global parameters: one module-wide Smith-Waterman aligner.
MATCH = 2  # score for a matching position
MISMATCH = -1  # score for a mismatching position
SW_SCORE = swalign.NucleotideScoringMatrix(MATCH, MISMATCH)
SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE)


def _read_fasta(fp):
    name = None
    seq = []

    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name:
        yield (name, ''.join(seq))


def _missing_elements(nums):
    nums = list(map(int, nums))
def get_smith_waterman():
    """Return an aligner scoring +4 per match and -1 per mismatch, with globalalign=True."""
    matrix = swalign.NucleotideScoringMatrix(4, -1)
    return swalign.LocalAlignment(matrix, globalalign=True)
Exemplo n.º 26
0
import sys
import swalign

# Module-level aligner shared by every call; uses swalign's default scores.
scoring = swalign.NucleotideScoringMatrix()
sw = swalign.LocalAlignment(scoring)


def selfanneal(s):
    """Return the local-alignment score of ``s`` against its own reverse complement."""
    aln = sw.align(s, swalign.revcomp(s))
    return aln.score


if __name__ == "__main__":
    for s in sys.argv[1:]:
        # Call the function; printing `selfanneal` itself only showed the
        # function object instead of the score.
        print(selfanneal(s))
Exemplo n.º 27
0
Note 2: This isn't appropriate for color-space FASTQ files with a prefix base
        included in the read sequence, since it trims an equal number of bases
        from the sequence and quality FASTQ lines.
'''

import sys
import os
import gzip

from ngsutils.support import revcomp, FASTA
from ngsutils.fastq import FASTQ

import swalign

sw = swalign.LocalAlignment(swalign.NucleotideScoringMatrix(2, -1))


def fastx_barcode_split(reader,
                        outtempl,
                        barcodes,
                        edits=0,
                        pos=0,
                        allow_revcomp=False,
                        gzip_output=False,
                        stats_fname=None):
    '''
    Split FAST[QA] reads from {fname} using {barcodes} (hash) to write them to
    output files named like {templ}.
    '''
Exemplo n.º 28
0
#!/usr/bin/env python2

import swalign
from . import generic_functions

# Global parameters: a shared Smith-Waterman aligner built from a nucleotide
# scoring matrix created with (2, -1), i.e. match and mismatch scores.
SW_SCORE = swalign.NucleotideScoringMatrix(2, -1)
SMITH_WATERMAN = swalign.LocalAlignment(SW_SCORE)


def read_fasta(fp):
    """Generate ``(header, sequence)`` tuples from an iterable of FASTA lines.

    Each header is the right-stripped ``>`` line (including the ``>``); the
    sequence joins all right-stripped lines up to the next header. Anything
    before the first header is dropped.
    """
    current = None
    parts = []
    for line in (raw.rstrip() for raw in fp):
        if line.startswith(">"):
            # Emit the previous record before starting a new one.
            if current:
                yield (current, ''.join(parts))
            current = line
            parts = []
            continue
        parts.append(line)
    if current:
        yield (current, ''.join(parts))


def grab_anarci_numbers(anarci_file):
    start = False

    alpha_num = []
    beta_num = []

    alpha_seq = ""
    beta_seq = ""
    previous_number = ""