Пример #1
0
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

ARGUMENTS

    s1 = first sequence

    s2 = second sequence

NOTES

    Scoring uses BLOSUM62 with a gap open penalty of -10 and a gap
    extension penalty of -0.5. Character pairs that the matrix does not
    contain score 1 when identical and -4 otherwise. Only the first
    alignment reported by pairwise2 is used; the two rows of the result
    are named "s1" and "s2".
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SubsMat.MatrixInfo import blosum62

    def match_callback(c1, c2):
        # MatrixInfo matrices are half matrices: every residue pair is
        # stored in one orientation only, so both orders have to be tried
        # before falling back to the generic match/mismatch score.
        for pair in ((c1, c2), (c2, c1)):
            if pair in blosum62:
                return blosum62[pair]
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1,
                                    s2,
                                    match_callback,
                                    -10.,
                                    -.5,
                                    one_alignment_only=True)

    # Each pairwise2 result starts with the two gapped sequences.
    aligned_s1, aligned_s2 = alns[0][0], alns[0][1]

    a = MultipleSeqAlignment([])
    a.add_sequence("s1", aligned_s1)
    a.add_sequence("s2", aligned_s2)
    return a
Пример #2
0
def Create_SNP_alignment(alignment, SNPlocations):
    """Build an alignment that contains only the columns in SNPlocations.

    For every record of ``alignment`` the characters found at the given
    column indices are concatenated in the order given, with gap
    characters ("-") rewritten as "?". The resulting string is added to
    a new MultipleSeqAlignment under the record's id.

    Returns the new MultipleSeqAlignment.
    """
    snp_alignment = MultipleSeqAlignment(Gapped(IUPAC.unambiguous_dna))

    for rec in alignment:
        snp_columns = "".join(
            rec.seq[position].replace("-", "?") for position in SNPlocations
        )
        snp_alignment.add_sequence(rec.id, snp_columns)

    return snp_alignment
Пример #3
0
def SubsetAlnCols(args):
    """Write a column range of a FASTA alignment to a new FASTA file.

    Args:
        args: mapping with the keys
            'in'  - path of the input alignment (FASTA),
            'r'   - column range given as 'start-end' (1-based, inclusive),
            'pct' - minimum percentage of non-gap positions (within the
                    requested range) a sequence needs in order to be
                    kept, or None to keep every sequence,
            'out' - path of the FASTA file to write.
    """
    aln_in_fasta = args['in']
    col_range = args['r']
    min_non_empty_pct = args['pct']
    align_subset_out = args['out']

    # get range
    range_parts = col_range.split('-')
    col_start = int(range_parts[0])
    col_end = int(range_parts[1])
    align_subset_len = col_end - col_start + 1

    # subset (convert the 1-based inclusive range to a Python slice)
    align_in = AlignIO.read(aln_in_fasta, 'fasta')
    align_out = align_in[:, (col_start - 1):col_end]

    if min_non_empty_pct is not None:
        # filter: keep only sequences with enough non-gap positions
        min_pct = float(min_non_empty_pct)
        align_in_subset_filtered = MultipleSeqAlignment(
            [], Gapped(IUPAC.unambiguous_dna, "-"))
        for seq_record in align_out:
            non_gap_count = align_subset_len - seq_record.seq.count('-')
            # 100.0 forces true division so the percentage is not
            # truncated to an integer under Python 2.
            non_empty_col_pct = non_gap_count * 100.0 / align_subset_len
            if non_empty_col_pct >= min_pct:
                align_in_subset_filtered.add_sequence(seq_record.id,
                                                      str(seq_record.seq))
        align_out = align_in_subset_filtered

    # write out; the context manager closes the handle even if writing fails
    with open(align_subset_out, 'w') as align_file_out_handle:
        AlignIO.write(align_out, align_file_out_handle, 'fasta')
Пример #4
0
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

ARGUMENTS

    s1 = first sequence

    s2 = second sequence

NOTES

    Scoring uses BLOSUM62 with a gap open penalty of -10 and a gap
    extension penalty of -0.5. Character pairs that the matrix does not
    contain score 1 when identical and -4 otherwise. Only the first
    alignment reported by pairwise2 is used; the two rows of the result
    are named "s1" and "s2".
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SubsMat.MatrixInfo import blosum62

    def match_callback(c1, c2):
        # MatrixInfo matrices are half matrices: every residue pair is
        # stored in one orientation only, so both orders have to be tried
        # before falling back to the generic match/mismatch score.
        for pair in ((c1, c2), (c2, c1)):
            if pair in blosum62:
                return blosum62[pair]
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1, s2,
            match_callback, -10., -.5,
            one_alignment_only=True)

    # Each pairwise2 result starts with the two gapped sequences.
    aligned_s1, aligned_s2 = alns[0][0], alns[0][1]

    a = MultipleSeqAlignment([])
    a.add_sequence("s1", aligned_s1)
    a.add_sequence("s2", aligned_s2)
    return a
Пример #5
0
def align_sequences(multiple_sequences, ms_path, msa_path, cluster_number):
    """Write sequences to FASTA, run mafft on them and build an alignment object.

    Args:
        multiple_sequences: indexable collection of sequence strings; each
            one is written as a FASTA record named after its index.
        ms_path: path the (unaligned) FASTA file is written to.
        msa_path: path that receives the mafft output, or a plain copy of
            the FASTA text when there is at most one sequence.
        cluster_number: only used in the log message emitted when mafft
            is skipped.

    Returns:
        A tuple ``(aln, sequences)`` where ``aln`` is the
        MultipleSeqAlignment built from the records parsed from
        ``ms_path`` and ``sequences`` is the list of its sequence strings.
    """
    import subprocess

    aln = MultipleSeqAlignment(Gapped(IUPAC.protein, '-'))

    fasta_text = '\n'.join('>' + str(i) + '\n' + seq
                           for i, seq in enumerate(multiple_sequences))

    # write ms to file
    with open(ms_path, 'w') as f:
        f.write(fasta_text)

    if len(multiple_sequences) > 1:
        # align file
        logger.info('running mafft:')
        mafft_cmd = f'mafft --quiet {ms_path} > {msa_path}'
        logger.info(mafft_cmd)
        # Run mafft with an argument list instead of a shell string so
        # that paths containing spaces or shell metacharacters are passed
        # through verbatim. A failure is logged but, as before, does not
        # abort the function.
        try:
            with open(msa_path, 'w') as msa_file:
                exit_code = subprocess.run(['mafft', '--quiet', ms_path],
                                           stdout=msa_file).returncode
        except OSError as err:
            logger.warning('could not run mafft: %s', err)
        else:
            if exit_code != 0:
                logger.warning('mafft exited with status %s', exit_code)
        #avoid removing sparse columns!! causes a bug when trying to grep from annotations!!
    else:
        logger.info(
            'Only 1 sequence in {}. Skipping mafft and copying ms file as is.'.
            format(cluster_number))
        with open(msa_path, 'w') as f:
            f.write(fasta_text)

    # NOTE(review): the records are parsed from ms_path (the unaligned
    # input), not from msa_path (the mafft output) - confirm this is
    # intended before relying on `aln` being the aligned result.
    for record in SeqIO.parse(ms_path, 'fasta'):
        aln.add_sequence(record.id, str(record.seq))

    logger.debug('Alignment is:')
    msa_str = ''.join(str(record.seq) + '\n' for record in aln)
    logger.debug(msa_str)

    return aln, msa_str.split()
Пример #6
0
 def compute_consensus(self):
     """Compute a gap-aware consensus of self.msa and store it in self.consensus.

     Every entry of self.msa is converted to a string and added to a
     temporary alignment under its positional index; the consensus comes
     from AlignInfo.SummaryInfo.gap_consensus with threshold 0 and "-" as
     the ambiguous character.
     """
     alphabet = Gapped(IUPAC.extended_protein, "-")
     alignment = MultipleSeqAlignment(alphabet)
     index = 0
     for sequence in self.msa:
         alignment.add_sequence(str(index), str(sequence))
         index += 1
     summary = AlignInfo.SummaryInfo(alignment)
     self.consensus = summary.gap_consensus(threshold=0, ambiguous="-")
Пример #7
0
    def next(self):
        """Parse and return the next alignment from self.handle.

        The input is expected to consist of alignments that each start
        with a "#=======================================" header block
        listing the number of aligned sequences, their identifiers and
        the alignment length, followed by the sequence lines.

        Returns:
            A MultipleSeqAlignment, or None when no further alignment
            header is found before the end of the handle.

        Raises:
            ValueError: if the header lacks the sequence count or length,
                if the count disagrees with self.records_per_alignment,
                if a line cannot be interpreted, or if a parsed sequence
                does not have the declared length.
            AssertionError: if the sequence lines are inconsistent with
                the header or with their own start/end coordinates.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        # Skip anything in front of the next alignment header.
        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        # startswith() rather than line[0]: at end of file readline()
        # returns "" and indexing it would raise IndexError instead of
        # reaching the explicit ValueError checks below.
        while line.startswith("#"):
            # Read in the rest of this alignment header,
            # try and discover the number of records expected
            # and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            # And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        seqs = [""] * len(ids)
        seq_starts = []
        # Index of the sequence the next sequence line belongs to; the
        # lines cycle through the sequences block by block.
        index = 0

        # Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, seq start position, seq, seq end position
                    # (an aligned seq is broken up into multiple lines)
                    id, start = id_start
                    seq, end = seq_end
                    if start == end:
                        # Special case, either a single letter is present,
                        # or no letters at all.
                        if seq.replace("-", "") == "":
                            start = int(start)
                            end = int(end)
                        else:
                            start = int(start) - 1
                            end = int(end)
                    else:
                        assert seq.replace("-", "") != ""
                        start = int(start) - 1  # python counting
                        end = int(end)

                    # The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    if len(seq_starts) == index:
                        # Record the start
                        seq_starts.append(start)

                    # Check the start...
                    if start == end:
                        assert seq.replace("-", "") == "", line
                    else:
                        assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               start, line)

                    seqs[index] += seq

                    # Check the end ...
                    assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               seq_starts[index], end, line)

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    # just a start value, this is just alignment annotation (?)
                    pass
            elif line.strip() == "":
                # Just a spacer?
                pass
            else:
                # Report malformed input as an exception instead of
                # printing the line and tripping a bare assert (which is
                # skipped entirely when Python runs with -O).
                raise ValueError("Unrecognised EMBOSS pairwise line: %r" % line)

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================":
                # End of alignment; keep the line for the next call.
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = MultipleSeqAlignment(self.alphabet)
        for id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            alignment.add_sequence(id, seq)
        return alignment
# Build a consensus sequence for every FASTA alignment matching the glob
# pattern `files`. This fragment relies on names defined outside the
# visible code (files, include_missing_data, identity, alpha, name, outdir).
if len(glob.glob(files)) > 0:
    nPass = 0
    for fasta in glob.glob(files):  # one alignment file per iteration

        fastaname = fasta.split('/')[-1]  # get fasta name without path
        aln = AlignIO.read(fasta, "fasta")  # extract alignment from fasta
        if include_missing_data == False:
            # Copy the alignment with every "N" turned into a gap.
            gap_aln = MultipleSeqAlignment([])
            for current_seq in aln:  # for each sequence
                seq = str(current_seq.seq)
                seq = str.replace(
                    seq, "N", "-"
                )  # change Ns to gaps to make them invisible to dumb_consensus
                gap_aln.add_sequence(current_seq.id, seq)
                # NOTE(review): this rebinding sits inside the loop; the
                # loop keeps iterating the records of the original
                # alignment, and `aln` refers to gap_aln afterwards.
                aln = gap_aln
        summary_align = AlignInfo.SummaryInfo(aln)
        consensus_unamb = summary_align.dumb_consensus(
            threshold=identity,
            ambiguous="N",
            consensus_alpha=alpha.IUPACUnambiguousDNA,
            require_multiple=0)
        consensus_unamb = str(consensus_unamb)
        # Normalise lower-case "n" in the consensus to upper-case "N".
        consensus_unamb = str.replace(consensus_unamb, "n", "N")

        # Wrap the consensus in a single-record alignment.
        outseq = MultipleSeqAlignment([])
        outseq.add_sequence(name, consensus_unamb)
        # Output path: <outdir>cons_sim<identity>_<fastaname>
        outpath = '{0}{1}{2}{3}{4}'.format(outdir, "cons_sim", identity, "_",
                                           fastaname)
        # NOTE(review): the handle is opened here, but the code that
        # writes to and closes it is not part of the visible fragment.
        f = open(outpath, 'w')
Пример #9
0
def plot_bar(data,nuclstr,column='value',factor=None,ymin=None,ymax=None,stat='identity',dpi=300,features=None,feature_types=['all'],add_features=[],funcgroups=None,shading_modes=['charge_functional'],usd=False,right_overhang_fix=None,debug=False,startnumber=1,cropseq=(0,None),aspect_ratio=None,reverse_seq=False,double_seq=False,transparent=True,fill_params=None,bar_position='stack',title=None):
    """
    A wrapper function to make a plot of data with bars along the sequence.
    Input should be a dataframe with resid and segid columns and a 'value'
    column. This one is inspired by seqplot/seqplot/pdb_plot.py

    Parameters whose use is visible in this function:
    data - dataframe; the segid of its first row selects the chain to plot.
    nuclstr - object providing components, seqs and shading_features,
        each indexed by segid.
    column - name of the dataframe column mapped to the y axis.
    factor - optional column name mapped to the bar fill colour.
    ymin, ymax - y range of the sequence image; when None they default to
        the data minimum/maximum. ymax also limits the y scale when usd is
        False, ymin when usd is True.
    features - shading features; defaults to nuclstr.shading_features[segid].
    feature_types - feature styles to keep ('all' keeps everything).
    add_features - extra features appended after filtering.
    usd - if True a ruler is requested at the top of the sequence image;
        the flag is also forwarded to geom_seq_x.
    right_overhang_fix - extra x range on the right; guessed from the
        sequence length when None.
    cropseq - (start, end) slice applied to the alignment columns.
    aspect_ratio - plot aspect ratio; defaults to 0.2*100/sequence length.
    reverse_seq - reverse the plotted sequence (experimental).
    double_seq - add the reversed sequence as a second row named 'reverse'.
    fill_params - keyword arguments for scale_fill_manual.
    title - plot title; defaults to "Segid: ..., Type: ...".

    Returns the assembled ggplot object.
    """

    segid=data['segid'].values[0]
    component=nuclstr.components[segid]

    if title is None:
        title="Segid: %s, Type: %s"%(segid,component['type'])

    # The original test `entity is 'DNA' or 'histone' or 'protein'` was
    # always true (a non-empty string literal is truthy), which made the
    # generic_dna branch unreachable; test membership instead.
    if component['entity'] in ('DNA','histone','protein'):
        alphabet=generic_protein
    else:
        alphabet=generic_dna
    seq=Seq(str(nuclstr.seqs[segid]['fullseq']),alphabet)
    label=component['type']+':'+segid
    msar=MultipleSeqAlignment([SeqRecord(seq=seq,id=label,name=label)])
    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq=msar[0].seq[::-1]

    if double_seq:
        msar.add_sequence('reverse',str(msar[0].seq[::-1]))

    msar=msar[:,cropseq[0]:cropseq[1]]

    #We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start=nuclstr.seqs[segid]['resid_start']

    # Lazy %-formatting: the original passed resid_start without a
    # placeholder, which makes the logging module report a format error.
    logger.debug("Starting resid %s",resid_start)

    overhang=nuclstr.seqs[segid]['overhangL']

    # Shift residue ids onto the coordinates of the (cropped) sequence image.
    datafixed=data.copy()
    datafixed.loc[:,'resid']=datafixed.loc[:,'resid']-resid_start+overhang+1-cropseq[0]

    sl=len(msar[0].seq)

    if features is None:
        fn=nuclstr.shading_features[segid]
    else:
        fn=features
    # Keep only the requested feature styles, then add the extra features.
    fn2=[i for i in fn if (i['style'] in feature_types) or ('all' in feature_types)]
    fn2.extend(add_features)

    if usd:
        ruler='top'
    else:
        ruler=None
    shaded=ipyshade.shadedmsa4plot(msar,features=fn2,shading_modes=shading_modes,debug=debug,startnumber=startnumber,setends=[startnumber-2,sl+startnumber+2],funcgroups=funcgroups,ruler=ruler,density=200)

    #If sl%10==0 we will have a ruler number hanging beyond the sequence image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl%10==0:
            if sl<100:
                rof=0.1
            else:
                rof=0.5
        else:
            rof=0
    else:
        rof=right_overhang_fix

    if aspect_ratio is not None:
        ar=aspect_ratio
    else:
        ar=0.2*100./sl

    plot=(ggplot(data=datafixed,mapping=aes(x='resid', y=column))
        + scale_x_continuous(limits=(0.5,sl+0.5+rof),expand=(0,0.2),name='',breaks=[])
        + theme_light()+theme(aspect_ratio=ar,dpi=dpi,plot_margin=0,text=element_text(size=6), legend_key_size=5 ,legend_position='bottom',legend_direction='horizontal'))

    if factor is None:
        plot=plot+geom_bar(stat=stat,width=0.5)
    else:
        plot=plot+geom_bar(stat=stat,width=0.5,mapping=aes(fill=factor),position=bar_position)

    if fill_params is not None:
        plot=plot+scale_fill_manual(**fill_params)

    # Only one end of the y scale is constrained, chosen by usd.
    if not usd:
        if ymax is not None:
            plot=plot+scale_y_continuous(limits=(None,ymax))
    else:
        if ymin is not None:
            plot=plot+scale_y_continuous(limits=(ymin,None))

    # The sequence image needs concrete y limits; fall back to the data range.
    if ymax is None:
        ymax=data[column].max()
    if ymin is None:
        ymin=data[column].min()

    plot = plot + geom_seq_x(seqimg=shaded.img,\
                   xlim=(1,sl+rof),ylim=(ymin,ymax),usd=usd,aspect_ratio=ar,transparent=transparent)+ggtitle(title)

    return plot
Пример #10
0
    def next(self):
        """Parse and return the next alignment from self.handle.

        The first line must hold two integers: the number of sequences
        and the sequence length. In the first block every line carries a
        10-character name followed by sequence data; later blocks carry
        sequence data only. A line recognised by self._is_header starts
        the next alignment and is kept for the following call.

        Returns:
            A MultipleSeqAlignment, or None if the handle is exhausted.

        Raises:
            ValueError: if the first line is not two integers, if the
                sequence count disagrees with self.records_per_alignment,
                if the file ends in the middle of a block, or if a
                sequence does not have the declared length.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            return None
        line = line.strip()
        # str.split() without arguments already drops empty fields, and
        # unlike filter() it returns a list on Python 3 as well, so len()
        # below works on both major versions.
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # Expects STRICT truncation/padding to 10 characters
        # Does not require any white space between name and seq.
        for i in range(0, number_of_seqs):
            line = handle.readline().rstrip()
            ids.append(line[:10].strip())  # first ten characters
            seqs.append([line[10:].strip().replace(" ", "")])

        # Look for further blocks
        line = ""
        while True:
            # Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            # One continuation line per sequence, in the original order.
            for i in range(0, number_of_seqs):
                seqs[i].append(line.strip().replace(" ", ""))
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        alignment = MultipleSeqAlignment(self.alphabet)
        for i in range(0, number_of_seqs):
            seq = "".join(seqs[i])
            if len(seq) != length_of_seqs:
                raise ValueError("Sequence %i length %i, expected length %i" \
                                  % (i + 1, len(seq), length_of_seqs))
            alignment.add_sequence(ids[i], seq)

            # Force id, name and description to the identifier as parsed.
            record = alignment[-1]
            assert ids[i] == record.id or ids[i] == record.description
            record.id = ids[i]
            record.name = ids[i]
            record.description = ids[i]
        return alignment
Пример #11
0
def _is_unambiguous_polymorphic_column(column_chars, iupac_bases):
    """Tell whether one alignment column counts towards the per-window site total.

    The decision depends on the number of distinct characters in the
    column, on whether a gap ("-") is among them and on how many of them
    are IUPAC ambiguity codes.
    """
    counter = Counter(column_chars)
    n_states = len(counter)
    if n_states == 1:
        return False

    has_gap = "-" in counter
    n_iupac = sum(1 for base in counter if base in iupac_bases)

    if n_states == 2:
        # One base plus gaps, or a base plus ambiguity code(s), is ignored.
        return not has_gap and n_iupac == 0
    if n_states == 3:
        # Gap + base + one ambiguity code is the only ignored combination.
        return not (has_gap and n_iupac == 1)
    return True


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask densely polymorphic windows in the outgroup sequences with gaps.

    Every file in ``<s7>/s1_rm_polymorphism_sites/`` (where ``<s7>`` is
    seq_directory with "s1_Gene/" replaced by "s7_well_trimal/") is read as
    a FASTA alignment. The alignment is scanned in windows produced by
    ``window(range(length), window_size)``; when a window holds more than
    Max_p_sites_o qualifying polymorphic columns, those columns are
    collected. In the records whose id is in the outgroup list the
    collected columns are replaced by "-"; all other records are copied
    unchanged. The result is written under the same file name to
    ``<s7>/s2_rm_polymorphism_in_outgroups/``.

    Args:
        seq_directory: path used to derive the input/output directories.
        outgroup_path: passed to input_outgroup() to obtain the outgroup ids.
        window_size: window length handed to window().
        Max_p_sites_o: a window is flagged when it has more than this
            many qualifying columns.
    """
    ### define iupac
    iupac_bases = set("mrwsykMRWSYKvhdbVHDB")

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory_2 + file_name

        ### read each alignment sequences
        for sequence in glob(output_directory_1 + file_name):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### change alignment to an array (one row per record).
            align_array_outgroup = np.array([list(rec) for rec in alignment])

            ### calculate the whole length of the alignment
            total_length = alignment.get_alignment_length()

            total_wrong_poly_sites_outgroup = []
            for each in window(range(total_length), window_size):
                ### for each block collect the qualifying polymorphic columns.
                column_position_outgroup = [
                    column for column in each
                    if _is_unambiguous_polymorphic_column(
                        align_array_outgroup[:, column], iupac_bases)
                ]

                ### too many polymorphic sites in this window: remember them.
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    total_wrong_poly_sites_outgroup.extend(column_position_outgroup)

            unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
            print(unique_wrong_sites_ougroup)
            print("outgroup")

            # Set membership keeps the per-position test below O(1).
            masked_columns = set(unique_wrong_sites_ougroup)

            align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    new_seq = "".join(
                        "-" if i in masked_columns else str(record.seq[i])
                        for i in range(total_length))
                    align_2.add_sequence(str(record.id), new_seq)
                else:
                    align_2.add_sequence(str(record.id), str(record.seq))

            print(align_2)

            AlignIO.write(align_2, output_directory_file, "fasta")
Пример #12
0
### IUPAC ambiguity codes (both cases) that mark a state as "ambiguous".
### "n"/"N" are not in this set, so they are counted like ordinary bases.
_IUPAC_AMBIGUITY_CODES = frozenset("mrwsykvhdbMRWSYKVHDB")


def _is_suspect_column(counter):
    """Return True when one alignment column counts as a flagged polymorphic site.

    counter maps each state seen in the column (base, gap "-", IUPAC code)
    to its frequency. The rules are:

    * 1 state                          -> not flagged
    * 2 states, one of them a gap      -> not flagged
    * 2 states, no gap                 -> flagged only if neither is an IUPAC code
    * 3 states, gap + exactly 1 IUPAC  -> not flagged
    * 3 states, anything else          -> flagged
    * 4 or more states                 -> flagged
    """
    n_states = len(counter)
    if n_states == 1:
        return False

    ### The gap test looks at every state of the column. The previous loop
    ### overwrote its flag on each iteration, so only the least frequent state
    ### was actually tested and most gap-containing columns were misclassified.
    has_gap = "-" in counter
    n_iupac = sum(1 for state in counter if state in _IUPAC_AMBIGUITY_CODES)

    if n_states == 2:
        return (not has_gap) and n_iupac == 0
    if n_states == 3:
        return not (has_gap and n_iupac == 1)
    return True


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4, edge_trim = 10):
    """Strip densely polymorphic windows (and both alignment ends) from each gene.

    For every FASTA alignment found in the "s6_trimal/" sibling of
    seq_directory, the ingroup records (ids not listed as outgroups) are
    scanned with a sliding window. Whenever a window holds more than
    Max_p_sites flagged polymorphic columns, those columns are scheduled for
    removal, together with the first and last edge_trim columns. The original
    file (outgroups included) is then passed to ``trimal -selectcols`` and the
    result is written to "<s7_well_trimal>/s1_rm_polymorphism_sites/". When
    there is nothing to remove (empty ingroup alignment) the file is copied
    unchanged.

    Parameters
    ----------
    seq_directory : str
        Path containing "s1_Gene/"; that component is replaced to derive the
        s6 input and s7 output directories.
    outgroup_path : str
        Forwarded to input_outgroup(); the result is used for
        ``record.id in outgroups`` tests (assumed to be a collection of
        record ids -- the helper is defined elsewhere in this file).
    window_size : int
        Window length handed to window() (assumed to yield runs of
        consecutive column indices -- the helper is defined elsewhere).
    Max_p_sites : int or float or str
        A window is flagged when it holds strictly more flagged columns than
        this value (converted with float()).
    edge_trim : int
        Number of columns always removed at each end of the alignment
        (previously hard-coded to 10). Clamped to the alignment length so
        short alignments no longer yield negative or out-of-range indices.

    Returns
    -------
    None
        The function works through its side effects (files and stdout).
    """
    ### Local imports: the block needs them and must not rely on the file header.
    import shutil
    import subprocess

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = os.path.join(genes_result_s7, "s1_rm_polymorphism_sites")
    os.makedirs(output_directory, exist_ok=True)

    max_sites = float(Max_p_sites)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue

        output_directory_file = os.path.join(output_directory, file_name)
        fasta_name = os.path.join(genes_result_s6, file_name)

        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### build a copy of the alignment without the outgroups
            align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
            for record in alignment:
                if record.id not in outgroups:
                    align.add_sequence(str(record.id), str(record.seq))

            print(align)

            ### one row per ingroup record, one column per alignment site
            align_array = np.array([list(rec) for rec in align])
            total_length = align.get_alignment_length()

            ### Classify every column once; the windows overlap, so doing this
            ### inside the window loop would repeat the same count many times.
            suspect_columns = {
                column
                for column in range(total_length)
                if _is_suspect_column(Counter(align_array[:, column]))
            }

            ### collect the flagged columns of every window that exceeds the limit
            wrong_sites = set()
            for each in window(range(total_length), window_size):
                column_position = [column for column in each if column in suspect_columns]
                if len(column_position) > max_sites:
                    print(column_position)
                    wrong_sites.update(column_position)

            ### always drop the alignment ends, clamped to the real length
            wrong_sites.update(range(min(edge_trim, total_length)))
            wrong_sites.update(range(max(total_length - edge_trim, 0), total_length))

            ### plain Python ints, sorted: the text below must not depend on
            ### how numpy prints its scalar types
            unique_wrong_sites = sorted(int(site) for site in wrong_sites)
            print(len(unique_wrong_sites))

            if unique_wrong_sites:
                cmd_selected_col = ",".join(str(site) for site in unique_wrong_sites)
                print("{ " + cmd_selected_col + " }")

                ### Argument list instead of a shell string: paths with spaces
                ### or shell metacharacters are passed through verbatim, and
                ### the braces need no escaping. trimal expects "{", the
                ### column list and "}" as separate arguments.
                cmd = ["trimal", "-in", fasta_name, "-out", output_directory_file,
                       "-selectcols", "{", cmd_selected_col, "}"]
                print(" ".join(cmd))

                ### Keep going on failure (os.system never raised either),
                ### but report it instead of staying silent.
                try:
                    return_code = subprocess.run(cmd).returncode
                except OSError as err:
                    print("could not run trimal: " + str(err))
                else:
                    if return_code != 0:
                        print("trimal exited with status " + str(return_code))

            else:
                ### nothing to remove: copy the gene to the new folder as is
                print("cp " + fasta_name + " " + output_directory_file)
                shutil.copy(fasta_name, output_directory_file)
        for i in aln_r:  #	for each column in alignment
            site = list(aln_a[:, i])
            nSeq = len(site)  #	count seqs
            nGap_s = float(
                site.count('N') + site.count('n') +
                site.count('-'))  #	count gaps and missing data
            pcGap_s = nGap_s / nSeq * 100
            if pcGap_s > maxpcGap_s:
                delsites.append(i)
            #	if proportion of seqs in the column with missing data is above threshold, delete column

        aln_a = np.delete(aln_a, delsites, 1)

        c = 0
        for current_seq in aln:
            filt1_aln.add_sequence(current_seq.id,
                                   ''.join(map(str, list(aln_a[c, ]))))
            c += 1

        length = filt1_aln.get_alignment_length(
        )  # if length of alignment after column-wise filter is above threshold, continue
        if length >= minLen:
            filt2_aln = MultipleSeqAlignment([])
            for current_seq in filt1_aln:  #	for each sequence
                seq = str(current_seq.seq)
                nGap = float(seq.count('N') + seq.count('n') + seq.count('-'))
                pcGap = nGap / length * 100
                if pcGap < maxpcGap:  #	if proportion of missing data in seq is below threshold, print to filtered alignment
                    filt2_aln.add_sequence(current_seq.id,
                                           str(current_seq.seq))
            filt3_aln = MultipleSeqAlignment([])
            for current_seq in filt2_aln:  #	for each sequence
Пример #14
0
files = '{0}{1}'.format("*.", args.input_extension)
outfile = args.outfile
taxa = args.taxa
taxa = taxa.split(",")
numfiles = len(glob.glob(files))
numtaxa = len(taxa)

line1 = '{0} fasta files and {1} taxa found, alignments will be concatenated and written to {2}\n'.format(
    numfiles, numtaxa, outfile)
print(line1)

if numfiles > 0:
    cataln = MultipleSeqAlignment([])
    for taxon in taxa:
        cataln.add_sequence(taxon, "")  #	make alignment with all required taxa
    for fasta in glob.glob(files):
        fastaname = fasta.split('/')[-1]  #	get fasta name without path
        aln = AlignIO.read(fasta, "fasta")  #	extract alignment from fasta
        seqLen = aln.get_alignment_length()
        newaln = MultipleSeqAlignment([])
        seq = "X"
        for catrec in cataln:  # for each taxon

            catid = str(catrec.id)
            for rec in aln:
                if str(
                        rec.id
                ) == catid:  # find sequence in fasta alignment if it's there
                    seq = str(rec.seq)
            if seq == "X":
Пример #15
0
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

if __name__ == "__main__":
    # Every non-blank line of multialign.txt is taken as one pre-aligned sequence.
    with open("multialign.txt", "r") as handle:
        stripped_rows = [raw.strip() for raw in handle.readlines()]
        sequences = [row for row in stripped_rows if row]

        align = MultipleSeqAlignment([])
        # Records are labelled seq-1, seq-2, ... following the file order.
        for position, sequence in enumerate(sequences, start=1):
            align.add_sequence(f"seq-{position}", sequence)

        print(align)