Пример #1
0
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
	'''Mask poorly aligned residues whose score is < x. Gaps are never masked.

	refMSA_file -- FASTA alignment to read.
	numseq      -- number of rows of the alignment to process.
	alnlen      -- number of columns to process in each row.
	scores      -- scores[row][position] is the score of that residue; only
	               read for non-gap positions.
	x           -- masking threshold (converted with float()).
	formatout   -- alignment format name handed to MultipleSeqAlignment.format().
	final_file  -- path of the output file (overwritten).
	seqType     -- 'protein' or 'dna'; selects the alphabet of the output records.

	Output records are renumbered 1..numseq (ids from the input are dropped).
	Raises ValueError if seqType is not 'protein' or 'dna'.
	'''
	# Resolve the alphabet first: previously an unknown seqType left the
	# record variable unbound and failed later with a confusing NameError.
	seq_kind = str(seqType)
	if seq_kind == 'protein':
		alphabet = generic_protein
	elif seq_kind == 'dna':
		alphabet = generic_dna
	else:
		raise ValueError("seqType must be 'protein' or 'dna', got %r" % (seqType,))

	mask_char = '?'
	threshold = float(x)  # loop-invariant, convert once
	parsed = AlignIO.read(refMSA_file, 'fasta')
	maskedMSA = MultipleSeqAlignment([])

	for row in range(numseq):
		residues = str(parsed[row].seq)
		masked = []
		for position in range(alnlen):
			residue = residues[position]
			# Gaps are copied through untouched and their score is never read.
			if residue != '-' and float(scores[row][position]) < threshold:
				masked.append(mask_char)
			else:
				masked.append(residue)
		record = SeqRecord(Seq(''.join(masked), alphabet), id=str(row + 1), description='')
		maskedMSA.append(record)

	# Context manager guarantees the handle is closed even if formatting fails.
	with open(final_file, 'w') as outhandle:
		outhandle.write(maskedMSA.format(str(formatout)))
Пример #2
0
def show_alignments(ali_path, out_format):
    """Print every query/template pair stored in a NumPy file.

    Each entry of the array loaded from ``ali_path`` is indexed as
    ``entry[0]`` (query sequence) and ``entry[1]`` (template sequence); the
    pair is wrapped in a two-row protein alignment and printed in
    ``out_format``.  For ``'pir'`` output the records carry the
    ``sequence:...`` / ``structureX:...`` description lines, otherwise the
    descriptions are empty.

    The file is loaded with ``allow_pickle=True``, so only trusted files
    should be passed in.
    """
    if out_format == 'pir':
        descriptions = ('sequence:::::::::', 'structureX:::::::::')
    else:
        descriptions = ('', '')
    labels = ('Query', 'Template')

    for pair in np.load(ali_path, allow_pickle=True):
        records = []
        for index, label in enumerate(labels):
            records.append(
                SeqRecord(Seq(pair[index], generic_protein),
                          id=label,
                          name='',
                          description=descriptions[index]))
        print(MultipleSeqAlignment(records).format(out_format))
Пример #3
0
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
	'''Mask poorly aligned residues whose score is < x. Gaps are never masked.

	refMSA_file -- FASTA alignment to read.
	numseq      -- number of rows of the alignment to process.
	alnlen      -- number of columns to process in each row.
	scores      -- scores[row][position] is the score of that residue; only
	               read for non-gap positions.
	x           -- masking threshold (converted with float()).
	formatout   -- alignment format name handed to MultipleSeqAlignment.format().
	final_file  -- path of the output file (overwritten).
	seqType     -- 'protein' or 'dna'; selects the alphabet of the output records.

	Output records are renumbered 1..numseq (ids from the input are dropped).
	Raises ValueError if seqType is not 'protein' or 'dna'.
	'''
	# Resolve the alphabet first: previously an unknown seqType left the
	# record variable unbound and failed later with a confusing NameError.
	seq_kind = str(seqType)
	if seq_kind == 'protein':
		alphabet = generic_protein
	elif seq_kind == 'dna':
		alphabet = generic_dna
	else:
		raise ValueError("seqType must be 'protein' or 'dna', got %r" % (seqType,))

	mask_char = '?'
	threshold = float(x)  # loop-invariant, convert once
	parsed = AlignIO.read(refMSA_file, 'fasta')
	maskedMSA = MultipleSeqAlignment([])

	for row in range(numseq):
		residues = str(parsed[row].seq)
		masked = []
		for position in range(alnlen):
			residue = residues[position]
			# Gaps are copied through untouched and their score is never read.
			if residue != '-' and float(scores[row][position]) < threshold:
				masked.append(mask_char)
			else:
				masked.append(residue)
		record = SeqRecord(Seq(''.join(masked), alphabet), id=str(row + 1), description='')
		maskedMSA.append(record)

	# Context manager guarantees the handle is closed even if formatting fails.
	with open(final_file, 'w') as outhandle:
		outhandle.write(maskedMSA.format(str(formatout)))
Пример #4
0
    def to_string(self, begin=None, end=None, **kwargs):
        # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str
        """Render the alignment, with its markers, as Clustal text.

        When the ``format`` keyword equals ``"pretty"`` the call is delegated
        to ``_to_string_pretty`` with the same arguments.  Otherwise every
        marker becomes a record whose id is ``#<marker name>``, followed by
        the alignment sequences.  If ``begin`` or ``end`` is given, the
        sequences are sliced to ``[begin:end]``; a missing bound defaults to
        0 or to the alignment length respectively.
        """
        output_format = get_value(kwargs, "format", None)
        if output_format == "pretty":
            return self._to_string_pretty(begin, end, **kwargs)

        # Markers go first; they receive the caller's raw begin/end values.
        records = []
        for marker in self.list_msa_markers:
            marker_seq = Seq(marker.to_string(begin, end))
            records.append(SeqRecord(marker_seq, id="#{}".format(marker.name)))

        windowed = begin is not None or end is not None
        if windowed:
            if begin is None:
                begin = 0
            if end is None:
                end = self.alignment_length()

        # Then the actual sequences, sliced only when a window was requested.
        for sequence in self.list_alignment_sequences:
            records.append(sequence[begin:end] if windowed else sequence)

        return MultipleSeqAlignment(records).format("clustal")
Пример #5
0
def _write_alignment_window(alignment, start, stop, path_prefix, outformat):
	"""Write columns [start, stop) of alignment to '<path_prefix>_site<start>to<stop>.<outformat>'."""
	window_name = 'site' + str(start) + 'to' + str(stop)
	with open(path_prefix + '_' + window_name + '.' + outformat, 'w') as output_handle:
		window = MultipleSeqAlignment(alignment[:, start:stop], alphabet=generic_dna)
		if outformat.lower() == 'nexus':
			# Round-trip through Nexus.combine so the file carries a named partition.
			partitions = [(window_name, Nexus.Nexus(window.format('nexus')))]
			Nexus.combine(partitions).write_nexus_data(output_handle)
		else:
			AlignIO.write(window, output_handle, outformat)


def alignment_slicer(input, informat, outformat, SNPs, slide):
	"""Cut an alignment into sliding windows and write each one to its own file.

	input     -- path of the alignment; also the prefix of every output file.
	informat  -- format name used to read the alignment.
	outformat -- format name (and file extension) of the output files.
	SNPs      -- window width in alignment columns.
	slide     -- number of columns the window advances per step (must be > 0).

	Windows [start, start + SNPs) are written while they fit in the
	alignment; one last window from the current start to the end of the
	alignment is always written afterwards.

	Raises ValueError if slide is not positive.
	"""
	# The window geometry now comes from the parameters; the previous code
	# silently ignored them and read the global `args` instead.
	if slide <= 0:
		# A non-positive step would never reach the end of the alignment.
		raise ValueError("slide must be a positive number of columns, got %r" % (slide,))

	alignment = AlignIO.read(input, informat, alphabet=generic_dna)
	length_alignment = alignment.get_alignment_length()

	start = 0
	end = start + SNPs
	while end <= length_alignment:
		_write_alignment_window(alignment, start, end, input, outformat)
		start += slide
		end += slide

	# Tail window. Its NEXUS partition is now named after the real stop
	# column (the alignment length) so that it matches the file name.
	_write_alignment_window(alignment, start, length_alignment, input, outformat)
	print("\ndone\n")
Пример #6
0
 def test_alnRemoveGapOnlyCols(self):
     """Columns that are gaps in every row are removed; partial gaps stay."""
     def build_alignment(rows):
         return MultipleSeqAlignment(
             [SeqRecord(Seq(text), id=label, name=label)
              for label, text in rows])

     gapped = build_alignment([('s1', 'A-TT---TTAA---'),
                               ('s2', 'AATT---TTAA---')])
     expected = build_alignment([('s1', 'A-TTTTAA'),
                                 ('s2', 'AATTTTAA')])
     # Compare the FASTA renderings: alignment objects themselves are
     # compared by hash value and would never be equal.
     observed = Milraa.alnRemoveGapOnlyCols(gapped).format('fasta')
     self.assertEqual(observed, expected.format('fasta'))
Пример #7
0
def convert_a2m(ali):
    """Return the FASTA text of ``ali`` with lowercase letters and dots removed.

    ``ali`` is alignment text parsed as FASTA.  Every record's sequence has
    the characters matching ``[a-z.]`` stripped; the records are otherwise
    reused as parsed and re-emitted as one FASTA-formatted alignment.
    """
    handle = cStringIO.StringIO(ali)
    parsed = AlignIO.read(handle, 'fasta')
    handle.close()

    stripped_records = []
    for record in parsed:
        kept_columns = re.sub(r'[a-z.]', '', str(record.seq))
        # The parsed record is updated in place rather than copied.
        record.seq = Seq(kept_columns, SingleLetterAlphabet())
        stripped_records.append(record)

    return MultipleSeqAlignment(stripped_records).format('fasta')
Пример #8
0
# `letters` is bound before this excerpt starts; it is dropped here.
del letters

print("testing reading and writing clustal format...")
# The Clustal fixtures are looked up in ./Clustalw relative to the current
# working directory.
test_dir = os.path.join(os.getcwd(), 'Clustalw')
test_names = ['opuntia.aln', 'cw02.aln']

test_files = []
for name in test_names:
    test_files.append(os.path.join(test_dir, name))

for test_file in test_files:
    # parse the alignment file and get an alignment object
    alignment = AlignIO.read(test_file, "clustal")

    # print the alignment back out
    print(alignment.format("clustal"))

# Re-read the first fixture, this time with an explicit gapped DNA alphabet;
# everything below works on this alignment.
alignment = AlignIO.read(os.path.join(test_dir, test_names[0]), "clustal",
                         alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

# test the base alignment stuff
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
# Both consensus variants use "N" as the ambiguous symbol and a 0.6 threshold.
dumb_consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
simple_consensus = align_info.simple_consensus(ambiguous="N", threshold=0.6)
# `letters` is bound before this excerpt starts; it is dropped here.
del letters

print("testing reading and writing clustal format...")
# The Clustal fixtures are looked up in ./Clustalw relative to the current
# working directory.
test_dir = os.path.join(os.getcwd(), 'Clustalw')
test_names = ['opuntia.aln', 'cw02.aln']

test_files = []
for name in test_names:
    test_files.append(os.path.join(test_dir, name))

for test_file in test_files:
    # parse the alignment file and get an alignment object
    alignment = AlignIO.read(test_file, "clustal")

    # print the alignment back out
    print(alignment.format("clustal"))

# Re-read the first fixture, this time with an explicit gapped DNA alphabet;
# everything below works on this alignment.
alignment = AlignIO.read(os.path.join(test_dir, test_names[0]),
                         "clustal",
                         alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

# test the base alignment stuff
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
# Consensus computed with the method's default arguments.
consensus = align_info.dumb_consensus()
# Taxa whose fraction of gap characters ("-") reaches this value are dropped.
taxon_sequence_threshold = 0.40

# Read through a context manager so the input handle is closed again
# (`infile` is defined before this excerpt).
with open(infile) as in_handle:
	in_alignment = AlignIO.read(in_handle, "nexus", alphabet=Gapped(IUPAC.protein))
alignment_length = in_alignment.get_alignment_length()

out_alignment = MultipleSeqAlignment([], alphabet=Gapped(IUPAC.protein))

for seq_record in in_alignment:
	missing_data = float(seq_record.seq.count("-")) / float(alignment_length)
	# Single-argument print() gives the same output under Python 2 and 3.
	print(seq_record.id + "\t" + str(missing_data))
	if missing_data < taxon_sequence_threshold:
		out_alignment.append(seq_record)

# Output prefix: the input path without ".nexus", plus the threshold.
# The pattern only matches paths that contain a "/".
outname = re.search(r"(/.+)\.nexus$", infile).group(1) + "." + str(taxon_sequence_threshold)

with open(outname + ".nexus", "w") as nexfile, open(outname + ".phylip", "w") as phyfile:
	try:
		nexfile.write(out_alignment.format("nexus"))
		phyfile.write(out_alignment.format("phylip"))
	except Exception:
		# Best effort: report the failure and carry on. Unlike the previous
		# bare `except:`, this no longer swallows KeyboardInterrupt/SystemExit.
		print("Could not write alignment " + infile + "  " + str(sys.exc_info()[0]))


Пример #11
0
# Optionally add the reference sequence as one more individual.
if args.extract_ref:
	inddict['reference'] = 'reference'
	seqdict['reference'] = [refseq]

# generate alignment
for ind in inddict.keys():
	# range(1): only the first sequence stored for each individual is exported.
	for i in range(1):
		seqrec = SeqRecord(Seq(seqdict[ind][i], generic_dna), id=ind+"_"+str(i), description=ind+"_"+str(i))
		seqrecords.append(seqrec)

align = MultipleSeqAlignment(seqrecords)

# print alignment in desired formats
for f in formats:
	# Single-argument print() gives the same output under Python 2 and 3.
	print("writing to "+f+"\n")
	# The file extension is the first three letters of the format name.
	with open(args.out_prefix+'.'+f[:3], 'w') as OUT:
		OUT.write(align.format(f))
		# Compare by value: `is` on a string literal only worked by the
		# accident of string interning.
		if f == 'nexus' and args.popmap:
			add_traits_block(p_list=pops_list, i_dict=inddict, nexhandle=OUT)

Пример #12
0
def debug_matching(gen,
                   primer_pair,
                   mf,
                   mr,
                   output_file,
                   hanging_primers=False):
    """
    Compute and display a single primer/sequence alignment (debugging aid).

    Args:
        gen: container holding exactly one sequence, looked up by its only key.
        primer_pair: container holding exactly one primer pair, looked up by
            its only key.
        mf, mr: passed through unchanged to ``compute_gen_matching``.
        output_file: passed to ``compute_gen_matching``; the Clustal text is
            also written to ``output_file + ".txt"``.
        hanging_primers: passed through to ``compute_gen_matching``.

    Returns:
        None. Problems (wrong input size, no match, unwritable file) are
        logged instead of raised.

    Side effects:
        Replaces ``pp.f.seq`` and ``pp.r.seq`` of the primer pair with ``Seq``
        objects and prints the alignment to stdout.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    # The try/except keeps the old behaviour of logging any failure here
    # (e.g. an argument without len()) and returning.
    try:
        if len(gen) != 1:
            raise ValueError("Multiple gen sequences detected")
        if len(primer_pair) != 1:
            raise ValueError("Multiple primer pairs detected")
    except Exception as e:
        logging.error(e)
        return

    # Hand a copy to the matcher when possible; fall back to the original
    # object if it cannot be deep-copied.
    try:
        g = copy.deepcopy(gen)
    except Exception:
        g = gen

    template, _discarded, _raw_stats, _cooked_stats = compute_gen_matching(
        mf, mr, primer_pair, g, output_file, hanging_primers=hanging_primers)

    if template.empty:
        logging.warning("No result")
        return

    match_result = template.loc[0]
    pp = primer_pair[next(iter(primer_pair))]
    gen = gen[next(iter(gen))]

    # F_pos is shifted down by one here; a negative result means the forward
    # primer hangs off the start, so the sequence is left-padded with gaps.
    fpos = match_result.at['F_pos'] - 1
    if fpos < 0:
        gen = '-' * (-fpos) + gen
        fpos = 0

    amplicon_len = match_result.at['ampliconLen']
    # Columns left after the reverse primer; they are filled with gaps below.
    rem_len = len(gen) - (fpos + pp.flen + amplicon_len + pp.rlen)

    pp.f.seq = Seq(''.join(pp.f.seq))
    pp.r.seq = Seq(''.join(pp.r.seq))

    pp_aligned = SeqRecord('-' * fpos + pp.f.seq + '-' * amplicon_len +
                           pp.r.seq + '-' * rem_len)
    pp_aligned.id = pp.id
    align = MultipleSeqAlignment([gen, pp_aligned])

    # Format once and reuse the text for both stdout and the file.
    clustal_text = align.format("clustal")
    print(clustal_text)

    try:
        with open(output_file + ".txt", 'w') as outfile:
            outfile.write(clustal_text)
            print("Debug saved")
    except Exception as e:
        logging.error(e)
Пример #13
0
    def _to_string_pretty(self, begin=None, end=None, **kwargs):
        # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str
        """Render the alignment as Clustal text with a summary first line.

        Markers are emitted first (ids ``#<marker name>``), then the
        sequences, sliced to ``[begin:end]`` when either bound is given.
        Underscores in the Clustal text are replaced by spaces and its first
        line is replaced by ``"<tag>: LORF=<T|F>\\tTargetLORF=<count>"``,
        where ``tag`` comes from the ``tag`` keyword.

        Side effects: the ``q3prime`` marker symbol is changed to ``*`` and
        the ids of the alignment sequences are overwritten with their
        pretty-formatted versions.
        """
        tag = get_value(kwargs, "tag", "", default_if_none=True)

        self.change_marker("q3prime", new_symbol="*")

        # add markers as sequence records
        seq_records = [
            SeqRecord(Seq(m.to_string(begin, end)), id="#{}".format(m.name))
            for m in self.list_msa_markers
        ]

        if begin is not None or end is not None:
            begin = begin if begin is not None else 0
            end = end if end is not None else self.alignment_length()

        headers_old = [a.id for a in self.list_alignment_sequences]
        headers_new = MSAType._format_headers_pretty(headers_old)

        # add actual sequences (their ids are renamed in place)
        for i, a in enumerate(self.list_alignment_sequences):
            a.id = headers_new[i]

            if begin is not None or end is not None:
                seq_records.append(a[begin:end])
            else:
                seq_records.append(a)

        # create alignment with markers
        alignment = MultipleSeqAlignment(seq_records)

        output_string = alignment.format("clustal")

        # Turn underscores into spaces throughout the rendered text.
        output_string = output_string.replace("_", " ")

        output_string_array = output_string.split("\n")

        def get_summary_statistics_line_for_alignment():
            # type: () -> str

            ref_position = self.get_mark_position("ref")

            # True when the first row has at most one distinct symbol before
            # the "ref" marker position.
            is_lorf = len(set(self[0][0:ref_position])) <= 1

            def count_lorf_targets_near_position(position):
                # type: (int) -> int
                """Count rows 1..n whose nearest uppercase column to
                ``position`` has at most one distinct symbol before it."""

                count = 0
                aln_length = self.alignment_length()

                for idx in range(1, self.number_of_sequences()):
                    j = 0
                    # Search outwards from `position`, left side first. The
                    # loop stops once both sides have left the alignment;
                    # previously a row without any uppercase character made
                    # this loop run forever.
                    while position - j >= 0 or position + j < aln_length:
                        if position - j >= 0 and self[idx][position -
                                                           j].isupper():
                            if len(set(self[idx][0:position - j])) <= 1:
                                count += 1
                            break
                        if position + j < aln_length and self[idx][
                                position + j].isupper():
                            if len(set(self[idx][0:position + j])) <= 1:
                                count += 1
                            break

                        j += 1

                return count

            num_targets_that_are_lorf = count_lorf_targets_near_position(
                ref_position)

            return "{}: LORF={}\tTargetLORF={}".format(
                tag,
                str(is_lorf)[0], num_targets_that_are_lorf)

        # Replace the Clustal header line with the summary line.
        output_string_array[0] = get_summary_statistics_line_for_alignment()

        output_string = "\n".join(output_string_array)

        return output_string