示例#1
0
	def start(self, database, expanded, to_examine, working=''):
		"""Write one FASTA file per cluster that contains an examined header.

		Three text files are read:

		* ``to_examine`` - one header per line, with or without a leading '>'.
		* ``expanded`` - cluster listing. A line containing '-<header>' opens a
		  cluster; every following line is a member until a line containing
		  '--------' closes it.
		* ``database`` - FASTA file the member sequences are copied from.

		For each non-empty cluster the matching records are written to
		``<directory of expanded>/extracted_genes/<first entry>_extracted.fasta``.
		Entries whose name contains '_Reversed' are looked up without that
		suffix and written reverse-complemented.

		Parameters:
			database, expanded, to_examine: file paths. A path that contains
				'../' or does not start with '/' is joined onto
				``os.path.dirname(working)``.
			working: optional directory to ``os.chdir`` into first. When empty,
				the directory of this module is used as the base instead.
		"""
		if working != '':
			if working[-1] != '/':
				working += '/'
			if '../' in working:
				working = os.path.join(os.path.dirname(__file__), working)
			os.chdir(working)
		else:
			working = os.path.dirname(__file__)

		if '../' in database or database[0] != '/':
			database = os.path.join(os.path.dirname(working), database)
		if '../' in expanded or expanded[0] != '/':
			expanded = os.path.join(os.path.dirname(working), expanded)
		if '../' in to_examine or to_examine[0] != '/':
			to_examine = os.path.join(os.path.dirname(working), to_examine)

		# Output goes into an "extracted_genes" folder next to the cluster file.
		infile_filename = expanded.split('/')[-1]
		infiledir = expanded.split(infile_filename)[0]
		extracted_path = infiledir + 'extracted_genes/'
		if not os.path.exists(extracted_path):
			os.makedirs(extracted_path)

		with open(to_examine, 'r') as handle:
			all_examine = handle.readlines()

		# Keep the text after '>' for FASTA-style lines, any other non-blank
		# line verbatim (trailing newline included in both cases).
		examin_headers = []
		for line in all_examine:
			if '>' in line:
				examin_headers.append(line.split('>')[1])
			elif '\n' != line:
				examin_headers.append(line)

		with open(expanded, 'r') as handle:
			all_exapnded = handle.readlines()

		clustered_headers = []
		clusters = []
		record = False
		for header in examin_headers:
			# Headers with a description are matched on their first word only.
			if ' ' in header:
				entry = header.split(' ')[0]
				needle = '-' + entry.replace('\n', '')
			else:
				entry = header.replace("signalal", "signal")
				needle = '-' + header.replace('\n', '')

			for line in all_exapnded:
				if record:
					if '--------' in line:
						record = False
						clustered_headers.append(clusters)
						clusters = []
					else:
						clusters.append(line)

				if needle in line:
					clusters.append(entry)
					record = True
		# Flush a cluster that was still open at the end of the file (this
		# may append an empty list, which is skipped when writing).
		clustered_headers.append(clusters)

		with open(database, 'r') as handle:
			# Only the first space-separated word of every line is kept, so
			# header lines lose their description (and their newline).
			all_genes = [gene.split(' ')[0] for gene in handle.readlines()]

		# NOTE(review): `record` and `reverse` deliberately persist across
		# entries and clusters, exactly as before. A record that is the last
		# one in the database is only terminated when the next entry is
		# scanned; confirm whether that carry-over is intended.
		record = False
		reverse = False
		for cluster in clustered_headers:
			to_write = ''

			for entry in cluster:
				if '_Reversed' in entry.replace('\n', ''):
					reverse = True
					to_reverse = ''

				for gene in all_genes:
					if record and not reverse:
						if '>' in gene:
							# Next record reached: stop copying.
							record = False
							to_write += '\n'
						else:
							to_write += gene
					elif record and reverse:
						if '>' in gene:
							record = False
							reverse = False
							flipped = Seq.Seq(str(to_reverse)).reverse_complement()
							to_write += str(flipped).replace('\n', '') + '\n'
						else:
							to_reverse += gene.replace('\n', '')

					gene_name = gene.split(" ")[0].replace('\n', '')
					if reverse:
						wanted = ">" + entry.split('_Reversed')[0]
					else:
						wanted = ">" + entry.replace('\n', "")
					if wanted == gene_name and entry != '\n':
						to_write += '>' + entry
						record = True

			if cluster != []:
				out_name = extracted_path + cluster[0].replace('\n', '') + '_extracted.fasta'
				with open(out_name, 'w') as out:
					out.write(to_write)
示例#2
0
    def test_append_proteins(self):
        """Appending three more sequences leaves seven items in test_chars."""
        for residues in ("K", "K-", "K@"):
            self.test_chars.append(Seq.Seq(residues))

        self.assertEqual(7, len(self.test_chars))
示例#3
0
 def setUp(self):
     """Create a Seq and a MutableSeq fixture from the same byte string."""
     raw_bases = b"TCAAAAGGATGCATCATG"
     self.s = Seq.Seq(raw_bases)
     self.mutable_s = Seq.MutableSeq(raw_bases)
示例#4
0
 def test_concatenation_of_seq(self):
     """Adding a one-letter Seq to the fixture appends that letter."""
     suffix = Seq.Seq("T")
     combined = self.s + suffix
     self.assertEqual(str(self.s) + "T", combined)
     self.assertEqual(self.s + Seq.Seq("T"), "TCAAAAGGATGCATCATGT")
示例#5
0
 def test_not_equal_comparsion(self):
     """Sequences of different length must compare unequal (__ne__)."""
     shorter = Seq.Seq("TCAAA")
     longer = Seq.Seq("TCAAAA")
     self.assertNotEqual(shorter, longer)
示例#6
0
def project_to_genbank(filename, project, allblocks, construct_id=None):
    """Write the constructs of a project to a GenBank file.

    Args:
        filename: path of the GenBank file to create (overwritten).
        project: mapping whose "components" entry lists the ids of the
            constructs to export; only read when construct_id is None.
        allblocks: iterable of block dicts. Each block is read through its
            "id", "metadata" and "components" keys.
        construct_id: when given, only the block with this id is exported.

    Ids that match no block in allblocks are skipped. The sequence and most
    features come from helpers defined elsewhere (build_sequence,
    add_GC_info, convert_annotations, add_features).
    """
    if construct_id is not None:
        blocks = [construct_id]
    else:
        blocks = project["components"]

    seq_obj_lst = []

    # For each of the constructs in the project
    for block_id in blocks:
        # Use next() with a default so an unknown id is skipped instead of
        # raising IndexError before the guard below can run.
        block = next((b for b in allblocks if b["id"] == block_id), None)
        if not block:
            continue

        metadata = block["metadata"]
        has_genbank = "genbank" in metadata
        genbank_meta = metadata["genbank"] if has_genbank else {}

        # Grab the original ID that came from genbank before if available,
        # otherwise the genbank name, otherwise a fixed placeholder.
        if "id" in genbank_meta:
            genbank_id = genbank_meta["id"]
        elif "name" in genbank_meta:
            genbank_id = genbank_meta["name"]
        else:
            genbank_id = "GC_DNA"

        sequence = build_sequence(block, allblocks)
        seq_obj = SeqIO.SeqRecord(
            Seq.Seq(sequence, Seq.Alphabet.DNAAlphabet()), genbank_id)

        # Create a 'source' feature spanning the whole sequence
        sf = SeqFeature.SeqFeature()
        sf.type = "source"
        sf.location = SeqFeature.FeatureLocation(0, len(seq_obj.seq))

        add_GC_info(sf, block, allblocks)

        if has_genbank:
            # Set up all the annotations in the genbank record. These came
            # originally from genbank.
            if "annotations" in genbank_meta:
                # .items() works on both Python 2 and 3 (.iteritems() does not).
                for annot_key, annot_value in genbank_meta[
                        "annotations"].items():
                    seq_obj.annotations[annot_key] = annot_value
            # Set up all the references in the genbank record. These came
            # originally from genbank.
            if "references" in genbank_meta:
                for ref in genbank_meta["references"]:
                    genbank_ref = SeqFeature.Reference()
                    genbank_ref.authors = ref['authors']
                    genbank_ref.comment = ref['comment']
                    genbank_ref.consrtm = ref['consrtm']
                    genbank_ref.journal = ref['journal']
                    genbank_ref.medline_id = ref['medline_id']
                    genbank_ref.pubmed_id = ref['pubmed_id']
                    genbank_ref.title = ref['title']
                    if "references" not in seq_obj.annotations:
                        seq_obj.annotations["references"] = []
                    seq_obj.annotations["references"].append(genbank_ref)
            # Add the original annotations to the source feature
            if "feature_annotations" in genbank_meta:
                for annot_key, annot_value in genbank_meta[
                        "feature_annotations"].items():
                    sf.qualifiers[annot_key] = annot_value

        seq_obj.features.append(sf)

        if "description" in metadata:
            seq_obj.description = metadata["description"]
        if "name" in genbank_meta:
            seq_obj.name = genbank_meta["name"]
        elif "name" in metadata:
            # Block name with spaces removed, truncated to five characters.
            seq_obj.name = metadata["name"].replace(" ", "")[:5]
        else:
            seq_obj.name = "GC_DNA"

        convert_annotations(block, seq_obj, 0)

        # Add a block for each of the features, recursively
        start = 0
        for child_id in block['components']:
            child_block = [b for b in allblocks if b["id"] == child_id][0]
            start = add_features(child_block, allblocks, seq_obj, start)

        seq_obj_lst.append(seq_obj)

    # Close the output handle deterministically once writing is done.
    with open(filename, "w") as handle:
        SeqIO.write(seq_obj_lst, handle, "genbank")
示例#7
0
 def test_gapped_seq_no_gap_char_given(self):
     """Translating a gapped sequence with gap=None raises TranslationError."""
     gapped = Seq.Seq("ATG---AAACTG")
     with self.assertRaises(TranslationError):
         gapped.translate(gap=None)
'''

Use a Seq object for a single sequence like a string.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 19.3.1 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------
'''

from Bio import Seq

my_seq = Seq.Seq("AGCATCGTAGCATGCAC")
# print() with a single argument behaves the same on Python 2 and Python 3.
print(my_seq[0])        # first letter
print(my_seq[0:3])      # slice of the first three letters
print(my_seq.split('T'))
print(my_seq.count('A'))
# Fraction of 'A' letters; float() forces true division on Python 2 too.
print(my_seq.count('A') / float(len(my_seq)))
def readToPrimerNonTargets(read, maxPrimerNonspec, refFa, primersInfo=None):
    """Collect the non-target regions that a primer read maps to.

    Args:
        read: aligned read exposing flag, qname, seq, pos, reference_name,
            cigarstring, has_tag() and get_tag() (presumably a pysam
            AlignedSegment - confirm against the caller). The primer
            sequence is expected in qname.
        maxPrimerNonspec: when the read has more candidate regions than
            this, they are returned unfiltered.
        refFa: reference object exposing fetch(region=...) and filename.
        primersInfo: optional mapping {pair number: record}. In each record,
            items 1-2 are the primer names, item 4 the chromosome and items
            5-6 the positions of the first/second primer.

    Returns:
        (read.qname, regions). regions is
        * [] for an unmapped read (flag 4);
        * a list of 'chrom,pos,cigar,NM' strings when there are more than
          maxPrimerNonspec candidates;
        * otherwise a list of [chrom, strand, abs(pos), length] for every
          candidate whose last or second-to-last base matches the primer.

    The process is terminated via exit() on unknown flags, bad indices or
    repeated reference fetch failures.
    """
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    indelsPat = re.compile(r'(\d+)([ID])')
    matchPat = re.compile(r'(\d+)M')
    # Extract strand of the main match in genome
    if read.flag == 0:
        strand = 1
    elif read.flag == 16 or read.flag == 20:
        strand = -1
    elif read.flag == 4:
        return (read.qname, [])
    else:
        print('ERROR! Unknown value of FLAG:', read.flag)
        print(read)
        exit(1)
    # Default to "no known target" so the name is always bound, including
    # when primersInfo is given but no pair lists this read's name.
    targetRegion = []
    if primersInfo:
        primersName = read.qname
        for primerPairNum, primerPairInfo in primersInfo.items():
            if primersName in primerPairInfo[1:3]:
                primerNumInPair = primerPairInfo[1:3].index(primersName)
                # +1 for the first primer of the pair, -1 for the second
                infoStrand = int((-1)**primerNumInPair)
                try:
                    pos = int(primersInfo[primerPairNum][5 + primerNumInPair])
                except IndexError:
                    print('ERROR (2): Incorrect index:')
                    print(primersInfo)
                    print(primersName)
                    print(primerPairNum)
                    print(primerNumInPair)
                    exit(2)
                try:
                    targetRegion = [
                        primersInfo[primerPairNum][4], infoStrand * pos
                    ]
                except TypeError:
                    # Was primersInfo[primerPair], an undefined name.
                    print('ERROR!', primersInfo[primerPairNum][4],
                          primersInfo[primerPairNum][0], primerNumInPair)
                    exit(15)
                break
    if strand == -1:
        mainMapping = [
            read.reference_name, strand * (read.pos + len(read.qname))
        ]
    else:
        mainMapping = [read.reference_name, strand * (read.pos + 1)]
    if (mainMapping != targetRegion):
        ## The next string is necessary for human genomes
        ## to exclude unsorted chromosome fragments from the analysis
        ##        and '_' not in read.reference_name
        nonSpecRegions = [
            ','.join([
                read.reference_name,
                str(strand * (read.pos + 1)), read.cigarstring,
                str(read.get_tag('NM'))
            ])
        ]
    else:
        nonSpecRegions = []
    if read.has_tag('XA'):
        # XA tag of read ends with ; so the last element is empty
        nonSpecRegions.extend(read.get_tag('XA').split(';')[:-1])
    qname = read.qname
    if len(nonSpecRegions) > maxPrimerNonspec:
        return (qname, nonSpecRegions)
    primerNonSpecRegions = []
    for region in nonSpecRegions:
        # We go through all these regions and check
        # that 3'-nucleotide matches primer's 3'-end
        chrom, pos, cigar, subst = region.split(',')
        # Determine length of sequence of interest by parsing CIGAR.
        # The length depends only on the deletions
        # but not mismatches nor insertions into reference genome
        # So we count number of deletions
        indelsMatch = indelsPat.findall(cigar)
        matchMatch = matchPat.findall(cigar)
        if len(indelsMatch) == 0:
            regionLen = int(matchMatch[0])
        else:
            sumDeletions = 0
            sumMatch = 0
            for m in indelsMatch:
                if m[1] == 'D':
                    sumDeletions += int(m[0])
            for m in matchMatch:
                sumMatch += int(m)
            regionLen = sumMatch + sumDeletions
        if int(pos) < 0:
            pos2 = -(int(pos) + regionLen)
        else:
            pos2 = int(pos)
        # Skip the intended target itself and anything overlapping it
        if ([chrom, int(pos)] == targetRegion
                or (len(targetRegion) > 0 and chrom == targetRegion[0]
                    and abs(targetRegion[1] - pos2) <= len(read.seq))):
            continue
        attempts = 0
        seq = None
        regionStart = abs(int(pos))
        regionName = chrom + ':' + str(regionStart) + '-' + str(
            regionStart + regionLen - 1)
        # Retry the fetch up to 10 times before giving up
        while (seq is None):
            try:
                seq = refFa.fetch(region=regionName)
            except Exception:
                # Not a bare except: Ctrl-C / SystemExit must not be retried.
                seq = None
                attempts += 1
                if attempts >= 10:
                    print('ERROR!')
                    print(refFa.filename)
                    logger.error(refFa.filename)
                    print(chrom, pos, regionLen)
                    exit(1)
        # Determine, which sequence we should take:
        # forward or reverse-complement
        # If primer is on + strand and found region is on opposite
        # or primer is on - strand and found region is on the same
        # we take reverse-complement
        # NOTE(review): regStrand is left untouched when pos == 0.
        if int(pos) > 0:
            regStrand = 1
        elif int(pos) < 0:
            regStrand = -1
        if (strand > 0 and int(pos) < 0) or (strand < 0 and int(pos) > 0):
            try:
                seq = str(Seq.Seq(seq).reverse_complement()).upper()
            # If there is an error like 'Mixed RNA/DNA found'
            except ValueError as e:
                if 'Mixed RNA/DNA found' in str(e):
                    print('ERROR (1): Mixed RNA/DNA found:')
                    print(seq)
                    exit(1)
                else:
                    # NOTE(review): exits with status 0 although this is an
                    # error path - confirm whether that is intended.
                    print('ERROR: Unknown ValueError!')
                    exit(0)
        else:
            seq = seq.upper()
        # Check if read sequence is the same as read qname
        if read.qname == read.seq:
            primerSeq = read.seq
            regionSeq = seq
        else:
            primerSeq = read.qname
            regionSeq = revComplement(seq)
        if len(regionSeq) == 1:
            continue
        # If there is some insertions or deletion in found region
        if 'I' in cigar or 'D' in cigar:
            # We need to align its sequence with primer sequence
            align = pairwise2.align.globalxx(primerSeq, regionSeq)
            # Check that 3'-ends of primers are identical
            ## or that the nucleotides just before the 3'-ends are identical
            if (align[0][0][-1] == align[0][1][-1]
                    or align[0][0][-2] == align[0][1][-2]):
                # Then we consider this region as a non-specific
                # for this primer
                primerNonSpecRegions.append(
                    [chrom, regStrand,
                     abs(int(pos)), regionLen])
        else:
            try:
                if (regionSeq[-1] == primerSeq[-1]
                        or regionSeq[-2] == primerSeq[-2]):
                    # Then we consider this region as a non-specific for this primer
                    primerNonSpecRegions.append(
                        [chrom, regStrand,
                         abs(int(pos)), regionLen])
            except IndexError:
                print('ERROR (2): incorrect index for sequences:')
                print('regionSeq:', regionSeq)
                print('primerSeq:', primerSeq)
                print('chr:', chrom)
                print('position:', pos)
                print('regionLen:', regionLen)
                print(-1, -2)
                exit(2)
    return (read.qname, primerNonSpecRegions)
        seq = seq[::-1]
    BaseToLeftIsNoCoverage = False
    ResultingSeq = ''
    for base in seq:
        if base == NoCoverageChar:
            BaseToLeftIsNoCoverage = True
            ResultingSeq += NoCoverageChar
        elif base == GapChar:
            if BaseToLeftIsNoCoverage:
                ResultingSeq += NoCoverageChar
            else:
                ResultingSeq += GapChar
        else:
            BaseToLeftIsNoCoverage = False
            ResultingSeq += base
    if LeftToRightDone:
        ResultingSeq = ResultingSeq[::-1]
    else:
        ResultingSeq = PropagateNoCoverageChar(ResultingSeq, True)
    return ResultingSeq


# Rewrite every record of the input FASTA with PropagateNoCoverageChar
# applied to its sequence, then print all records to stdout as FASTA.
seqs = []
# Open the input in a with-block so the handle is closed after parsing.
with open(args.FastaFile) as fasta_handle:
    for seq in SeqIO.parse(fasta_handle, 'fasta'):
        SeqAsString = PropagateNoCoverageChar(str(seq.seq))
        seq.seq = Seq.Seq(SeqAsString)
        seqs.append(seq)

SeqIO.write(seqs, sys.stdout, "fasta")
示例#11
0
def calculate_free_energy(seq,
                          check=True,
                          strict=True,
                          c_seq=None,
                          shift=0,
                          nn_table=RNA_NN3,
                          tmm_table=DNA_TMM1,
                          imm_table=DNA_IMM1,
                          de_table=RNA_DE2,
                          dnac1=25,
                          dnac2=0,
                          selfcomp=False,
                          Na=20,
                          K=50,
                          Tris=0,
                          Mg=0,
                          dNTPs=0,
                          saltcorr=5):
    """Return deltaG of a duplex using nearest neighbor thermodynamics.

    deltaH and deltaS are summed from the lookup tables (dangling ends,
    terminal mismatches, initiation terms and stacked neighbor pairs) and
    combined as ``(deltaH * 1000 - T * deltaS) / 1000`` with T fixed at
    273.15 + 22. Units follow the tables (presumably kcal/mol for deltaH
    and cal/(mol*K) for deltaS - confirm).

    Args:
        seq: the sequence; converted with str() and printed to stdout.
        check: when true, seq and c_seq are passed through _check().
        strict: raise ValueError for neighbor pairs missing from all tables;
            otherwise only emit a BiopythonWarning.
        c_seq: complementary strand. Defaults to the perfect complement of
            seq; must be given to model dangling ends or mismatches.
        shift: offset between the strands; positive pads seq on the left
            with '.', negative pads c_seq.
        nn_table, tmm_table, imm_table, de_table: thermodynamic tables for
            nearest neighbors, terminal mismatches, internal mismatches and
            dangling ends. Each value is indexed [0] = deltaH, [1] = deltaS.
        dnac1, dnac2, selfcomp: strand concentrations and self-complementarity
            flag; selfcomp also adds the table's 'sym' term.
        Na, K, Tris, Mg, dNTPs, saltcorr: forwarded to salt_correction();
            only saltcorr == 5 changes the returned value (added to deltaS).

    Returns:
        deltaG as a float.

    Raises:
        ValueError: missing neighbor data with strict=True; math.log may
            also raise it when dnac1 - dnac2 / 2 is not positive.
    """
    print(seq)
    seq = str(seq)
    if not c_seq:
        # c_seq must be provided by user if dangling ends or mismatches should
        # be taken into account. Otherwise take perfect complement.
        c_seq = Seq.Seq(seq).complement()
    c_seq = str(c_seq)
    if check:
        seq = _check(seq, 'Tm_NN')
        c_seq = _check(c_seq, 'Tm_NN')
    tmpseq = seq
    tmp_cseq = c_seq
    deltaH = 0
    deltaS = 0
    dH = 0  # Names for indexes
    dS = 1  # 0 and 1
    # Dangling ends?
    if shift or len(seq) != len(c_seq):
        # Align both sequences using the shift parameter
        if shift > 0:
            tmpseq = '.' * shift + seq
        if shift < 0:
            tmp_cseq = '.' * abs(shift) + c_seq

        # Pad the shorter strand on the right so both have equal length
        if len(tmp_cseq) > len(tmpseq):
            tmpseq += (len(tmp_cseq) - len(tmpseq)) * '.'
        if len(tmp_cseq) < len(tmpseq):
            tmp_cseq += (len(tmpseq) - len(tmp_cseq)) * '.'
        # Remove 'over-dangling' ends
        while tmpseq.startswith('..') or tmp_cseq.startswith('..'):
            tmpseq = tmpseq[1:]
            tmp_cseq = tmp_cseq[1:]
        while tmpseq.endswith('..') or tmp_cseq.endswith('..'):
            tmpseq = tmpseq[:-1]
            tmp_cseq = tmp_cseq[:-1]
        # Now for the dangling ends
        if tmpseq.startswith('.') or tmp_cseq.startswith('.'):
            left_de = tmpseq[:2] + '/' + tmp_cseq[:2]
            deltaH += de_table[left_de][dH]
            deltaS += de_table[left_de][dS]
            tmpseq = tmpseq[1:]
            tmp_cseq = tmp_cseq[1:]
        if tmpseq.endswith('.') or tmp_cseq.endswith('.'):
            right_de = tmp_cseq[-2:][::-1] + '/' + tmpseq[-2:][::-1]
            deltaH += de_table[right_de][dH]
            deltaS += de_table[right_de][dS]
            tmpseq = tmpseq[:-1]
            tmp_cseq = tmp_cseq[:-1]
    # Now for terminal mismatches
    left_tmm = tmp_cseq[:2][::-1] + '/' + tmpseq[:2][::-1]
    if left_tmm in tmm_table:
        deltaH += tmm_table[left_tmm][dH]
        deltaS += tmm_table[left_tmm][dS]
        tmpseq = tmpseq[1:]
        tmp_cseq = tmp_cseq[1:]
    right_tmm = tmpseq[-2:] + '/' + tmp_cseq[-2:]
    if right_tmm in tmm_table:
        deltaH += tmm_table[right_tmm][dH]
        deltaS += tmm_table[right_tmm][dS]
        tmpseq = tmpseq[:-1]
        tmp_cseq = tmp_cseq[:-1]

    # Now everything 'unusual' at the ends is handled and removed and we can
    # look at the initiation.
    # One or several of the following initiation types may apply:

    # Type: General initiation value
    deltaH += nn_table['init'][dH]
    deltaS += nn_table['init'][dS]

    # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
    if SeqUtils.GC(seq) == 0:
        deltaH += nn_table['init_allA/T'][dH]
        deltaS += nn_table['init_allA/T'][dS]
    else:
        deltaH += nn_table['init_oneG/C'][dH]
        deltaS += nn_table['init_oneG/C'][dS]

    # Type: Penalty if 5' end is T (applied once per strand: a trailing A in
    # seq uses the same 'init_5T/A' entry)
    if seq.startswith('T'):
        deltaH += nn_table['init_5T/A'][dH]
        deltaS += nn_table['init_5T/A'][dS]
    if seq.endswith('A'):
        deltaH += nn_table['init_5T/A'][dH]
        deltaS += nn_table['init_5T/A'][dS]

    # Type: Different values for G/C or A/T terminal basepairs
    ends = seq[0] + seq[-1]
    AT = ends.count('A') + ends.count('T')
    GC = ends.count('G') + ends.count('C')
    deltaH += nn_table['init_A/T'][dH] * AT
    deltaS += nn_table['init_A/T'][dS] * AT
    deltaH += nn_table['init_G/C'][dH] * GC
    deltaS += nn_table['init_G/C'][dS] * GC

    # Finally, the 'zipping'
    for basenumber in range(len(tmpseq) - 1):
        neighbors = tmpseq[basenumber:basenumber + 2] + '/' + \
            tmp_cseq[basenumber:basenumber + 2]
        # Internal mismatch table takes precedence over the nearest neighbor
        # table; each is also tried with the key reversed.
        if neighbors in imm_table:
            deltaH += imm_table[neighbors][dH]
            deltaS += imm_table[neighbors][dS]
        elif neighbors[::-1] in imm_table:
            deltaH += imm_table[neighbors[::-1]][dH]
            deltaS += imm_table[neighbors[::-1]][dS]
        elif neighbors in nn_table:
            deltaH += nn_table[neighbors][dH]
            deltaS += nn_table[neighbors][dS]
        elif neighbors[::-1] in nn_table:
            deltaH += nn_table[neighbors[::-1]][dH]
            deltaS += nn_table[neighbors[::-1]][dS]
        else:
            # We haven't found the key...
            if strict:
                raise ValueError('no data for neighbors \'' + neighbors + '\'')
            else:
                warnings.warn(
                    'no data for neighbors \'' + neighbors +
                    '\'. Calculation will be wrong', BiopythonWarning)

    k = (dnac1 - (dnac2 / 2.0)) * 1e-9
    if selfcomp:
        k = dnac1 * 1e-9
        deltaH += nn_table['sym'][dH]
        deltaS += nn_table['sym'][dS]
    R = 1.987  # universal gas constant in Cal/degrees C*Mol
    if saltcorr:
        corr = salt_correction(Na=Na,
                               K=K,
                               Tris=Tris,
                               Mg=Mg,
                               dNTPs=dNTPs,
                               method=saltcorr,
                               seq=seq)
    if saltcorr == 5:
        deltaS += corr

    tao = 273.15 + 22  # Constant temperature tao in Kelvin
    deltaG = (deltaH * 1000 - tao * deltaS) / 1000

    # NOTE(review): Tm is still computed (and can raise for non-positive
    # concentrations) but is never returned; only deltaG is.
    Tm = (1000 * deltaH) / (deltaS + (R * (math.log(k)))) - 273.15

    if saltcorr in (1, 2, 3, 4):
        Tm += corr
    if saltcorr in (6, 7):
        Tm = (1 / (1 / (Tm + 273.15) + corr) - 273.15)

    return deltaG
示例#12
0
from Bio import SeqIO
from Bio import Seq

# 17-letter reference pattern. check_CENPB compares sequence windows against
# indices 1-4, 9 and 12-15 of it, i.e. exactly the positions that are not "N".
CENPB = Seq.Seq("NTTCGNNNNANNCGGGN")
# Same pattern with the C at index 3 replaced by T.
CENPB_3CtoT = Seq.Seq("NTTTGNNNNANNCGGGN")
# Same pattern with the G at index 14 replaced by A.
CENPB_8GtoA = Seq.Seq("NTTCGNNNNANNCGAGN")


def check_CENPB(inputf, inputfmt):
    """Yield every 17-letter window that matches the CENPB pattern.

    Each record parsed from ``inputf`` (in format ``inputfmt``) is scanned
    one position at a time. A window matches when its letters at indices
    1-4, 9 and 12-15 equal those of ``CENPB``.

    Yields:
        [record id, window start index, window sequence, record description]
    """
    window_size = 17
    for record in SeqIO.parse(inputf, inputfmt):
        last_start = len(record.seq) - window_size
        for start in range(last_start + 1):
            window = record.seq[start:start + window_size]
            matches_fixed_positions = (
                window[1:5] == CENPB[1:5]
                and window[9] == CENPB[9]
                and window[12:16] == CENPB[12:16]
            )
            if matches_fixed_positions:
                yield [record.id, start, window, record.description]

def check_CENPBrev(inputf, inputfmt):
    rec_iter = SeqIO.parse(inputf, inputfmt)
示例#13
0
def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den,
                     path_gff):
    '''Count read ends per position from a SAM file and write density files.

    Density is kept twice: per chromosome ({sequence id: [count at 0, ...]})
    and size separated ({read length: [count at 0, ...]}), which makes it
    easier to select a size range later for analysis.

    adjusted: the counted position of reads shorter than 23 nt is shifted by
    (24 - length). NOTE(review): the previous docstring said reads *larger
    than 24* are shifted, which is not what the code does - confirm.

    fname      -- name fragment used for the binary output files
    chr_sam    -- path of the tab-separated SAM alignment file
    minlength  -- shortest read length that is counted
    maxlength  -- longest read length that is counted
    path_wig   -- prefix for the wig output
    path_den   -- prefix/directory for the pickled densities and "binary/"
    path_gff   -- GFF file that defines the sequence ids and lengths

    Returns 0 when a length bound is negative, otherwise None.

    NOTE(review): the size separated arrays are sized from the last GFF
    sequence and are not split by chromosome, and only the last GFF
    sequence is normalised to reads per million - confirm both are intended.
    '''
    GFFgen = GFF.parse(path_gff)

    if minlength < 0 or maxlength < 0:
        print("Error. Length input not valid.")
        return (0)

    # dictionaries to hold read counts
    density_plus = {}
    density_minus = {}
    density_plus_sizesep = {}
    density_minus_sizesep = {}

    # Makes 2 sets of indices, one for all reads, and another for size separated:
    for sequence in GFFgen:
        density_plus[sequence.id] = [0] * (len(sequence) + 20)
        density_minus[sequence.id] = [0] * (len(sequence) + 20)

    for length in range(minlength, maxlength + 1):
        density_plus_sizesep[length] = [0] * (len(sequence) + 20)
        density_minus_sizesep[length] = [0] * (len(sequence) + 20)

    total_reads = 0
    mapped_reads = 0

    # open chr aligned sam file; the with-block closes it after the loop
    with open(chr_sam) as f_samfile:
        samfile = csv.reader(f_samfile, delimiter='\t')

        # Loop through the samfile.
        for read in samfile:
            if read[0][0] == '@':  # Ignore header lines.
                continue

            if read[1] == '4':  # A bowtie mismatch.
                continue

            chrom = read[2]  # chromosome identified for read in bowtie
            readid = read[0]  # read id
            # start position. Need to subtract 1 since genomic sequence
            # starts at 1
            startp = int(read[3]) - 1
            seq = Seq.Seq(read[9])  # sequence of the read
            length = len(seq)  # length of read

            if length < 23:
                length_shift = 24 - length
            else:
                length_shift = 0

            # NOTE(review): only reported; counting such a read below still
            # raises KeyError.
            if chrom not in density_plus:
                print("Error: Bowtie index and GFF do not match")

            total_reads += 1

            # Note that Bowtie reverse complements any sequence aligning to
            # the reverse strand, and so read[3] is the 3'-end of minus
            # strand reads

            # Filter to get rid of reads of particular length.
            if (length < minlength or length > maxlength):
                continue

            mapped_reads += 1

            # 16 is the minus strand, 0 is the plus strand
            if (read[1] == '16'):
                start = startp - length_shift
                density_minus[chrom][start] += 1
                density_minus_sizesep[length][start] += 1

            if (read[1] == '0'):
                start = startp + length - 1 + length_shift
                density_plus[chrom][start] += 1
                density_plus_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    # Convert raw counts to reads per million counted reads
    density_plus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_plus[sequence.id]
    ]
    density_minus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_minus[sequence.id]
    ]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")

    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
                    #print(annotation[annotation.sseqid == ID ].iloc[0])
                    sub = annotation[annotation.sseqid == ID]
                    rows.append(sub.index[0])
                annotation = annotation.loc[rows]
                rows = []
                for ID in annotation.qstart.unique():
                    #print(annotation[annotation.sseqid == ID ].iloc[0])
                    sub = annotation[annotation.qstart == ID]
                    rows.append(sub.index[0])
                annotation = annotation.loc[rows]
                genes = {}
                prots = {}
                for i, r in annotation.iterrows():
                    genes[i] = qseq[r.qstart - 1:r.qend - 1].decode()
                    #print(genes[i])
                    prots[i] = str(Seq.Seq(genes[i]).translate())
                annotation = annotation.sort_values(['qstart'])
                annotation['prots'] = annotation.index.map(prots)
                annotation['genes'] = annotation.index.map(genes)

                aln_regions = np.array(
                    list(zip(list(annotation.qstart), list(annotation.qend))))
                aln_regions = aln_regions[1:, :]
                aln_len = np.array(list(annotation.qend - annotation.qstart))

                annotation = pd.DataFrame.sort_values(annotation, by='qstart')
                print(annotation)
                annotation.to_csv(alnfile + 'annotation.csv')

    else:
        #just seperate sequence into dummy codons
        #
        pssm = m.counts.normalize(pseudocounts=0.1).log_odds()
        cons_score = pssm.calculate(cons)
        cons_list = list(cons)
        cons_str = str(cons)
        deg_cons_str = str(deg_cons)

        #
        # for each position, generate a new test sequence for each possible nucleotide
        # at that position. Then score that test sequence relative to the original pssm.
        # Next, evaluate the absolute value of the score difference between every pair of
        # test sequence and classify each pair as either a transition or transversion.
        #
        for i, c in enumerate(cons_list):
            new_cons_str_A = Seq.Seq(
                "".join((cons_str[0:i], "A", cons_str[i + 1:])),
                IUPAC.unambiguous_dna)
            new_cons_str_C = Seq.Seq(
                "".join((cons_str[0:i], "C", cons_str[i + 1:])),
                IUPAC.unambiguous_dna)
            new_cons_str_G = Seq.Seq(
                "".join((cons_str[0:i], "G", cons_str[i + 1:])),
                IUPAC.unambiguous_dna)
            new_cons_str_T = Seq.Seq(
                "".join((cons_str[0:i], "T", cons_str[i + 1:])),
                IUPAC.unambiguous_dna)
            new_score_A = pssm.calculate(new_cons_str_A)
            new_score_C = pssm.calculate(new_cons_str_C)
            new_score_G = pssm.calculate(new_cons_str_G)
            new_score_T = pssm.calculate(new_cons_str_T)
            central_distance = 2 * (0.5 - float(i) / len(counts[1, :]))
示例#16
0
def mk_detect(tree_filename, ali_basename, OutDirName):
    """Run the detection pipeline for one tree/alignment pair and write its outputs.

    A ``events_placing.gene_tree`` is built from *tree_filename*.  When the
    tree has at least one transition node (``manual_mode_nodes["T"]`` is not
    empty) the function:

    * runs ``bpp_lib.make_estim`` for every ``(e1, e2)`` category pair, once
      with ``OneChange=False`` and once with ``OneChange=True``;
    * collects the per-site PCOC / PC / OC values returned by
      ``estim_data.dico_typechg_het_det`` (the "post proba" step);
    * counts gaps per site, overall and in the leaves flagged ``C``;
    * keeps the sites that pass the gap filter and the per-model thresholds,
      optionally re-ordering them by score band (``args.reorder``);
    * writes one filtered alignment per model, the complete and filtered TSV
      tables, and the optional pdf/svg plots.

    Working folders are removed afterwards unless ``args.no_cleanup`` is set.

    The function depends on module-level state that is not passed in
    (``args``, ``ali``, ``n_sites``, ``nb_seq``, ``NbCat_Est``, ``date``,
    ``prefix_out``, ``positions_to_highlight``, ``dict_p_filter_threshold``
    and the ``rep*`` folder names, among others).

    Args:
        tree_filename: Path of the tree handed to ``events_placing.gene_tree``.
        ali_basename: File name of the alignment, looked up in ``g_tree.repseq``.
        OutDirName: Not used by this function; kept for interface stability.

    Returns:
        None.  ``metadata_simu_dico`` is filled in but not returned.

    Exits the process with status 1 when the alignment file does not exist.
    """
    start_detec = time.time()
    metadata_simu_dico = {}
    logger.debug("Tree: %s", os.path.basename(tree_filename))
    metadata_simu_dico["tree"] = os.path.basename(tree_filename)

    g_tree = events_placing.gene_tree(tree_filename, manual_mode_nodes)
    g_tree.init_inter_dir_det(repest0, reptree0, repfasta0, repbppconfig,
                              repseq)
    g_tree.auto_trim_tree = auto_trim_tree
    g_tree.init_tree_det(n_sites)

    metadata_simu_dico["numberOfLeaves"] = g_tree.numberOfLeafs

    if g_tree.manual_mode_nodes["T"] == []:
        logger.warning("No transition in the tree. End.")

    else:

        logger.debug("repfasta: %s", g_tree.repfasta)
        logger.debug("repest: %s", g_tree.repest)
        logger.debug("reptree: %s", g_tree.reptree)

        ### build the study trees:
        (allbranchlength,
         convbranchlength) = g_tree.mk_tree_for_simu(plot=args.plot)

        metadata_simu_dico["allbranchlength"] = allbranchlength
        metadata_simu_dico["convbranchlength"] = convbranchlength

        ali_path = g_tree.repseq + "/" + ali_basename
        if not os.path.isfile(ali_path):
            logger.error("%s does not exist", ali_path)
            sys.exit(1)

        # Every ordered pair of estimation categories, e1 varying slowest.
        set_e1e2 = [(e1, e2)
                    for e1 in range(1, (NbCat_Est + 1))
                    for e2 in range(1, (NbCat_Est + 1))]
        for (e1, e2) in set_e1e2:
            logger.debug("Estime e1: %s e2: %s", e1, e2)
            # Same estimation twice: without, then with, the OneChange option.
            for (suffix, one_change) in [("_noOneChange", False),
                                         ("_withOneChange", True)]:
                bpp_lib.make_estim(ali_basename,
                                   e1,
                                   e2,
                                   g_tree,
                                   NBCATest=NbCat_Est,
                                   suffix=suffix,
                                   OneChange=one_change,
                                   ext="",
                                   max_gap_allowed=args.max_gap_allowed,
                                   gamma=args.gamma,
                                   inv_gamma=args.inv_gamma)

        ### post proba
        _res, bilan = estim_data.dico_typechg_het_det(ali_basename,
                                                      g_tree,
                                                      set_e1e2=set_e1e2,
                                                      NbCat_Est=NbCat_Est,
                                                      ID=date)

        for p in [
                "p_max_OX_OXY", "p_max_XY_OXY", "p_mean_OX_OXY",
                "p_mean_XY_OXY"
        ]:
            # ``dict.has_key`` was removed in Python 3; ``in`` works on both.
            if p in bilan[12]:
                del bilan[12][p]

        dict_values_pcoc = {}
        dict_values_pcoc["PCOC"] = bilan[12]["p_mean_X_OXY"]
        dict_values_pcoc["PC"] = bilan[12]["p_mean_X_XY"]
        dict_values_pcoc["OC"] = bilan[12]["p_mean_X_OX"]

        ### Get indel prop
        # Gap counts per site: over all sequences, and over the sequences
        # whose tree node carries a true ``C`` flag.
        prop_indel = [0] * n_sites
        prop_indel_conv = [0] * n_sites
        for seq in ali:
            sp_conv = g_tree.annotated_tree.search_nodes(
                name=seq.name)[0].C == True
            for i in range(n_sites):
                if seq.seq[i] == "-":
                    prop_indel[i] += 1
                    if sp_conv:
                        prop_indel_conv[i] += 1

        # filter position:

        # Positions are 1-based; a real list so it can be stored in a
        # DataFrame column on Python 2 and Python 3 alike.
        all_pos = list(range(1, n_sites + 1))

        # filter on indel prop:
        t_indel = args.max_gap_allowed_in_conv_leaves * float(
            g_tree.numberOfConvLeafs)
        all_pos_without_indel_sites = [
            p for p in all_pos if prop_indel_conv[p - 1] < t_indel
        ]

        dict_pos_filtered = {}
        for model in ["PCOC", "PC", "OC"]:
            dict_pos_filtered[model] = [
                p for p in all_pos_without_indel_sites
                if dict_values_pcoc[model][p -
                                           1] >= dict_p_filter_threshold[model]
            ]
            if positions_to_highlight:
                # Highlighted positions are always kept, whatever their score.
                dict_pos_filtered[model].extend(positions_to_highlight)
                dict_pos_filtered[model] = list(set(dict_pos_filtered[model]))
                dict_pos_filtered[model].sort()

        # Union of the positions retained by any of the three models.
        all_filtered_position = list(
            set(events_placing.unlist(list(dict_pos_filtered.values()))))
        all_filtered_position.sort()
        dict_pos_filtered["union"] = all_filtered_position

        if args.reorder:
            # Give every retained position a rank: first the positions with a
            # score >= 0.99, then the [0.9, 0.99) band, then [0.8, 0.9), then
            # the rest.  Within a band the original order is kept.
            for model in ["PCOC", "PC", "OC", "union"]:
                m_list = [model]
                if model == "union":
                    m_list = ["PCOC", "PC", "OC"]
                nb_filtered_pos = len(dict_pos_filtered[model])
                new_order = [0] * nb_filtered_pos
                j = 0
                # 0.99
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any(
                        [dict_values_pcoc[m][p - 1] >= 0.99 for m in m_list]):
                        new_order[i] = j
                        j += 1
                # 0.9
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any([0.99 > dict_values_pcoc[m][p - 1] >= 0.9 for m in m_list]) and \
                       all([0.99 > dict_values_pcoc[m][p - 1] for m in m_list]):
                        new_order[i] = j
                        j += 1
                # 0.8
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any([0.9 > dict_values_pcoc[m][p - 1] >= 0.8 for m in m_list]) and \
                       all([0.9 > dict_values_pcoc[m][p - 1] for m in m_list]):
                        new_order[i] = j
                        j += 1
                # other
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if all([dict_values_pcoc[m][p - 1] < 0.8 for m in m_list]):
                        new_order[i] = j
                        j += 1
                dict_pos_filtered[model] = reorder_l(dict_pos_filtered[model],
                                                     new_order)

        # filtered ali:
        ## Per model
        for model in ["PCOC", "PC", "OC", "union"]:
            filtered_ali = []
            for seq in ali:
                new_seq = SeqRecord.SeqRecord(
                    Seq.Seq("".join(
                        filter_l(list(seq.seq), dict_pos_filtered[model]))),
                    seq.id, "", "")
                filtered_ali.append(new_seq)
            SeqIO.write(filtered_ali,
                        g_tree.repfasta + "/filtered_ali." + model + ".faa",
                        "fasta")
            logger.info("%s model: # filtered position: %s/%s",
                        model.upper(), len(dict_pos_filtered[model]),
                        n_sites)

        ## Output

        ### Table
        #### complete:
        df_bilan = pd.DataFrame.from_dict(dict_values_pcoc,
                                          orient='columns',
                                          dtype=None)
        df_bilan["Sites"] = all_pos
        df_bilan["Indel_prop"] = prop_indel
        df_bilan["Indel_prop"] = df_bilan["Indel_prop"] / nb_seq
        df_bilan["Indel_prop(ConvLeaves)"] = prop_indel_conv
        df_bilan["Indel_prop(ConvLeaves)"] = df_bilan[
            "Indel_prop(ConvLeaves)"] / float(g_tree.numberOfConvLeafs)
        df_bilan = df_bilan[[
            "Sites", "Indel_prop", "Indel_prop(ConvLeaves)", "PCOC", "PC", "OC"
        ]]
        #### filtered:
        df_bilan_f = df_bilan[df_bilan.Sites.isin(all_filtered_position)]
        df_bilan_f = df_bilan_f.copy()

        df_bilan.to_csv(prefix_out + ".results.tsv", index=False, sep='\t')
        if not df_bilan_f.empty:
            df_bilan_f.to_csv(prefix_out + ".filtered_results.tsv",
                              index=False,
                              sep='\t')

        ### Plot
        if args.plot:
            # A pdf is always produced; the svg twin only when requested.
            plot_exts = [".pdf", ".svg"] if args.svg else [".pdf"]

            if args.plot_complete_ali:
                for ext in plot_exts:
                    plot_data.make_tree_ali_detect_combi(
                        g_tree,
                        g_tree.repseq + "/" + ali_basename,
                        prefix_out + "_plot_complete" + ext,
                        dict_benchmark=dict_values_pcoc,
                        hp=positions_to_highlight,
                        title=args.plot_title)

            for model in ["PCOC", "PC", "OC"]:
                if dict_pos_filtered[
                        model] and dict_p_filter_threshold[model] <= 1:
                    dict_values_pcoc_filtered_model = {}
                    for (key, val) in dict_values_pcoc.items():
                        dict_values_pcoc_filtered_model[key] = filter_l(
                            val, dict_pos_filtered[model])
                    for ext in plot_exts:
                        plot_data.make_tree_ali_detect_combi(
                            g_tree,
                            g_tree.repfasta + "/filtered_ali." + model +
                            ".faa",
                            prefix_out + "_plot_filtered_" + model + ext,
                            hist_up=model,
                            dict_benchmark=dict_values_pcoc_filtered_model,
                            x_values=dict_pos_filtered[model],
                            hp=positions_to_highlight,
                            reorder=args.reorder,
                            det_tool=True,
                            title=args.plot_title)

            # all model
            if dict_pos_filtered["union"]:
                model = "union"
                dict_values_pcoc_filtered_model = {}
                for (key, val) in dict_values_pcoc.items():
                    dict_values_pcoc_filtered_model[key] = filter_l(
                        val, dict_pos_filtered[model])
                for ext in plot_exts:
                    plot_data.make_tree_ali_detect_combi(
                        g_tree,
                        g_tree.repfasta + "/filtered_ali." + model + ".faa",
                        prefix_out + "_plot_filtered_" + model + ext,
                        dict_benchmark=dict_values_pcoc_filtered_model,
                        x_values=dict_pos_filtered[model],
                        hp=positions_to_highlight,
                        reorder=False,
                        det_tool=True,
                        title=args.plot_title)

    # Cleanup runs whether or not the tree had a transition.
    if not args.no_cleanup:
        remove_folder(g_tree.repest)
        remove_folder(g_tree.repbppconfig)
        remove_folder(g_tree.reptree)
        if not args.no_cleanup_fasta:
            remove_folder(g_tree.repfasta)

    metadata_simu_dico["time"] = str(time.time() - start_detec)
示例#17
0
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    """Tally reads per target-UMI cluster and write the leading amplicon call.

    The two FASTA files are read in lock-step, one record of each per read.
    For every read whose target-UMI sequence is in the cluster dictionary, the
    read is counted under its amplicon call and its amplicon sequence is added
    to a per-cluster ``baseTally``.

    Two outputs are written under ``sysOps.globaldatapath``:

    * *outfilename*, a CSV with the columns
      (target umi cluster-index),(leading amplicon-call),
      (reads for leading amplicon-call),(total reads counted);
    * the same name with a ``.fasta`` extension, holding the consensus
      sequence of every accepted cluster.

    When *amp_match_file* cannot be opened, amplicon calls are instead obtained
    by un-gapped alignment of each consensus sequence against the references
    in ``amplicon_refs.txt``, and every ``consensus*`` file in the data
    directory is rewritten to keep only the accepted clusters (the original is
    renamed with an ``unfiltered_`` prefix).

    Args:
        trg_umi_cluster_file: Cluster file loaded through
            ``fileOps.load_cluster_file_to_dictionary``.
        trg_umi_fasta: FASTA file of target-UMI sequences, one per read.
        amp_match_file: Text file with one integer amplicon call per read.
        amp_seq_fasta: FASTA file of amplicon sequences, one per read.
        outfilename: Name of the CSV output (must contain a ``.``).

    Returns:
        None.
    """
    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    # dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    sysOps.throw_status('Loading ' + sysOps.globaldatapath + amp_match_file)
    try:
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except IOError:
        # Only a file that cannot be opened triggers the realignment fallback;
        # any other error is a real failure and must not be hidden.
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will occur from sequence-consenses directly.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()  # {cluster-index: {amplicon-call: read count}}
    trg_amp_seq_dict = dict()  # {cluster-index: baseTally of amplicon reads}

    # Lazy pairing on Python 2 (izip); falls back to the built-in elsewhere.
    zip_records = getattr(itertools, 'izip', zip)
    try:
        for trg_umi_record, amp_seq_record in zip_records(
                SeqIO.parse(trg_umi_handle, "fasta"),
                SeqIO.parse(amp_seq_handle, "fasta")):

            if not realign_amplicons:
                amp_match = int(amp_match_handle.readline().strip('\n'))
            else:
                amp_match = -1

            trg_umi_seq = str(trg_umi_record.seq)
            if trg_umi_seq not in trg_umi_cluster_dict:
                continue

            trg_umi_index = str(
                trg_umi_cluster_dict[trg_umi_seq][0])  # uxi cluster-index
            if trg_umi_index not in trg_umi_dict:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()

            # add 1, because every read is being entered
            call_counts = trg_umi_dict[trg_umi_index]
            call_counts[amp_match] = call_counts.get(amp_match, 0) + 1

            trg_amp_seq_dict[trg_umi_index].add_record(
                str(amp_seq_record.seq), 1)
    finally:
        trg_umi_handle.close()
        amp_seq_handle.close()
        if amp_match_handle is not None:
            amp_match_handle.close()

    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
        trg_umi_index_dict = dict()

    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    fasta_outfilename = outfilename[:outfilename.rfind('.')] + '.fasta'
    with open(sysOps.globaldatapath + outfilename, 'w') as csvfile, \
            open(sysOps.globaldatapath + fasta_outfilename, 'w') as fastafile:
        for trg_umi_index in trg_umi_dict:
            max_tally = 0
            tot_tally = 0

            for amp_match in trg_umi_dict[trg_umi_index]:

                my_tally = trg_umi_dict[trg_umi_index][amp_match]

                if my_tally >= max_tally:
                    max_tally = int(my_tally)
                    max_match = int(amp_match)

                tot_tally += int(my_tally)

            consensus_seq = str(
                trg_amp_seq_dict[trg_umi_index].get_str_consensus())

            if realign_amplicons:
                # perform direct, un-gapped alignment of consensus_seq to reference options to obtain max_match
                max_match = -1
                max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
                min_mismatch_count = -1
                for i in range(len(ref_sequences)):
                    all_subamplicons_pass = True
                    start_index = 0
                    tot_mismatches = 0
                    # loop through sub-amplicon-sequences
                    for ref_subamplicon in ref_sequences[i]:
                        ref_subamplicon_len = len(ref_subamplicon)
                        my_mismatches, minlen = alignOps.count_mismatches(
                            ref_subamplicon,
                            consensus_seq[start_index:(start_index +
                                                       ref_subamplicon_len)])
                        if minlen == 0:
                            all_subamplicons_pass = False
                            break
                        all_subamplicons_pass = all_subamplicons_pass and (
                            my_mismatches / float(minlen) <=
                            max_mismatch_amplicon)
                        start_index += ref_subamplicon_len
                        tot_mismatches += my_mismatches
                    # Keep the passing reference with the FEWEST mismatches.
                    # The comparison was previously inverted and retained the
                    # worst passing reference instead of the best.
                    if all_subamplicons_pass and (
                            max_match < 0
                            or tot_mismatches < min_mismatch_count):
                        max_match = int(i)
                        min_mismatch_count = int(tot_mismatches)

            if max_match >= 0:
                csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                              str(max_tally) + "," + str(tot_tally) + "\n")
                fastafile.write(">" + trg_umi_index + '\n')
                fastafile.write(consensus_seq + '\n')
                if realign_amplicons:
                    trg_umi_index_dict[trg_umi_index] = True
                accepted_consensus_sequences += 1
            else:
                inadmis_consensus_sequences += 1

    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences +
                            inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            unfiltered_path = (sysOps.globaldatapath + 'unfiltered_' +
                               consensus_filename)
            filtered_path = sysOps.globaldatapath + consensus_filename
            # The original is kept under the 'unfiltered_' name and the
            # filtered copy takes over the original name.
            os.rename(filtered_path, unfiltered_path)
            with open(filtered_path, 'w') as new_consensus_file:
                with open(unfiltered_path, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        consensus_list = old_consensus_file_line.strip(
                            '\n').split(',')
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
示例#18
0
            insertions = [[pos, '', 1-total_freq]]
            for insertion, c in list(ins[ref][pos].items()):
                ins_freq = 1.0*c.sum()/cov[pos]
                insertions.append([pos, insertion, ins_freq])
            insertions.sort(key=lambda x:x[2])
            if insertions[-2][2]>args.min_freq:
                insertions_to_include.append(insertions[-2])

        seq = "".join(consensus_seq)
        if insertions_to_include:
            complete_seq = ""
            pos = 0
            for ins_pos, ins, freq in sorted(insertions_to_include, key=lambda x:x[0]):
                complete_seq += seq[pos:ins_pos] + ins
                pos=ins_pos
                print(sample + ": inserted %s at position %d with frequency %f."%(ins, ins_pos, freq))
            complete_seq += seq[pos:]
            seq=complete_seq
            any_minors = True

        if len(ac)==1:
            seq_name = sample+'_minor'
        else:
            seq_name = sample + '_minor_' + ref
        seqs.append(SeqRecord.SeqRecord(id=seq_name, name=seq_name, description="", seq=Seq.Seq(seq)))

    if any_minors:
        SeqIO.write(seqs, args.out_dir+'/minor.fasta', 'fasta')
    else:
        os.system("touch "+args.out_dir+'/minor.fasta')
示例#19
0
import copy
import unittest
import warnings

from Bio import BiopythonWarning, BiopythonDeprecationWarning
from Bio import Seq
from Bio.Data.IUPACData import (
    ambiguous_dna_complement,
    ambiguous_rna_complement,
    ambiguous_dna_values,
    ambiguous_rna_values,
)
from Bio.Data.CodonTable import TranslationError, standard_dna_table

test_seqs = [
    Seq.Seq("TCAAAAGGATGCATCATG"),
    Seq.Seq("T"),
    Seq.Seq("ATGAAACTG"),
    Seq.Seq("ATGAARCTG"),
    Seq.Seq("AWGAARCKG"),  # Note no U or T
    Seq.Seq("".join(ambiguous_rna_values)),
    Seq.Seq("".join(ambiguous_dna_values)),
    Seq.Seq("AWGAARCKG"),
    Seq.Seq("AUGAAACUG"),
    Seq.Seq("ATGAAA-CTG"),
    Seq.Seq("ATGAAACTGWN"),
    Seq.Seq("AUGAAA==CUG"),
    Seq.Seq("AUGAAACUGWN"),
    Seq.Seq("AUGAAACTG"),  # U and T
    Seq.MutableSeq("ATGAAACTG"),
    Seq.MutableSeq("AUGaaaCUG"),
示例#20
0
def get_micro_homology_features(gene_names, learn_options, X):
    """Compute microhomology and out-of-frame scores for every guide in *X*.

    For each unique gene, the gene sequence is fetched with
    ``util.get_gene_sequence`` and each guide of that gene is located in it
    (trying the other strand when the first search fails).  The guide plus 9
    nucleotides to its left and 21 to its right is passed to
    ``microhomology.compute_score``.  Guides that cannot be located in either
    orientation get a score of 0 for both features.

    Args:
        gene_names: pandas Series of gene names, aligned row-for-row with *X*.
        learn_options: Not used by this function.
        X: DataFrame with a ``'30mer'`` (guide sequence) and a ``'Strand'``
            column.

    Returns:
        A float DataFrame indexed like *X* with columns ``mh_score`` and
        ``oof_score``.

    Raises:
        AssertionError: when a located guide is too close to either end of the
            gene for the flanking windows to have their full length.
    """
    # originally was flipping the guide itself as necessary, but now flipping the gene instead

    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21
    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        # ``np.where`` yields row POSITIONS, so every lookup below is
        # positional (``.iloc``).  The removed ``.ix`` accessor mixed label
        # and positional semantics depending on the index type.
        guide_inds = np.where(gene_names.values == gene)[0]
        print("getting microhomology for all %d guides in gene %s" %
              (len(guide_inds), gene))
        for ps in guide_inds:
            guide_seq = Seq.Seq(X['30mer'].iloc[ps])
            strand = X['Strand'].iloc[ps]
            # NOTE: the orientation of gene_seq carries over from one guide to
            # the next; the fallback search below covers the other strand.
            if strand == 'sense':
                gene_seq = gene_seq.reverse_complement()
            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide not found in either orientation
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                assert gene_seq[ind:guide_end] == guide_seq, "match not right"
                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end +
                                                k_mer_length_right)]

                # ``Seq.tostring()`` no longer exists in current Biopython;
                # ``len`` of a Seq gives the same number.
                assert len(left_win) == k_mer_length_left
                assert len(right_win) == k_mer_length_right

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                assert len(sixtymer) == 60, "should be of length 60"
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')
示例#21
0
 def test_translation_wrong_type(self):
     """Check translate() raises ValueError for ``table=ambiguous_dna_complement``."""
     sequence = Seq.Seq("ATCGTA")
     self.assertRaises(
         ValueError, sequence.translate, table=ambiguous_dna_complement
     )
    orf = orf_dict[orf_key]

    # Get ref genome pos
    bp = pd.Series(orf.index, name='genome_pos')
    bp.index = bp.index + 1

    # Get codon position
    codon_number = pd.Series(bp.index / 3)
    codon_number = codon_number.apply(math.ceil)

    pos_in_codon = pd.Series(bp.index % 3).replace({0: 3})

    # Get reference protein sequence
    seq = ''.join(list(orf['Ref']))
    seq = seq.upper()
    seq = Seq.Seq(seq)
    prot_seq = seq.translate()
    prot_seq = pd.Series(list(prot_seq))
    prot_seq.index = prot_seq.index + 1

    # match AA to codon number
    ref_AA = codon_number.apply(
        func=lambda x: prot_seq[x])  # -1 to match 0-based indexing

    to_concat = pd.DataFrame({
        'bp': bp.reset_index(drop=True),
        'codon_number': codon_number,
        'pos_in_codon': pos_in_codon,
        'ref_AA': ref_AA
    })
    orf_df = pd.concat([to_concat, orf.reset_index(drop=True)], axis=1)
示例#23
0
 def setUp(self):
     """Create the sequence fixtures used by the tests of this case.

     ``self.s`` is a single Seq; ``self.dna``, ``self.rna``, ``self.nuc`` and
     ``self.protein`` are lists of Seq / MutableSeq objects; and
     ``self.test_chars`` mixes plain strings with Seq objects.
     """
     seq = Seq.Seq
     mutable = Seq.MutableSeq

     self.s = seq("TCAAAAGGATGCATCATG")
     self.dna = [seq("ATCG"), seq("gtca"), mutable("GGTCA"), seq("CTG-CA")]
     self.rna = [
         seq("AUUUCG"),
         mutable("AUUCG"),
         seq("uCAg"),
         mutable("UC-AG"),
         seq("U.CAG"),
     ]
     self.nuc = [seq("ATCG")]

     protein = [
         seq(text)
         for text in ("ATCGPK", "atcGPK", "T.CGPK", "T-CGPK", "MEDG-KRXR*")
     ]
     protein.append(mutable("ME-K-DRXR*XU"))
     protein.extend(seq(text) for text in ("MEDG-KRXR@", "ME-KR@", "MEDG.KRXR@"))
     self.protein = protein

     self.test_chars = ["-", seq("-"), seq("*"), "-X@"]
示例#24
0
def str2seq(s, prt=False):
    """Wrap the string *s* in a ``Seq.Seq`` carrying a generic alphabet.

    Args:
        s: The sequence text.
        prt: When true the sequence is tagged as protein, otherwise as DNA.

    Returns:
        A ``Seq.Seq`` built from *s* with ``Alphabet.generic_protein`` or
        ``Alphabet.generic_dna``.
    """
    # An alphabet INSTANCE is needed in both branches.  The protein branch
    # used to pass the ``ProteinAlphabet`` class itself, unlike the DNA branch.
    alpha = Alphabet.generic_protein if prt else Alphabet.generic_dna
    return Seq.Seq(s, alpha)
示例#25
0
 def test_append_nucleotides(self):
     """Appending one Seq to ``self.test_chars`` must leave it with five items."""
     extra = Seq.Seq("A")
     self.test_chars.append(extra)
     size_after = len(self.test_chars)
     self.assertEqual(5, size_after)
from Bio import SeqIO, Seq
import sys

for rec in SeqIO.parse(open(sys.argv[1]), "fasta"):
    seqstr = str(rec.seq)
    seqstr = '-------------------------------------------' + seqstr + '-------------------------------------------------------------------------------------------------------------'
    rec.seq = Seq.Seq(seqstr)
    SeqIO.write([rec], sys.stdout, "fasta")
示例#27
0
 def setUp(self):
     """Create the sequence fixtures used by the tests of this case.

     Each of ``self.dna``, ``self.rna``, ``self.nuc`` and ``self.protein`` is
     a list of Seq / MutableSeq objects that ends with one plain string.
     """
     seq = Seq.Seq
     mutable = Seq.MutableSeq

     self.dna = [
         seq("ATCG"),
         seq("gtca"),
         mutable("GGTCA"),
         seq("CTG-CA"),
     ] + ["TGGTCA"]
     self.rna = [
         seq("AUUUCG"),
         mutable("AUUCG"),
         seq("uCAg"),
         mutable("UC-AG"),
         seq("U.CAG"),
     ] + ["UGCAU"]
     self.nuc = [seq("ATCG"), "UUUTTTACG"]

     protein = [
         seq(text)
         for text in ("ATCGPK", "atcGPK", "T.CGPK", "T-CGPK", "MEDG-KRXR*")
     ]
     protein.append(mutable("ME-K-DRXR*XU"))
     protein.append("TEDDF")
     self.protein = protein
示例#28
0
def SwissIterator(handle):
    """Yield one SeqRecord per entry of a Swiss-Prot/UniProt flat file.

    Each entry, running from its ID line to the closing ``//``, is turned
    into a SeqRecord carrying the entry's annotation and features.

    The flat file "swiss" format handled here is the one used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" was chosen for consistency with BioPerl and EMBOSS.
    See also the SeqIO support for the "uniprot-xml" format.

    This function is not meant to be called directly; use
    Bio.SeqIO.parse(..., format="swiss") instead.
    """
    for entry in SwissProt.parse(handle):
        # Build the SeqRecord skeleton from the parsed SwissProt entry.
        protein = Seq.Seq(entry.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(
            protein,
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*item) for item in entry.features],
        )
        record.description = entry.description

        # Collect unique "database:accession" cross references; entries with
        # fewer than two fields are ignored.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                tag = "%s:%s" % (xref[0], xref[1])
                if tag not in record.dbxrefs:
                    record.dbxrefs.append(tag)

        notes = record.annotations
        notes['accessions'] = entry.accessions
        if entry.protein_existence:
            notes['protein_existence'] = entry.protein_existence
        if entry.created:
            notes['date'] = entry.created[0]
            notes['sequence_version'] = entry.created[1]
        if entry.sequence_update:
            notes['date_last_sequence_update'] = entry.sequence_update[0]
            # Overrides the version taken from the creation line, if any.
            notes['sequence_version'] = entry.sequence_update[1]
        if entry.annotation_update:
            notes['date_last_annotation_update'] = entry.annotation_update[0]
            notes['entry_version'] = entry.annotation_update[1]
        if entry.gene_name:
            notes['gene_name'] = entry.gene_name
        notes['organism'] = entry.organism.rstrip(".")
        notes['taxonomy'] = entry.organism_classification
        notes['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            notes['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            notes['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            notes['comment'] = "\n".join(entry.comments)

        if entry.references:
            citations = notes['references'] = []
            for source in entry.references:
                citation = SeqFeature.Reference()
                citation.comment = " ".join(
                    "%s=%s;" % pair for pair in source.comments)
                for key, value in source.references:
                    if key == 'PubMed':
                        citation.pubmed_id = value
                    elif key == 'MEDLINE':
                        citation.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are accepted but not stored.
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                citation.authors = source.authors
                citation.title = source.title
                citation.journal = source.location
                citations.append(citation)

        if entry.keywords:
            record.annotations['keywords'] = entry.keywords
        yield record
示例#29
0
 def setUp(self):
     """Store the nucleotide sequence fixture on ``self.s``."""
     fixture = Seq.Seq("TCAAAAGGATGCATCATG")
     self.s = fixture
示例#30
0
def _default_output_name(args, suffix):
    """Build a default output name by replacing the input's last extension.

    The alignment path is used when one was given; otherwise the tree path
    is used, so runs without an alignment still get a usable file name.
    """
    source = args.alignment or args.tree
    return '.'.join(source.split('.')[:-1]) + suffix


def run(args):
    """Read a tree, optionally refine it, and write the tree plus node data.

    Depending on ``args`` this infers a time tree, reconstructs ancestral
    sequences, or only instantiates TreeAnc to name internal nodes. The tree
    is written in newick format and the collected metadata as JSON. For VCF
    input with ancestral/timetree inference a VCF is written as well.

    Args:
        args: namespace providing ``tree``, ``alignment``, ``ancestral``,
            ``timetree``, ``vcf_reference``, ``output``, ``node_data``,
            ``output_vcf``, ``metadata``, ``year_limit``, ``date_fmt``,
            ``root``, ``date_confidence``, ``coalescent``, ``time_marginal``,
            ``branch_length_mode``, ``clock_rate``, ``n_iqd`` and
            ``branchlengths`` (only those relevant to the chosen mode are
            read).

    Returns:
        0 when the tree was written, -1 on any reported error.
    """
    import json

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # Check that a tree is provided and can be read. T must start as None so
    # that a failure in every format is reported instead of raising
    # UnboundLocalError below.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # Wrong format or unreadable file: try the next format.
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        # fake alignment to appease treetime when only using it for naming nodes...
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                id=n.name,
                                name=n.name,
                                description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        ref = compress_seq['reference']
        is_vcf = True
        aln = compress_seq['sequences']
    else:
        aln = args.alignment

    tree_fname = args.output or _default_output_name(args, '_tt.nwk')

    if args.timetree:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        # if anything but a list of seqs, don't send as a list
        if args.root and len(args.root) == 1:
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            # An explicit None test is needed, otherwise Tc can't be set to 0.
            Tc=args.coalescent if args.coalescent is not None else 0.01,
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.append('sequence')
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.append('sequence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')

    node_data_fname = args.node_data or _default_output_name(
        args, '.node_data')
    # json.dump returns None, so its result cannot serve as a success flag;
    # a failure here surfaces as an exception instead.
    with open(node_data_fname, 'w') as ofile:
        json.dump(tree_meta, ofile)

    # If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        vcf_fname = args.output_vcf or _default_output_name(args, '.vcf')
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    return 0 if tree_success else -1