Exemplo n.º 1
0
def going_beyond():
    """Return the longest stretch shared by nitrogenase and any metagenome entry.

    The nitrogenase sequence and the metagenome are loaded through the
    ``load`` module. The second element of every metagenome entry is searched
    for the longest run of consecutive characters that also occurs in the
    nitrogenase sequence.

    An earlier version advanced the metagenome position by the length of each
    partial match. That skipped overlapping start positions and could miss a
    longer match (for example "ABABC" inside "ABABABC" was reported as
    "ABAB"). Every start position is now considered.

    Returns:
        The longest common substring found, or ``""`` when nothing is shared.
        On ties, the match found first is kept (earliest metagenome entry,
        then earliest start position in the nitrogenase sequence).

    Note:
        The search relies on the ``in`` substring test, so both the
        nitrogenase sequence and each entry's sequence are expected to be
        ``str`` objects.
    """
    from load import load_nitrogenase_seq
    nitrogenase = load_nitrogenase_seq()

    from load import load_metagenome
    metagenome = load_metagenome()

    longest_snippet = ""
    total = len(nitrogenase)
    for entry in metagenome:
        sequence = entry[1]
        for start in range(total):
            # Only a match strictly longer than the current best matters, so
            # begin with a window one character longer than the best so far.
            # If that window is absent, nothing longer can start here either.
            end = start + len(longest_snippet) + 1
            while end <= total and nitrogenase[start:end] in sequence:
                longest_snippet = nitrogenase[start:end]
                end += 1
    return longest_snippet
Exemplo n.º 2
0
def substring_checkc():
    """Print every entry of the loaded metagenome, one entry per call to print.

    The nitrogenase sequence is loaded too, but it is not compared against
    anything yet. The function returns None.
    """
    # Loaded for parity with the other helpers; the result is not used here.
    load_nitrogenase_seq()

    # Per the original author's note, the metagenome is a list of tuples
    # holding the name of a sequence followed by the sequence itself.
    for entry in load_metagenome():
        print(entry)
Exemplo n.º 3
0
def nitrogenase_substring():
    """Return the name of the best-scoring metagenome entry.

    Every metagenome entry is scored with
    ``longest_substring(entry_sequence, nitrogenase_sequence)`` and the first
    element (the name) of the entry with the maximum score is returned.

    The original author noted that a run took over an hour and that the
    result was never verified.

    NOTE(review): ``max`` ranks entries by the raw value returned by
    ``longest_substring``. If that helper returns the substring itself rather
    than its length, the ranking is lexicographic, not by length -- confirm
    against the helper's definition.

    Raises:
        ValueError: if the metagenome is empty (``max`` of an empty list).
    """
    import load
    nit_seq = load.load_nitrogenase_seq()
    metagenomes = load.load_metagenome()

    def score(pair):
        # pair is (entry name, longest_substring result)
        return pair[1]

    scored = [
        (entry[0], longest_substring(entry[1], nit_seq))
        for entry in metagenomes
    ]
    best = max(scored, key=score)
    return best[0]
Exemplo n.º 4
0
import random
from amino_acids import aa, codons, aa_table   # you may find these useful

# Load the metagenome once, at import time, into the module-level name
# `metagenome`.

from load import load_metagenome
metagenome = load_metagenome()
# metagenome = 'ATGGGAAAACTCCGGCAGATCGCTTTCTACGGCAAGGGCGGGATCGGCAAGTCGACGACCTCGCAGAACACCCTCGCGGCACTGGTCGAGATGGGTCAGAAGATCCTCATCGTCGGCTGCGATCCCAAGGCCGACTCGACCCGCCTGATCCTGAACACCAAGCTGCAGGACACCGTGCTTCACCTCGCCGCCGAAGCGGGCTCCGTCGAGGATCTCGAACTCGAGGATGTGGTCAAGATCGGCTACAAGGGCATCAAATGCACCGAAGCCGGCGGGCCGGAGCCGGGCGTGGGCTGCGCGGGCCGCGGCGTCATCACCGCCATCAACTTCCTGGAAGAGAACGGCGCCTATGACGACGTCGACTACGTCTCCTACGACGTGCTGGGCGACGTGGTCTGCGGCGGCTTCGCCATGCCGATCCGCGAGAACAAGGCGCAGGAAATCTACATCGTCATGTCGGGCGAGATGATGGCGCTCTATGCGGCCAACAACATCGCCAAGGGCATCCTGAAATACGCGAACTCGGGCGGCGTGCGCCTCGGCGGCCTGATCTGCAACGAGCGCAAGACCGACCGCGAGCTGGAACTGGCCGAGGCCCTCGCCGCGCGTCTGGGCTGCAAGATGATCCACTTCGTTCCGCGCGACAATATCGTGCAGCACGCCGAGCTCCGCCGCGAGACGGTCATCCAGTATGCGCCCGAGAGCAAGCAGGCGCAGGAATATCGCGAACTGGCCCGCAAGATCCACGAGAACTCGGGCAAGGGCGTGATCCCGACCCCGATCACCATGGAAGAGCTGGAAGAGATGCTGATGGATTTCGGCATCATGCAGTCCGAGGAAGACCGGCTCGCCGCCATCGCCGCCGCCGAGGCCTGA'



# Load the nitrogenase sequence once, at import time, into the module-level
# name `nitrogenase`.
# NOTE(review): `load_seq` is imported but not used in the visible part of
# this file -- confirm whether it is still needed.

from load import load_seq
from load import load_nitrogenase_seq
nitrogenase = load_nitrogenase_seq()


def get_complement(nucleotide):
    """ Returns the complementary nucleotide
        nucleotide: a nucleotide (A, C, G, or T) represented as a string
        returns: the complementary nucleotide
    >>> get_complement('A')
    'T'
    >>> get_complement('C')
    'G'
    >>> get_complement('G')
    'C'
    >>> get_complement('T')
    'A'
    >>> get_complement('L')
Exemplo n.º 5
0
from load import load_nitrogenase_seq
from load import load_metagenome


def longestSubstrings(meta, nitrogen, min_length=20):
    """Find, per metagenome entry, the longest piece of ``nitrogen`` it contains.

    Args:
        meta: iterable of entries whose element 0 is a name and whose
            element 1 is the sequence string to search in.
        nitrogen: the reference sequence whose substrings are looked up.
        min_length: smallest match length that is reported (default 20,
            the threshold that used to be hard-coded).

    Returns:
        A list of ``[name, length, substring]`` lists, one for every entry
        whose longest match is at least ``min_length`` characters, in the
        order of ``meta``. On ties within an entry, the match that starts
        earliest in ``nitrogen`` is kept.
    """
    ans = []
    total = len(nitrogen)
    for m in meta:
        name, sequence = m[0], m[1]
        length = 0
        longest = ''
        for x in range(total):
            # Only a strictly longer match can replace the current best, so
            # start one past it; if that window is absent from the sequence,
            # no longer window starting at x can be present either.
            span = length + 1
            # `<=` (not `<`) so a match may include the last character of
            # `nitrogen`; the bound is tested first so the slice is never
            # silently truncated at the end of the string.
            while x + span <= total and nitrogen[x:x + span] in sequence:
                length = span
                longest = nitrogen[x:x + span]
                span += 1
        if length >= min_length:
            ans.append([name, length, longest])
    return ans


# Script entry point: load both data sets and print the matches found.
if __name__ == '__main__':
    # Remove newline characters from the loaded nitrogenase sequence before
    # it is used for substring matching.
    nitrogen = load_nitrogenase_seq().replace('\n', '')
    meta = load_metagenome()
    print(longestSubstrings(meta, nitrogen))
Exemplo n.º 6
0
import pickle
import sys
from distance import levenshtein
from gene_finder import *

# Load the metagenome once, at import time, into the module-level name
# `metagenome`.
from load import load_metagenome
metagenome = load_metagenome()

# pauls_seq = 'GCCCGGACATTCTACATCTCCGCGAAAACACACACTTTTTCGTCTCCGGCGAAGCTTGGCACGCTCGTTGCAAAACAGGGATCAGCAAGGCGAGGGATGGTTGGCCGAGCAGTTACTGCAAAGGGCAACGTCCGCATCTGAGCCGTGCGACGGTTTTGAACGGAAGAAGGCTGCGCCTCGGCGCAAATCGATCAAGCGGCATTAGGTCAACGGAGAGAAAACATGGCACTTCGGCAAATCGCATTCTACGGCAAGGGCGGCATCGGCAAGTCGACCACCTCGCAGAACACCCTCGCGGCGCTGGTTGAGATGGGTCAGAAGATCCTGATCGTCGGCTGCGACCCCAAGGCGGACTCCACCCGTCTGATCCTCAACACCAAGATGCAGGACACGGTGCTGAGCCTCGCCGCGGAAGCGGGTTCGGTGGAAGACCTCGAACTCGAAGACGTGATGAAGATCGGCTACAAGGGCATCAAGTGCACCGAAGCCGGTGGCCCGGAGCCGGGCGTCGGCTGCGCCGGCCGCGGCGTTATCACCGCGATCAACTTCCTCGAAGAAAACGGCGCCTATGAAGACGTCGACTACGTCTCCTACGACGTGCTCGGCGACGTGGTGTGCGGCGGCTTCGCGATGCCGATCCGTGAAAACAAGGCGCAGGAAATCTACATCGTCATGTCCGGCGAGATGATGGCGCTGTATGCCGCCAACAACATCTCCAAGGGCATTCTGAAGTACGCTTCGTCGGGCGGCGTCCGTCTCGGCGGCCTGATCTGCAACGAGCGCCAGACCGACCGCGAGCTCGACCTCGCCGAAGCGCTGGCCAAGAAGCTGAACTCGAAGCTGATCCACTTCGTGCCGCGCGACAATATCGTGCAGCACGCCGAGCTGCGCCGCCAGACCGTGATCCAGTACGCGCCCGACAGCCAGCAGGCTAAGGAATATCGCGCCCTGGCCAACAAGGTCCATGCCAACTGCGGCAACGGCACCATCCCGACCCCGATCACCATGGAAGAGCTGGAAGAGATGCTGCTCGACTTCGGCATCATGAAGACCGAGGAGCAGCAGCTCGCCGAGCTCGCCGCCAAGGAAGCCGCCAAGGCGGCCGCGTCCGCCTGATCGCATCAGCCAGGCCGGTCGCCTAGCGCGACCGGCCGCCATCCCGGCGGCCCCAGACACGAGGAACAACGATGAGCACCGCAGTCGCAGAATCCCCCGCGGACATCAAGGAACGTAACAAGAAGCTGATCGGCGAAGTCCTGGAGGCCTATCCGGACAAGTCGGCCAAGCGTCGCGCCAAGCATCTCAACACGTACGACGCCGAGAAGGCGGAGTGCTCGGTCAAGTCCAACATCAAGTCGATCCCGGGCGTGATGACGATCCGCGGTTGCGCCTACGCCGGCTCGAAGGGCGTGGTGTGGGGCCCGATCAAGGACATGGTCCACATCAGCCACGGCCCGGTCGGCTGCGGCCAGTATTCGTGGGGTTCGCGCCGCAACTATTACAAGGGAACCACCGGCGTCGACACTTTCGGCACGATGCAGTTCACCTCCGACTTCCAGGAGAAGGACATCGTTTTCGGCGGTGACAAGAAGCTCGGCAAGATCATCGACGAGATCCAGGAGCTGTTCCCGCTCTCCAAGGGCATCTCGGTGCAGTCGGAATGCCCGATCGGTCTGATCGGCGACGACATCGAGGCGGTCTCCAAGGCCAAGTCGAAGCAGTATGACGGCAAGCCGATCATCCCGGTCCGCTGCGAAGGCTTCCGCGGCGTGTCGCAGTCGCTCGGCCACCACATCGCCAACGACGTGATCCGTGACTGGGTGTTCGACAAGGCCGCCGAGAAGAACGCCGGCTTCCAGTCGACCCCCTACGACGTCGCGATCATCGGCGACTACAACATCGGCGGCGATGCCTGGGCCTCGCGCATCCTGCTCGAGGAAATGGGCCTCCGCGTGATCGCGCAGTGGTCCGGCGACGGCACCATC
# GCGGAGCTGGAGAACACCCCGAAGGCGAAGCTGAACATCCTGCACTGCTACCGCTCGATGAACTACATCACGCGGCACATGGAAGAGAAGTTCGGTATTCCGTGGGTTGAATACAACTTCTTCGGCCCGTCCAAGATCGA'

# Load the nitrogenase sequence once, at import time, into the module-level
# name `nitrogenase`.
from load import load_nitrogenase_seq
nitrogenase = load_nitrogenase_seq()

# Raise the interpreter's recursion limit to 20000 frames.
# NOTE(review): presumably needed by a recursive helper used later (e.g. the
# imported levenshtein) -- confirm which call requires it.
sys.setrecursionlimit(20000)

# Script entry point: collect long ORF results from a few metagenome entries.
if __name__ == "__main__":

    # NOTE(review): `i` is not used in the visible part of this block.
    i = 0

    # Items kept because they are long enough relative to nitrogenase.
    holder_dna = []
    # Only the metagenome entries at indices 1 through 4 are scanned.
    for a in metagenome[1:5]:
        dna = a[1]
        # find_all_ORFs_both_strands is presumably provided by the
        # `from gene_finder import *` at the top of this snippet -- confirm.
        snippet = find_all_ORFs_both_strands(dna)
        for item in snippet:
            # Keep items whose first element is longer than 80% of the
            # nitrogenase sequence length.
            if len(item[0]) > .8 * len(nitrogenase):
                holder_dna.append(item)

    # NOTE(review): nothing is appended to this list in the visible part of
    # this block.
    data_output = []

    # Length of the first element of every kept item.
    lengths = [len(item[0]) for item in holder_dna]
Exemplo n.º 7
0
from load import load_nitrogenase_seq
from load import load_metagenome

def longestSubstrings(meta, nitrogen, min_length=20):
	"""Report the longest substring of ``nitrogen`` found in each entry of ``meta``.

	Args:
		meta: iterable of entries; element 0 is the entry name and
			element 1 is the sequence string that is searched.
		nitrogen: reference sequence whose substrings are looked for.
		min_length: minimum match length for an entry to be reported
			(default 20, previously a hard-coded constant).

	Returns:
		A list of ``[name, length, substring]`` lists for the entries
		whose longest match reaches ``min_length``, in the order of
		``meta``. When several matches share the maximum length, the one
		starting earliest in ``nitrogen`` is reported.
	"""
	ans = []
	total = len(nitrogen)
	for m in meta:
		name, sequence = m[0], m[1]
		length = 0
		longest = ''
		for x in range(total):
			# A window must beat the current best to matter; if the
			# window of length + 1 is missing, longer ones are too.
			span = length + 1
			# `<=` lets a match reach the final character of
			# `nitrogen`; checking the bound first keeps the slice
			# from being truncated at the end of the string.
			while x + span <= total and nitrogen[x:x + span] in sequence:
				length = span
				longest = nitrogen[x:x + span]
				span += 1
		if length >= min_length:
			ans.append([name, length, longest])
	return ans



# Script entry point: load both data sets and print the matches found.
if __name__ == '__main__':
	# Remove newline characters from the loaded nitrogenase sequence before
	# it is used for substring matching.
	nitrogen = load_nitrogenase_seq().replace('\n', '')
	meta = load_metagenome()
	print(longestSubstrings(meta, nitrogen))
Exemplo n.º 8
0
from load import load_nitrogenase_seq
from load import load_metagenome
#import pypy
from gene_finder import *


# Module-level data, loaded once at import time.
# The loaded nitrogenase sequence is coerced to str.
nitrogenase = str(load_nitrogenase_seq())
metagenome = load_metagenome()
# First metagenome entry, kept separately.
one_metagenome = metagenome[0] #a tuple

def longestSubstringMetagenomeSnippet(enzyme, metagenome_snippet):
    """Return the longest common substring of two sequences.

    Uses the dynamic-programming recurrence in which cell (x, y) holds the
    length of the common run ending at ``enzyme[x - 1]`` and
    ``metagenome_snippet[y - 1]``. Each row depends only on the row before
    it, so two rows are kept instead of the full
    ``(len(enzyme) + 1) x (len(metagenome_snippet) + 1)`` table.

    Args:
        enzyme: the sequence the result is sliced from.
        metagenome_snippet: the sequence to compare against.

    Returns:
        The longest run of consecutive characters present in both inputs,
        taken from ``enzyme``. When several runs share the maximum length,
        the one ending earliest in ``enzyme`` is returned. Returns an empty
        slice when nothing is shared or either input is empty.
    """
    width = len(metagenome_snippet)
    previous_row = [0] * (width + 1)
    longest, x_longest = 0, 0
    for x, enzyme_char in enumerate(enzyme, 1):
        # Cells left at 0 mean "no common run ends here".
        current_row = [0] * (width + 1)
        for y, snippet_char in enumerate(metagenome_snippet, 1):
            if enzyme_char == snippet_char:
                # Extend the run that ended on the upper-left diagonal.
                run = previous_row[y - 1] + 1
                current_row[y] = run
                if run > longest:
                    longest = run
                    x_longest = x
        previous_row = current_row
    return enzyme[x_longest - longest:x_longest]


def longestSubstringAllMetagenomeSnippets(enzyme, metagenome):
	i = 0
	ORF_lengths = []
	genome_names = [] #I think that's what you call a part of a metagenome?
	for name, metagenome_snippet in metagenome:
		ORF = longestSubstringMetagenomeSnippet(enzyme, metagenome_snippet)