def process(in_filename, out_filename):
    """Translate the sequence stored in one file and write it to another.

    The input file is parsed with ``utils.load`` into a header and a
    sequence. The output file receives the header on its first line,
    followed by the items produced by
    ``utils.prepare_subsequences(utils.translate(sequence), 80)`` joined
    with newlines. No trailing newline is written after the last item.

    The input is read, closed and translated before the output is opened,
    so a failure while loading or translating neither creates nor
    truncates the output file. Both files are closed even when an
    exception is raised.

    Args:
        in_filename: Path of the file to read.
        out_filename: Path of the file to create or overwrite.
    """
    with open(in_filename) as in_file:
        header, sequence = utils.load(in_file)

    # Do all the fallible computation first; the output file is only
    # touched once there is something to write into it.
    chunks = utils.prepare_subsequences(utils.translate(sequence), 80)

    with open(out_filename, 'w') as out_file:
        out_file.write(header + '\n')
        out_file.write('\n'.join(chunks))
def create_from_sequence(sequence):
    """Build an ``HMM`` whose parameters are estimated from ``sequence``.

    The distinct elements of ``sequence`` become the hidden states, and the
    distinct values of ``utils.translate(state)`` become the observations.

    * ``transitions[a][b]`` is the fraction of occurrences of ``a`` (as a
      non-final element) that are immediately followed by ``b``. A state
      that is never followed by anything keeps an all-zero row.
    * ``emissions[state][obs]`` is ``1.0`` when ``obs`` equals the
      translation of ``state`` and ``0.0`` otherwise.

    Args:
        sequence: A sliceable sequence of hashable items accepted by
            ``utils.translate``.

    Returns:
        The ``HMM`` instance, constructed from the state and observation
        lists, with its ``emissions`` and ``transitions`` attributes set.
    """
    states = list(set(sequence))

    # Translate every distinct state exactly once and reuse the result for
    # both the observation alphabet and the emission matrix.
    translated = {state: utils.translate(state) for state in states}
    observations = list(set(translated[state] for state in states))

    hmm = HMM(states, observations)

    # Count how often each state is directly followed by each other state.
    transitions = {src: {dst: 0.0 for dst in states} for src in states}
    for current, following in zip(sequence, sequence[1:]):
        transitions[current][following] += 1.0

    def normalize(row):
        # Scale a row of counts so it sums to 1; an all-zero row is left
        # as zeros instead of dividing by zero.
        total = float(sum(row.values()))
        if total == 0:
            total = 1
        return {key: count / total for key, count in row.items()}

    transitions = {src: normalize(row) for src, row in transitions.items()}

    # Each state emits its own translation with certainty.
    emissions = {
        state: {
            obs: 1.0 if translated[state] == obs else 0.0
            for obs in observations
        }
        for state in states
    }

    hmm.emissions = emissions
    hmm.transitions = transitions
    return hmm
def check_translation(gene, aminoacid):
    """Verify that ``utils.translate(gene)`` equals ``aminoacid``.

    Args:
        gene: Value passed to ``utils.translate``.
        aminoacid: Expected translation result.

    Raises:
        AssertionError: If the translation is not equal to ``aminoacid``.
    """
    translated = utils.translate(gene)
    # Raise explicitly instead of using ``assert`` so the check still runs
    # when Python is started with -O, and so a failure reports the values.
    if not translated == aminoacid:
        raise AssertionError(
            'translate(%r) returned %r, expected %r'
            % (gene, translated, aminoacid)
        )