def ComputeEdges(self, seqs):
    """Map every index pair (i, j), i < j, to a pairwise Hamming distance.

    Each element of ``seqs`` is read through its ``.seq`` attribute, and
    the distance itself is delegated to ``utils.HammingDistance``.

    Args:
        seqs: Indexable, sized collection of objects exposing ``.seq``.

    Returns:
        A dict keyed by ``(i, j)`` index tuples (``i < j``) whose values
        are the results of ``utils.HammingDistance`` for the two
        corresponding ``.seq`` values. Empty when fewer than two
        sequences are given.
    """
    total = len(seqs)
    return {
        (first, second): utils.HammingDistance(seqs[first].seq,
                                               seqs[second].seq)
        for first in range(total)
        for second in range(first + 1, total)
    }
def _ConstructGraph(self, seqs):
    """Build a complete graph over ``seqs`` together with its edge weights.

    One vertex is added per sequence and one edge per unordered index
    pair ``(i, j)`` with ``i < j``. The weight of each edge is the value
    returned by ``utils.HammingDistance`` for the two sequences.

    Args:
        seqs: Indexable, sized collection of sequences.

    Returns:
        A ``(graph, edge_weights)`` tuple: the populated ``Graph``
        instance and a dict mapping each ``(i, j)`` pair to its weight.
    """
    vertex_count = len(seqs)

    graph = Graph()
    graph.add_vertices(vertex_count)

    # Enumerate the upper triangle once and reuse it for both the weight
    # table and the graph's edge list.
    pairs = [
        (left, right)
        for left in range(vertex_count)
        for right in range(left + 1, vertex_count)
    ]
    weights = {
        (left, right): utils.HammingDistance(seqs[left], seqs[right])
        for left, right in pairs
    }

    graph.add_edges(pairs)
    return graph, weights
def NormalizedHammingDistance(bin_text, keysize):
    """Score how similar consecutive KEYSIZE-byte blocks of bin_text are.

    The text is cut into non-overlapping pairs of adjacent blocks, each
    block being ``keysize * 8`` elements long (the text is indexed per
    bit). The Hamming distance of every pair is summed and normalized by
    the number of blocks actually compared and by the key size.

    The lower the result, the more similar consecutive blocks are; this
    means they were likely encoded with the same set of characters, and
    therefore the key could be KEYSIZE bytes long.

    Leftover data at the end that cannot form a full pair of blocks is
    ignored, and it is also excluded from the normalization so that key
    sizes leaving an unpaired trailing block are not scored as
    artificially good.

    Args:
        bin_text: Sliceable, sized sequence holding one element per bit.
        keysize: Candidate key length in bytes.

    Returns:
        The normalized distance as a float, or ``float("inf")`` when the
        text is too short to contain even one pair of blocks (no
        evidence, so this key size can never rank as the best one).

    Raises:
        ValueError: If ``keysize`` is negative.
        ZeroDivisionError: If ``keysize`` is zero.
    """
    if keysize < 0:
        # A negative size would make the block stride negative and the
        # scan could never terminate meaningfully.
        raise ValueError("keysize must not be negative")

    block_bits = keysize * 8
    pair_bits = block_bits * 2

    # Number of complete, non-overlapping pairs of blocks in the text.
    num_pairs = len(bin_text) // pair_bits
    if num_pairs == 0:
        return float("inf")

    hamming_distance = 0
    for start in range(0, num_pairs * pair_bits, pair_bits):
        middle = start + block_bits
        chunk1 = bin_text[start:middle]
        chunk2 = bin_text[middle:start + pair_bits]
        hamming_distance += utils.HammingDistance(chunk1, chunk2)

    # Normalize by the blocks that took part in a comparison (two per
    # pair) and by the key size. For an even number of blocks this is the
    # same divisor as "number of blocks * key size".
    return hamming_distance / (2 * num_pairs * keysize)
#!/usr/bin/env python
'''
Content: solution to a programming assignment for the Bioinformatics
Algorithms (Part 1) on Coursera.

Associated textbook: "Bioinformatics Algorithms: An Active-Learning
Approach" by Phillip Compeau & Pavel Pevzner.

Assignment: hosted in Stepik.org

Problem Title: Hamming Distance Problem

URL: https://stepik.org/lesson/9/step/3?course=Stepic-Interactive-Text-for-Week-2&unit=8224

Code Challenge:
    Hamming Distance Problem: Compute the Hamming distance between two
    strings.
    Input: Two strings of equal length.
    Output: The Hamming distance between these strings.
'''

import sys
import utils
import numpy as np


def main():
    """Read two sequences from stdin, one per line, and print their distance."""
    # Strip only the line terminator. Slicing off the last character
    # unconditionally would eat a real symbol when the final line has no
    # trailing newline, and would leave a stray '\r' on CRLF input.
    seq1 = sys.stdin.readline().rstrip('\r\n')
    seq2 = sys.stdin.readline().rstrip('\r\n')
    print(utils.HammingDistance(seq1, seq2))


if __name__ == '__main__':
    main()