def test_crc_checksum_collision(self):
    """Check that the two light chain strings collide under crc64 only.

    The fixture strings must differ from each other, and the crc32, gcg
    and seguid values must differ too; only crc64 is expected to match.
    """
    # Explicit testing of crc64 collision:
    first = self.str_light_chain_one
    second = self.str_light_chain_two
    self.assertNotEqual(first, second)
    self.assertNotEqual(crc32(first), crc32(second))
    # The one checksum expected to agree for these two inputs.
    self.assertEqual(crc64(first), crc64(second))
    self.assertNotEqual(gcg(first), gcg(second))
    self.assertNotEqual(seguid(first), seguid(second))
def test_crc_checksum_collision(self):
    """Verify the known crc64 collision between the two light chains.

    Asserts that the two fixture strings are different, that crc64
    returns the same value for both, and that crc32, gcg and seguid
    each return different values for them.
    """
    # Explicit testing of crc64 collision:
    chain_a = self.str_light_chain_one
    chain_b = self.str_light_chain_two
    self.assertNotEqual(chain_a, chain_b)
    self.assertNotEqual(crc32(chain_a), crc32(chain_b))
    # crc64 is the colliding checksum, hence assertEqual here.
    self.assertEqual(crc64(chain_a), crc64(chain_b))
    self.assertNotEqual(gcg(chain_a), gcg(chain_b))
    self.assertNotEqual(seguid(chain_a), seguid(chain_b))
def _load_reference(self, reference, rank, bioentry_id):
    """Link one annotated reference to a bioentry in the database (PRIVATE).

    An existing ``reference`` row is reused when one can be found, looking
    first by MEDLINE accession, then by PUBMED accession, and finally by a
    crc64 checksum of the authors, title and journal text. If none is
    found a new row is inserted. A ``bioentry_reference`` row is then
    always inserted to tie the reference to the bioentry.

    Arguments:
     - reference - object providing the medline_id, pubmed_id, authors,
       title, journal and location attributes read below
     - rank - position of the reference; stored as ``rank + 1``
     - bioentry_id - corresponding database identifier

    """
    matches = None
    if reference.medline_id:
        matches = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id"
            " FROM reference JOIN dbxref USING (dbxref_id)"
            " WHERE dbname = 'MEDLINE' AND accession = %s",
            (reference.medline_id,),
        )
    if not matches and reference.pubmed_id:
        matches = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id"
            " FROM reference JOIN dbxref USING (dbxref_id)"
            " WHERE dbname = 'PUBMED' AND accession = %s",
            (reference.pubmed_id,),
        )
    if not matches:
        # Fall back to a checksum over the descriptive fields; missing
        # fields are represented by the "<undef>" placeholder.
        fields = (reference.authors, reference.title, reference.journal)
        crc = crc64("".join([field or "<undef>" for field in fields]))
        matches = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id FROM reference WHERE crc = %s",
            (crc,),
        )

    if matches:
        reference_id = matches[0]
    else:
        # Nothing matched, so the checksum branch above has run and
        # ``crc`` is bound.
        if reference.medline_id:
            dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
        elif reference.pubmed_id:
            dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
        else:
            dbxref_id = None
        # The location/journal field cannot be Null, so default
        # to an empty string rather than None:
        journal = reference.journal or ""
        title = reference.title or None
        authors = reference.authors or None
        self.adaptor.execute(
            "INSERT INTO reference (dbxref_id, location,"
            " title, authors, crc)"
            " VALUES (%s, %s, %s, %s, %s)",
            (dbxref_id, journal, title, authors, crc),
        )
        reference_id = self.adaptor.last_id("reference")

    if reference.location:
        first_span = reference.location[0]
        # Positions are converted via str() before int(); the start is
        # stored offset by one (presumably zero- to one-based - confirm).
        start = 1 + int(str(first_span.start))
        end = int(str(first_span.end))
    else:
        start = end = None

    link_sql = (
        "INSERT INTO bioentry_reference (bioentry_id, reference_id,"
        " start_pos, end_pos, rank)"
        " VALUES (%s, %s, %s, %s, %s)"
    )
    self.adaptor.execute(
        link_sql, (bioentry_id, reference_id, start, end, rank + 1)
    )
def seq_checksums(
    self,
    seq_str,
    exp_crc32,
    exp_crc64,
    exp_gcg,
    exp_seguid,
    exp_simple_LCC,
    exp_window_LCC,
):
    """Compare checksum and LCC results for seq_str with expected values.

    Every check is repeated for three forms of the same sequence: the
    plain string, a Seq and a MutableSeq (both built with
    single_letter_alphabet). All comparisons use assertEqual.
    """
    variants = [
        seq_str,
        Seq(seq_str, single_letter_alphabet),
        MutableSeq(seq_str, single_letter_alphabet),
    ]
    for candidate in variants:
        self.assertEqual(exp_crc32, u_crc32(candidate))
        self.assertEqual(exp_crc64, crc64(candidate))
        self.assertEqual(exp_gcg, gcg(candidate))
        self.assertEqual(exp_seguid, seguid(candidate))
        self.assertEqual(exp_simple_LCC, simple_LCC(candidate))
        self.assertEqual(exp_window_LCC, windowed_LCC(candidate))
def seq_checksums(
    self,
    seq_str,
    exp_crc32,
    exp_crc64,
    exp_gcg,
    exp_seguid,
    exp_simple_LCC,
    exp_window_LCC,
):
    """Compare checksum and LCC results for seq_str with expected values.

    Every check is repeated for three forms of the same sequence: the
    plain string, a Seq and a MutableSeq (both built with
    single_letter_alphabet). The checksums must match exactly; the LCC
    values are compared to two decimal places.
    """
    variants = [
        seq_str,
        Seq(seq_str, single_letter_alphabet),
        MutableSeq(seq_str, single_letter_alphabet),
    ]
    for candidate in variants:
        self.assertEqual(exp_crc32, u_crc32(candidate))
        self.assertEqual(exp_crc64, crc64(candidate))
        self.assertEqual(exp_gcg, gcg(candidate))
        self.assertEqual(exp_seguid, seguid(candidate))
        self.assertAlmostEqual(exp_simple_LCC, lcc_simp(candidate), places=2)
        # Second argument to lcc_mult is presumably the window size - confirm.
        windowed = lcc_mult(candidate, 20)
        # Check the lengths first, since zip() would silently truncate.
        self.assertEqual(len(exp_window_LCC), len(windowed))
        for expected, observed in zip(exp_window_LCC, windowed):
            self.assertAlmostEqual(expected, observed, places=2)
###################
# Example of crc64 collision from Sebastian Bassi using the
# immunoglobulin lambda light chain variable region from Homo sapiens
# Both sequences share the same CRC64 checksum: 44CAAD88706CC153
str_light_chain_one = (
    "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
    "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
    "YCSSYAGSSTLVFGGGTKLTVL"
)
str_light_chain_two = (
    "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
    "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
    "YCCSYAGSSTWVFGGGTKLTVL"
)

# Explicit testing of crc64 collision:
# the strings differ, yet only crc64 gives the same value for both.
assert str_light_chain_one != str_light_chain_two
assert crc32(str_light_chain_one) != crc32(str_light_chain_two)
assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
assert gcg(str_light_chain_one) != gcg(str_light_chain_two)
assert seguid(str_light_chain_one) != seguid(str_light_chain_two)

###########################
# main checksum/LCC tests #
###########################

# Print some output, which the test harness will check
examples = [
    str_light_chain_one,
    str_light_chain_two,
    "ATGCGTATCGATCGCGATACGATTAGGCGGAT",
]

for i, seq_str in enumerate(examples):
    # A single parenthesised argument prints identically whether print is
    # the Python 2 statement or the Python 3 function.
    print("Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10]))
    # Avoid cross platforms with printing floats by doing conversion explicitly
def _load_reference(self, reference, rank, bioentry_id):
    """Store the link between a bioentry and one of its references (PRIVATE).

    Looks for an existing ``reference`` row by MEDLINE accession, then by
    PUBMED accession, then by a crc64 checksum of the authors, title and
    journal text. If no row is found, one is inserted. In every case a
    ``bioentry_reference`` row is inserted afterwards.

    Arguments:
     - reference - object providing the medline_id, pubmed_id, authors,
       title, journal and location attributes read below
     - rank - position of the reference; stored as ``rank + 1``
     - bioentry_id - corresponding database identifier

    """
    found = None
    if reference.medline_id:
        found = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id"
            " FROM reference JOIN dbxref USING (dbxref_id)"
            " WHERE dbname = 'MEDLINE' AND accession = %s",
            (reference.medline_id,),
        )
    if not found and reference.pubmed_id:
        found = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id"
            " FROM reference JOIN dbxref USING (dbxref_id)"
            " WHERE dbname = 'PUBMED' AND accession = %s",
            (reference.pubmed_id,),
        )
    if not found:
        # Last resort: checksum of the descriptive text, with "<undef>"
        # standing in for any empty field.
        parts = [
            text or "<undef>"
            for text in (reference.authors, reference.title, reference.journal)
        ]
        crc = crc64("".join(parts))
        found = self.adaptor.execute_and_fetch_col0(
            "SELECT reference_id FROM reference WHERE crc = %s",
            (crc,),
        )

    if found:
        reference_id = found[0]
    else:
        # Reaching here means the checksum lookup ran, so ``crc`` is set.
        if reference.medline_id:
            dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
        elif reference.pubmed_id:
            dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
        else:
            dbxref_id = None
        # The location/journal field cannot be Null, so default
        # to an empty string rather than None:
        journal = reference.journal or ""
        title = reference.title or None
        authors = reference.authors or None
        self.adaptor.execute(
            "INSERT INTO reference (dbxref_id, location,"
            " title, authors, crc)"
            " VALUES (%s, %s, %s, %s, %s)",
            (dbxref_id, journal, title, authors, crc),
        )
        reference_id = self.adaptor.last_id("reference")

    if reference.location:
        span = reference.location[0]
        # Positions are converted via str() before int(); the start is
        # stored offset by one (presumably zero- to one-based - confirm).
        start = 1 + int(str(span.start))
        end = int(str(span.end))
    else:
        start = end = None

    insert_link = (
        "INSERT INTO bioentry_reference (bioentry_id, reference_id,"
        " start_pos, end_pos, rank)"
        " VALUES (%s, %s, %s, %s, %s)"
    )
    self.adaptor.execute(
        insert_link, (bioentry_id, reference_id, start, end, rank + 1)
    )
###################
# Example of crc64 collision from Sebastian Bassi using the
# immunoglobulin lambda light chain variable region from Homo sapiens
# Both sequences share the same CRC64 checksum: 44CAAD88706CC153
str_light_chain_one = (
    "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
    "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
    "YCSSYAGSSTLVFGGGTKLTVL"
)
str_light_chain_two = (
    "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
    "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
    "YCCSYAGSSTWVFGGGTKLTVL"
)

# Explicit testing of crc64 collision:
# the strings differ, yet only crc64 gives the same value for both.
assert str_light_chain_one != str_light_chain_two
assert crc32(str_light_chain_one) != crc32(str_light_chain_two)
assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
assert gcg(str_light_chain_one) != gcg(str_light_chain_two)
assert seguid(str_light_chain_one) != seguid(str_light_chain_two)

###########################
# main checksum/LCC tests #
###########################

# Print some output, which the test harness will check
examples = [
    str_light_chain_one,
    str_light_chain_two,
    "ATGCGTATCGATCGCGATACGATTAGGCGGAT",
]

for i, seq_str in enumerate(examples):
    # A single parenthesised argument prints identically whether print is
    # the Python 2 statement or the Python 3 function.
    print("Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10]))
    # Avoid cross platforms with printing floats by doing conversion explicitly