def test_scale(self):
    """Check `kmer.scale` against counts scaled to a common total.

    The expected counts are built here by multiplying the side with the
    smaller sum of counts by the ratio of the two sums, while the other
    side is multiplied by 1.0.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename_left = self.empty()
    filename_right = self.empty()

    with utils.open_profile(self.profile(counts_left, 8)) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8)) as handle_right, \
            utils.open_profile(filename_left, 'w') as out_left, \
            utils.open_profile(filename_right, 'w') as out_right:
        kmer.scale(handle_left, handle_right, out_left, out_right)

    # Totals are taken before the expected counts are rescaled below.
    total_left = sum(counts_left.values())
    total_right = sum(counts_right.values())
    if total_left < total_right:
        scale_left, scale_right = total_right / total_left, 1.0
    else:
        scale_left, scale_right = 1.0, total_left / total_right

    for word in counts_left:
        counts_left[word] *= scale_left
    for word in counts_right:
        counts_right[word] *= scale_right

    utils.test_profile_file(filename_left, counts_left, 8)
    utils.test_profile_file(filename_right, counts_right, 8)
def namedEntitySimilarityFeatureExtractor(self, originalDocuments,
                                          machineSummary, humanSummaries):
    """Return mean JS divergences between named-entity counts.

    Named entities are extracted with ``self.nerTagger``, passed through
    ``utils.removeStopwords`` and counted with ``utils.counts``.  The
    counts of ``machineSummary`` are compared with ``js.JS_Divergence``
    against the counts of every original document and of every human
    summary.

    Returns a two-item list: the mean divergence over the original
    documents and the mean divergence over the human summaries, each
    replaced by 1 when it exceeds 1.

    Raises ZeroDivisionError when ``originalDocuments`` or
    ``humanSummaries`` is empty, because the mean divides by their length.
    """
    def entities(text):
        # Named entities of `text` after stopword removal.
        return utils.removeStopwords(
            utils.getNamedEntities(self.nerTagger, text))

    peerEntities = entities(machineSummary)
    entitiesOD = [entities(document) for document in originalDocuments]
    entitiesHS = [entities(document) for document in humanSummaries]

    peerCount = utils.counts(peerEntities)
    countsOD = [utils.counts(found) for found in entitiesOD]
    countsHS = [utils.counts(found) for found in entitiesHS]

    divergencesOD = [js.JS_Divergence(peerCount, model) for model in countsOD]
    divergencesHS = [js.JS_Divergence(peerCount, model) for model in countsHS]

    meanOD = sum(divergencesOD) / float(len(countsOD))
    meanHS = sum(divergencesHS) / float(len(countsHS))

    return [1 if meanOD > 1 else meanOD, 1 if meanHS > 1 else meanHS]
def test_ProfileDistance_distance_k8(self):
    """Check the default `ProfileDistance` distance of two 8-mer profiles."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    left = klib.Profile(utils.as_array(left_counts, 8))
    right = klib.Profile(utils.as_array(right_counts, 8))

    measured = kdistlib.ProfileDistance().distance(left, right)
    np.testing.assert_almost_equal(measured, 0.4626209322)
def test_profile_merge(self):
    """Check that `Profile.merge` yields the sum of both count sets."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    target = klib.Profile(utils.as_array(counts_left, 8))
    other = klib.Profile(utils.as_array(counts_right, 8))

    # The merge is done in place on `target`.
    target.merge(other)

    expected = counts_left + counts_right
    utils.test_profile(target, expected, 8)
def test_distance_smooth(self):
    """Check `kmer.distance` output with `do_smooth=True`, precision 3."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts_left, 8, 'left')) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8, 'right')) as handle_right:
        kmer.distance(handle_left, handle_right, report,
                      do_smooth=True, precision=3)

    assert report.getvalue() == 'left right 0.077\n'
def test_distance(self):
    """Check the line `kmer.distance` writes with default settings."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts_left, 8, 'left')) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8, 'right')) as handle_right:
        kmer.distance(handle_left, handle_right, report)

    expected = 'left right %.10f\n' % 0.4626209323
    assert report.getvalue() == expected
def test_count_multi(self):
    """Check `kmer.count` on two FASTA inputs named 'a' and 'b'."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename = self.empty()

    with open(self.fasta(utils.SEQUENCES_LEFT)) as fasta_left, \
            open(self.fasta(utils.SEQUENCES_RIGHT)) as fasta_right, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.count([fasta_left, fasta_right], profile_handle, 8,
                   names=['a', 'b'])

    # Both profiles end up in the same output file, under their names.
    for name, expected in (('a', counts_left), ('b', counts_right)):
        utils.test_profile_file(filename, expected, 8, name=name)
def test_merge(self):
    """Check that `kmer.merge` writes the sum of the two input profiles."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename = self.empty()

    with utils.open_profile(self.profile(counts_left, 8)) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8)) as handle_right, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.merge(handle_left, handle_right, profile_handle)

    expected = counts_left + counts_right
    utils.test_profile_file(filename, expected, 8)
def test_distance_pairwise_name(self):
    """Check `kmer.distance` with `custom_pairwise` given as a function name."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts_left, 8, 'left')) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8, 'right')) as handle_right:
        kmer.distance(handle_left, handle_right, report,
                      precision=3, custom_pairwise='numpy.multiply')

    assert report.getvalue() == 'left right 0.084\n'
def test_distance_smooth_expr(self):
    """Check smoothed `kmer.distance` with a `custom_summary` expression."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts_left, 8, 'left')) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8, 'right')) as handle_right:
        kmer.distance(handle_left, handle_right, report,
                      do_smooth=True, precision=3,
                      custom_summary='np.max(values)')

    assert report.getvalue() == 'left right 0.474\n'
def test_distance_pairwise_expr(self):
    """Check `kmer.distance` with `custom_pairwise` given as an expression."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts_left, 8, 'left')) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8, 'right')) as handle_right:
        kmer.distance(
            handle_left, handle_right, report, precision=3,
            custom_pairwise='abs(left - right) / (left + right + 1000)')

    assert report.getvalue() == 'left right 0.001\n'
def test_cat_prefixes(self):
    """Check that `kmer.cat` prepends the given prefixes to profile names.

    Both input profiles are named 'X'; the output is expected to hold
    them as 'a_X' and 'b_X'.
    """
    counts_a = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_b = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename = self.empty()

    with utils.open_profile(self.profile(counts_a, 8, name='X')) as handle_a, \
            utils.open_profile(self.profile(counts_b, 8, name='X')) as handle_b, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.cat([handle_a, handle_b], profile_handle,
                 prefixes=['a_', 'b_'])

    for name, expected in (('a_X', counts_a), ('b_X', counts_b)):
        utils.test_profile_file(filename, expected, 8, name=name)
def test_distance_matrix_two(self):
    """Check `kdistlib.distance_matrix` output for two profiles 'a' and 'b'."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(counts_left, 8), 'a')
    profile_b = klib.Profile(utils.as_array(counts_right, 8), 'b')
    k_dist = kdistlib.ProfileDistance()
    report = StringIO()

    kdistlib.distance_matrix([profile_a, profile_b], report, 2, k_dist)

    lines = report.getvalue().strip().split('\n')
    assert lines == ['2', 'a', 'b', '0.46']
def test_ProfileDistance_distance_unmodified(self):
    """Check that a balanced distance call leaves both profiles unchanged."""
    counts_a = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_b = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(counts_a, 8))
    profile_b = klib.Profile(utils.as_array(counts_b, 8))

    # The distance value is not checked; only the inputs are re-verified.
    kdistlib.ProfileDistance(do_balance=True).distance(profile_a, profile_b)

    for profile, expected in ((profile_a, counts_a), (profile_b, counts_b)):
        utils.test_profile(profile, expected, 8)
def test_distance_matrix_smooth(self):
    """Check smoothed `kmer.distance_matrix` output for three profiles.

    Profiles 'a' and 'c' are built from the same counts.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    profiles_file = self.multi_profile(
        8, [counts_left, counts_right, counts_left], ['a', 'b', 'c'])
    with utils.open_profile(profiles_file) as handle:
        kmer.distance_matrix(handle, report, do_smooth=True, precision=3)

    lines = report.getvalue().strip().split('\n')
    assert lines == ['3', 'a', 'b', 'c', '0.077', '0.000 0.077']
def test_distance_matrix_pairwise_expr(self):
    """Check `kmer.distance_matrix` with a `custom_pairwise` expression.

    Profiles 'a' and 'c' are built from the same counts.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    profiles_file = self.multi_profile(
        8, [counts_left, counts_right, counts_left], ['a', 'b', 'c'])
    with utils.open_profile(profiles_file) as handle:
        kmer.distance_matrix(
            handle, report, precision=3,
            custom_pairwise='abs(left - right) / (left + right + 1000)')

    lines = report.getvalue().strip().split('\n')
    assert lines == ['3', 'a', 'b', 'c', '0.001', '0.000 0.001']
def test_count_multi_by_record(self):
    """Check `kmer.count` with `by_record=True` on two FASTA inputs.

    Each record is expected as its own profile, named with the input name
    ('a' or 'b'), an underscore, and the record name.
    """
    counts_by_record_left = [utils.counts(record, 8)
                             for record in utils.SEQUENCES_LEFT]
    counts_by_record_right = [utils.counts(record, 8)
                              for record in utils.SEQUENCES_RIGHT]
    # Records are named after their position: '0', '1', ...
    names_left = [str(i) for i in range(len(counts_by_record_left))]
    names_right = [str(i) for i in range(len(counts_by_record_right))]
    filename = self.empty()

    with open(self.fasta(utils.SEQUENCES_LEFT, names=names_left)) as handle_left, \
            open(self.fasta(utils.SEQUENCES_RIGHT, names=names_right)) as handle_right, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.count([handle_left, handle_right], profile_handle, 8,
                   names=['a', 'b'], by_record=True)

    expectations = (('a_', names_left, counts_by_record_left),
                    ('b_', names_right, counts_by_record_right))
    for prefix, names, per_record in expectations:
        for name, counts in zip(names, per_record):
            utils.test_profile_file(filename, counts, 8, name=prefix + name)
def test_merge_custom_name(self):
    """Check `kmer.merge` with `custom_merger` given as a function name.

    With 'numpy.multiply' the expected counts are the per-k-mer products,
    built here over the k-mers present on both sides.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename = self.empty()

    with utils.open_profile(self.profile(counts_left, 8)) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8)) as handle_right, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.merge(handle_left, handle_right, profile_handle,
                   custom_merger='numpy.multiply')

    shared = set(counts_left) & set(counts_right)
    products = {word: counts_left[word] * counts_right[word]
                for word in shared}
    utils.test_profile_file(filename, Counter(products), 8)
def test_distance_matrix_pairwise_name(self):
    """Check `kmer.distance_matrix` with a `custom_pairwise` function name.

    Profiles 'a' and 'c' are built from the same counts.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    report = StringIO()

    profiles_file = self.multi_profile(
        8, [counts_left, counts_right, counts_left], ['a', 'b', 'c'])
    with utils.open_profile(profiles_file) as handle:
        kmer.distance_matrix(handle, report, precision=3,
                             custom_pairwise='numpy.multiply')

    lines = report.getvalue().strip().split('\n')
    assert lines == ['3', 'a', 'b', 'c', '0.084', '1.206 0.084']
def test_distance_matrix_two(self):
    """Check the matrix written by `kdistlib.distance_matrix` for two profiles."""
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    first = klib.Profile(utils.as_array(counts_left, 8), 'a')
    second = klib.Profile(utils.as_array(counts_right, 8), 'b')
    k_dist = kdistlib.ProfileDistance()
    sink = StringIO()

    kdistlib.distance_matrix([first, second], sink, 2, k_dist)

    written = sink.getvalue().strip().split('\n')
    assert written == ['2', 'a', 'b', '0.46']
def test_merge_custom_expr(self):
    """Check `kmer.merge` with `custom_merger` given as an expression.

    The expected counts are the summed counts with every k-mer that occurs
    on both sides removed.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename = self.empty()

    with utils.open_profile(self.profile(counts_left, 8)) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8)) as handle_right, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.merge(
            handle_left, handle_right, profile_handle,
            custom_merger='(left + right) * np.logical_xor(left, right)')

    counts_xor = counts_left + counts_right
    shared = set(counts_left) & set(counts_right)
    for word in shared:
        del counts_xor[word]

    utils.test_profile_file(filename, counts_xor, 8)
def test_count(self):
    """Check that `kmer.count` on one FASTA file writes the expected 8-mers."""
    expected = utils.counts(utils.SEQUENCES, 8)
    filename = self.empty()

    with open(self.fasta(utils.SEQUENCES)) as fasta_handle, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.count([fasta_handle], profile_handle, 8)

    utils.test_profile_file(filename, expected, 8)
def _test_profile_split(self, sequences, length):
    """Check `Profile.split` for k-mers of `length` counted in `sequences`.

    The expected halves are built here: a k-mer that sorts before its
    reverse complement contributes twice its count on the left at its own
    index; one that sorts after contributes twice its count on the right
    at the index of its reverse complement; a palindrome contributes its
    count once to both halves.
    """
    counts = utils.counts(sequences, length)
    profile = klib.Profile(utils.as_array(counts, length))
    left, right = profile.split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == sum(counts.values()) * 2

    indices_left = {}
    indices_right = {}
    indices_palindrome = {}
    for word, count in counts.items():
        revcomp = utils.reverse_complement(word)
        if word == revcomp:
            indices_palindrome[utils.count_index(word)] = count
        elif word < revcomp:
            indices_left[utils.count_index(word)] = count * 2
        else:
            indices_right[utils.count_index(revcomp)] = count * 2

    def expected(strand_indices):
        # Non-zero counts of one half, ordered by index.
        merged = (list(strand_indices.items())
                  + list(indices_palindrome.items()))
        return [count for _, count in sorted(merged)]

    assert [c for c in left if c > 0] == expected(indices_left)
    assert [c for c in right if c > 0] == expected(indices_right)
def test_main_info(self, capsys):
    """Check the text printed by the `info` subcommand of `kmer.main`.

    For the `capsys` fixture, see:
    http://pytest.org/latest/capture.html
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    filename = self.profile(counts, 8, 'a')

    kmer.main(['info', filename])
    out, _ = capsys.readouterr()

    total_kmers = 4**8
    zero_count = total_kmers - len(counts)
    # Every count, padded with a zero for each 8-mer that does not occur.
    all_counts = [0] * zero_count + list(counts.values())

    expected = ''.join([
        'File format version: 1.0.0\n',
        'Produced by: kMer unit tests\n\n',
        'Profile: a\n',
        '- k-mer length: 8 (%d k-mers)\n' % total_kmers,
        '- Zero counts: %i\n' % zero_count,
        '- Non-zero counts: %i\n' % len(counts),
        '- Sum of counts: %i\n' % sum(counts.values()),
        '- Mean of counts: %.3f\n' % np.mean(all_counts),
        '- Median of counts: %.3f\n' % np.median(all_counts),
        '- Standard deviation of counts: %.3f\n' % np.std(all_counts),
    ])
    assert out == expected
def test_positive(self):
    """Check that `kmer.positive` keeps only k-mers present on both sides.

    Each expected output holds that side's own counts, restricted to the
    k-mers that also occur in the other side.
    """
    counts_left = utils.counts(utils.SEQUENCES_LEFT, 8)
    counts_right = utils.counts(utils.SEQUENCES_RIGHT, 8)
    filename_left = self.empty()
    filename_right = self.empty()

    with utils.open_profile(self.profile(counts_left, 8)) as handle_left, \
            utils.open_profile(self.profile(counts_right, 8)) as handle_right, \
            utils.open_profile(filename_left, 'w') as out_left, \
            utils.open_profile(filename_right, 'w') as out_right:
        kmer.positive(handle_left, handle_right, out_left, out_right)

    expected_left = Counter(
        word for word in counts_left.elements() if word in counts_right)
    utils.test_profile_file(filename_left, expected_left, 8)

    expected_right = Counter(
        word for word in counts_right.elements() if word in counts_left)
    utils.test_profile_file(filename_right, expected_right, 8)
def test_profile_from_file(self):
    """Check that `Profile.from_file` loads the stored 4-mer counts."""
    expected = utils.counts(utils.SEQUENCES, 4)
    source = self.profile(expected, 4)

    with utils.open_profile(source, 'r') as profile_handle:
        loaded = klib.Profile.from_file(profile_handle)
        # Verified while the handle is still open.
        utils.test_profile(loaded, expected, 4)
def test_convert(self):
    """Check that `kmer.convert` turns an old-format profile into a new one."""
    expected = utils.counts(utils.SEQUENCES, 8)
    filename = self.empty()

    with open(self.profile_old_format(expected, 8)) as old_handle, \
            utils.open_profile(filename, 'w') as profile_handle:
        kmer.convert([old_handle], profile_handle)

    utils.test_profile_file(filename, expected, 8)
def test_profile_reverse_complement(self):
    """Check `Profile.reverse_complement` against `utils.reverse_complement`.

    Every index in `range(profile.length)` is checked.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    for index in range(profile.length):
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def _test_profile_split(self, sequences, length):
    """Check `Profile.split` for k-mers of `length` counted in `sequences`.

    The expected halves are built here: a k-mer that sorts before its
    reverse complement contributes twice its count on the left at its own
    index; one that sorts after contributes twice its count on the right
    at the index of its reverse complement; a palindrome contributes its
    count once to both halves.
    """
    counts = utils.counts(sequences, length)
    profile = klib.Profile(utils.as_array(counts, length))
    left, right = profile.split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == sum(counts.values()) * 2

    indices_left = {}
    indices_right = {}
    indices_palindrome = {}
    for word, count in counts.items():
        revcomp = utils.reverse_complement(word)
        if word == revcomp:
            indices_palindrome[utils.count_index(word)] = count
        elif word < revcomp:
            indices_left[utils.count_index(word)] = count * 2
        else:
            indices_right[utils.count_index(revcomp)] = count * 2

    def expected(strand_indices):
        # Non-zero counts of one half, ordered by index.
        merged = (list(strand_indices.items())
                  + list(indices_palindrome.items()))
        return [count for _, count in sorted(merged)]

    assert [c for c in left if c > 0] == expected(indices_left)
    assert [c for c in right if c > 0] == expected(indices_right)
def test_profile_reverse_complement_palindrome(self):
    """Check `Profile.reverse_complement` on a profile of 'ACCTAGGT'.

    Every index in `range(profile.length)` is compared with
    `utils.reverse_complement`.
    """
    counts = utils.counts(['ACCTAGGT'], 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    for index in range(profile.length):
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def test_get_balance(self):
    """Check the line `kmer.get_balance` writes at precision 3."""
    counts = utils.counts(utils.SEQUENCES, 8)
    report = StringIO()

    with utils.open_profile(self.profile(counts, 8)) as input_handle:
        kmer.get_balance(input_handle, report, precision=3)

    assert report.getvalue() == '1 0.669\n'
def test_profile_print_counts(self, capsys):
    """Check that `Profile.print_counts` prints one line per 4-mer.

    The expected output lists every 4-mer over 'ACGT' in
    `itertools.product` order, each followed by its count.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.print_counts()
    out, _ = capsys.readouterr()

    words = (''.join(letters)
             for letters in itertools.product('ACGT', repeat=4))
    expected = ''.join('%s %d\n' % (word, counts[word]) for word in words)
    assert out == expected
def test_profile_balance(self):
    """Check `Profile.balance` on the 8-mer counts of `utils.SEQUENCES`.

    The expected counts add each k-mer's count to its reverse complement.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    profile.balance()

    # Built in full before `update`, so `counts` is not changed mid-iteration.
    mirrored = {utils.reverse_complement(word): count
                for word, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 8)
def test_profile_balance_palindrome(self):
    """Check `Profile.balance` on a profile of the 4-mer 'AATT'.

    The expected counts add each k-mer's count to its reverse complement.
    """
    counts = utils.counts(['AATT'], 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.balance()

    # Built in full before `update`, so `counts` is not changed mid-iteration.
    mirrored = {utils.reverse_complement(word): count
                for word, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 4)
def test_profile_reverse_complement_palindrome(self):
    """Compare both reverse-complement routines on a profile of 'ACCTAGGT'.

    For every index in `range(profile.length)`, the DNA string of
    `Profile.reverse_complement` must equal `utils.reverse_complement`
    of that index's DNA string.
    """
    counts = utils.counts(['ACCTAGGT'], 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    for position in range(profile.length):
        from_binary = profile.binary_to_dna(
            profile.reverse_complement(position))
        from_text = utils.reverse_complement(profile.binary_to_dna(position))
        assert from_binary == from_text
def test_profile_reverse_complement(self):
    """Compare both reverse-complement routines on `utils.SEQUENCES` 8-mers.

    For every index in `range(profile.length)`, the DNA string of
    `Profile.reverse_complement` must equal `utils.reverse_complement`
    of that index's DNA string.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    for position in range(profile.length):
        from_binary = profile.binary_to_dna(
            profile.reverse_complement(position))
        from_text = utils.reverse_complement(profile.binary_to_dna(position))
        assert from_binary == from_text
def _test_from_fasta_by_record(self, sequences, k, prefix=None):
    """Check `Profile.from_fasta_by_record` for the given sequences.

    Records are named '0', '1', ... by position.  Each profile is expected
    under the record name, preceded by `prefix` and an underscore when
    `prefix` is truthy.
    """
    counts_by_record = [utils.counts(sequence, k) for sequence in sequences]
    names = [str(i) for i in range(len(counts_by_record))]

    with open(self.fasta(sequences, names=names)) as fasta_handle:
        profiles = klib.Profile.from_fasta_by_record(fasta_handle, k,
                                                     prefix=prefix)
        # Consumed while the file is open, in case the profiles are lazy.
        for name, counts, profile in zip(names, counts_by_record, profiles):
            if prefix:
                prefixed_name = prefix + '_' + name
            else:
                prefixed_name = name
            utils.test_profile(profile, counts, k, name=prefixed_name)