def test_length_of_vectors(new_vectors_path: Path) -> None:
    """
    Create a new database, add a whole bunch of files.
    """
    examples = dict(file_a=to_source_vector(b'print("hello, world!")'),
                    file_b=to_source_vector(b'import sys; sys.exit(0)'),
                    file_c=to_source_vector(b'print(934 * 2 * 3442990 + 1)'))

    # Insert all the examples.
    vectors = Vectors.from_filename(new_vectors_path)
    for name, vector in examples.items():
        vectors[name] = vector
    vectors.disconnect()

    # Reopen it and test the length.
    vectors = Vectors.from_filename(new_vectors_path)

    # Test fetching all of them.
    actual = sum(len(vec) for vec in examples.values())
    assert actual == vectors.length_of_vectors({'file_a', 'file_b', 'file_c'})

    # Check that we can query an empty set.
    assert 0 == vectors.length_of_vectors(())

    # Check that we can query a single item.
    assert len(examples['file_a']) == vectors.length_of_vectors({'file_a'})

    # Check that we can query a subset.
    actual = sum(len(examples[name]) for name in ('file_a', 'file_c'))
    assert actual == vectors.length_of_vectors({'file_a', 'file_c'})

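# NOTE: test_length_of_vectors above and test_creates_file below rely on a
# ``new_vectors_path`` pytest fixture that is not shown in this excerpt.
# A minimal sketch, assuming all it must do is point at a fresh path for the
# database (built on pytest's standard ``tmp_path`` fixture); the real fixture
# may differ.
import pytest
from pathlib import Path


@pytest.fixture
def new_vectors_path(tmp_path: Path) -> Path:
    # A filename in a per-test temporary directory; Vectors.from_filename()
    # is expected to create the database file on first connect.
    return tmp_path / 'vectors.sqlite3'
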
def test_source_vector_unk_conversion():
    problematic_source = b'class _ { # }'
    with pytest.raises(OutOfVocabularyError):
        to_source_vector(problematic_source)

    vector = to_source_vector(problematic_source, oov_to_unk=True)
    assert 5 == len(vector)
    assert current_language.vocabulary.unk_token_index == vector[1] == vector[3]

def test(dirname: Path = None) -> None:
    from sensibility._paths import REPOSITORY_ROOT
    from sensibility.source_vector import to_source_vector

    if dirname is None:
        dirname = REPOSITORY_ROOT / 'tests'
    language.set('java')
    model = KerasDualLSTMModel.from_directory(dirname)
    source = to_source_vector(rb'''
        package ca.ualberta.cs;

        class HelloWorld {
            public static void main(String args[] /* Syntax error, delete token[19] to fix */ ... ) {
                System.out.println("Hello, World!");
            }
        }
    ''')
    answer = model.predict_file(source)
    assert len(answer) == len(source)

    text = language.vocabulary.to_source_text
    for expected, predictions in zip(source, answer):
        actual_fw = text(predictions.forwards.argmax())  # type: ignore
        actual_bw = text(predictions.backwards.argmax())  # type: ignore
        print(f"{actual_fw:>14}\t{actual_bw:>14}\t{text(expected)}")

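# NOTE: the edit tests below (test_insert, test_delete, test_substitution)
# take a pytest fixture named ``c`` that is not shown in this excerpt. Judging
# from calls such as ``to_index(c('}'))`` and ``to_index(c('"fake"'))``, it
# maps the literal text of one token to the abstracted vocabulary entry that
# ``to_index`` accepts (e.g. '"fake"' -> '"string"'). The stand-in below is a
# hand-written sketch covering only the tokens these tests use; the real
# fixture presumably derives the mapping from the language's tokenizer.
import pytest


@pytest.fixture
def c():
    def convert(token_text: str) -> str:
        # Map concrete token text to its open-vocabulary spelling; tokens that
        # are already abstract (e.g. 'ident', '}') pass through unchanged.
        abstractions = {'"fake"': '"string"', '0x1.8p1': '0.0', 'Hello': 'ident'}
        return abstractions.get(token_text, token_text)
    return convert
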
def test_insert(c) -> None:
    source_code = to_source_vector(b"""
        @SuppressWarnings({"fake", 0x1.8p1)
        class Hello {}
    """)
    edit = Insertion(7, to_index(c('}')))
    mutant = edit.apply(source_code)
    expected = b'@ ident ( { "string" , 0.0 } ) class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)

def fix(self, source_file: bytes) -> Sequence[Edit]:
    """
    Produces a ranked sequence of possible edits that will fix the file.
    If there are no possible fixes, the sequence will be empty.
    """
    # Get the file vector for the error'd file.
    file_vector = to_source_vector(source_file, oov_to_unk=True)
    tokens = tuple(language.tokenize(source_file))
    predictions = self.model.predict_file(file_vector)

    # Holds the lowest agreement at each point in the file.
    results: List[IndexResult] = []

    for index, pred in enumerate(predictions):
        vind = file_vector[index]
        token = tokens[index]
        prefix_pred = pred.forwards
        suffix_pred = pred.backwards

        # Figure out the agreement between models, and against the ground
        # truth.
        result = IndexResult(index, file_vector, prefix_pred, suffix_pred,
                             token, vind)
        results.append(result)

    # Rank the results by some metric of similarity defined by IndexResult
    # (the top rank will be LEAST similar).
    ranked_results = tuple(sorted(results, key=float))

    # For the top-k disagreements, synthesize fixes.
    # NOTE: k should be determined by the xentropy of the models!
    fixes = Fixes(file_vector)
    for disagreement in ranked_results[:self.k]:
        pos = disagreement.index
        likely_tokens = disagreement.best_suggestions()

        # Note: the order of these operations SHOULDN'T matter,
        # but typically we only report the first fix that works.
        # Because missing tokens are the most common,
        # we'll try to insert tokens first, THEN delete.

        # Assume a deletion. Let's try inserting some tokens.
        for likely_token in likely_tokens:
            fixes.try_insert(pos, likely_token)

        # Assume an insertion. Let's try removing the offensive token.
        fixes.try_delete(pos)

        # Assume a substitution. Let's try swapping the token.
        for likely_token in likely_tokens:
            fixes.try_substitute(pos, likely_token)

    return tuple(fixes)

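# A hedged usage sketch (not part of the original sources): given a fixer
# object whose class defines fix() above, the top-ranked edit can be applied
# through the same Edit.apply()/to_source_code() interface that the edit tests
# in this file exercise. The helper name ``fix_and_apply`` is invented here
# purely for illustration.
def fix_and_apply(fixer, source: bytes) -> bytes:
    edits = fixer.fix(source)
    if not edits:
        # No plausible fix was found; hand the file back untouched.
        return source
    best = edits[0]  # fix() returns candidate edits ranked best-first
    vector = to_source_vector(source, oov_to_unk=True)
    return best.apply(vector).to_source_code()
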
def test_delete(c) -> None:
    source_code = to_source_vector(b"""
        class Hello {
        }
        }
    """)
    edit = Deletion(3, to_index(c('}')))
    mutant = edit.apply(source_code)
    expected = b'class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)

def test_substitution(c) -> None:
    source_code = to_source_vector(b"""
        @SuppressWarnings("fake"=0x1.8p1)
        class Hello {}
    """)
    edit = Substitution(3,
                        original_token=to_index(c('"fake"')),
                        replacement=to_index(c('ident')))
    mutant = edit.apply(source_code)
    expected = b'@ ident ( ident = 0.0 ) class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)

def test_creates_file(new_vectors_path: Path) -> None:
    """
    Create a new vector database, and test that reconnecting to it persists
    changes.
    """
    hello_vector = to_source_vector(b'print("hello, world!")')
    vectors = Vectors.from_filename(new_vectors_path)
    vectors['hello'] = hello_vector
    vectors.disconnect()

    vectors = Vectors.from_filename(new_vectors_path)
    assert hello_vector == vectors['hello']
    with pytest.raises(KeyError):
        vectors['non-existent']