def test_distance_func_with_kwarg_works(tmpdir, sample_config):
    """Check that extra keyword arguments are forwarded to the distance call.

    A 5x2 random array is stored as the 'snippets' data, distances to dye
    snippet '2' are computed with ``distance_function='cosine'`` and a
    ``w`` keyword, and every stored value is compared against ``cosine``
    called directly with the same weights.
    """
    # Set-up: keep all DyeScore data inside the temporary directory
    sample_config['DYESCORE_DATA_DIR'] = tmpdir.strpath
    config_path = write_config_file(tmpdir, sample_config)
    ds = DyeScore(config_path)

    values = np.random.rand(5, 2)
    snippet_ids = [str(i) for i in range(5)]  # 0 index for sanity :D
    symbols = ['window.navigator', 'canvas.context']
    snippets = xr.DataArray(
        values,
        coords={'snippet': snippet_ids, 'symbol': symbols},
        dims=('snippet', 'symbol'),
    )
    snippets_file = ds.dye_score_data_file('snippets')
    dataset = snippets.to_dataset(name='data')
    dataset.to_zarr(store=ds.get_zarr_store(snippets_file))

    # Run Test
    weights = [0.4, 0.6]
    result_file = ds.compute_distances_for_dye_snippets(
        ['2'],
        distance_function='cosine',
        override=True,
        w=weights,
    )

    # Check Results
    result_store = ds.get_zarr_store(result_file)
    results = xr.open_zarr(store=result_store)['data']
    assert results.shape == (5, 1)
    dye_row = values[2]
    for index, snippet_id in enumerate(snippet_ids):
        # snippet ids are the stringified row indices, so index == int(id)
        actual = results.sel(snippet=snippet_id, dye_snippet='2').values
        expected = cosine(dye_row, values[index], w=weights)
        assert actual == expected
def test_data_validation_with_invalid_file(tmpdir, sample_config):
    """Check that validating a CSV file as parquet input raises an error.

    A small CSV file is written and registered as
    ``INPUT_PARQUET_LOCATION``; ``validate_input_data`` is then expected
    to raise ``ArrowIOError``.
    """
    # Set-up invalid data file and save config
    csv_path = os.path.join(tmpdir, 'data.csv')
    pd.DataFrame({'a': [1, 2, 3]}).to_csv(csv_path)
    sample_config['INPUT_PARQUET_LOCATION'] = csv_path
    ds = DyeScore(write_config_file(tmpdir, sample_config))

    # Test
    with pytest.raises(ArrowIOError):
        ds.validate_input_data()
def test_data_validation_with_valid_file(tmpdir, sample_config):
    """Check that validation returns ``True`` for a parquet input file.

    A two-row frame with the columns ``top_level_url``, ``document_url``,
    ``script_url``, ``symbol`` and ``func_name`` is written to parquet and
    registered as ``INPUT_PARQUET_LOCATION``.
    """
    # Set-up valid data file and save config
    parquet_path = os.path.join(tmpdir, 'data.parquet')
    columns = {
        'top_level_url': ['a', 'b'],
        'document_url': ['a', 'b'],
        'script_url': ['c', 'd'],
        'symbol': ['e', 'f'],
        'func_name': ['g', 'h'],
    }
    frame = pd.DataFrame(columns)
    daskify(frame).to_parquet(parquet_path)
    sample_config['INPUT_PARQUET_LOCATION'] = parquet_path
    ds = DyeScore(write_config_file(tmpdir, sample_config))

    # Test
    assert ds.validate_input_data() is True
def test_passing_unavailable_string_fails(tmpdir, sample_config):
    """Check that ``distance_function='euclidean'`` raises ``KeyError``.

    NOTE(review): presumably 'euclidean' is not among the distance function
    names the library accepts as strings -- confirm against the
    implementation of ``compute_distances_for_dye_snippets``.
    """
    # Setup
    sample_config['DYESCORE_DATA_DIR'] = tmpdir.strpath
    config_path = write_config_file(tmpdir, sample_config)
    ds = DyeScore(config_path)

    values = np.random.rand(5, 2)
    snippet_ids = [str(i) for i in range(5)]
    symbols = ['window.navigator', 'canvas.context']
    snippets = xr.DataArray(
        values,
        coords={'snippet': snippet_ids, 'symbol': symbols},
        dims=('snippet', 'symbol'),
    )
    snippets_file = ds.dye_score_data_file('snippets')
    dataset = snippets.to_dataset(name='data')
    dataset.to_zarr(store=ds.get_zarr_store(snippets_file))

    # Run Test
    with pytest.raises(KeyError):
        ds.compute_distances_for_dye_snippets(
            ['2'],
            override=True,
            distance_function='euclidean',
        )