def main(): """Anonymize a set of analysis files using the same replacements in all of them. This maintains enough consistency to make splitting/merging tests realistic. To run: ./pants goal run src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis -- \ <wordfile> <classes dir in analysis files> <analysis file glob 1> <analysis file glob 2> ... """ word_file = sys.argv[1] classes_dir = sys.argv[2] analysis_files = list(itertools.chain.from_iterable([glob.glob(p) for p in sys.argv[3:]])) with open(word_file, 'r') as infile: word_list = infile.read().split() anonymizer = Anonymizer(word_list) for analysis_file in analysis_files: analysis = ZincAnalysisParser(classes_dir).parse_from_path(analysis_file) analysis.anonymize(anonymizer) output_dir = os.path.join(os.path.dirname(analysis_file), 'anon') safe_mkdir(output_dir) anonymized_filename = anonymizer.convert(os.path.basename(analysis_file)) analysis.write_to_path(os.path.join(output_dir, anonymized_filename)) anonymizer.check_for_comprehensiveness()
def main(): """Anonymize a set of analysis files using the same replacements in all of them. This maintains enough consistency to make splitting/merging tests realistic. In particular, it preserves dictionary order, so that representative class selection is consistent after anonymization. To run: ./pants run src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis -- \ <wordfile> <analysis file glob 1> <analysis file glob 2> ... Output will be in a directory called 'anon' under the directory of each input analysis file. An easy way to generate a wordfile is to download SCOWL (http://wordlist.aspell.net/) and look at final/english-words.*. A good wordfile can be had thus: for f in english-words.*; do cat $f >> wordfile; done egrep '^[a-z]{4}[a-z]*$' wordfile > wordfile.filtered To throw some non-ASCII characters into the mix, try e.g., cat wordfile.filtered | tr a ā > wordfile.filtered.utf8 If you copy-paste the command above into an OS X terminal, it'll do the right thing, assuming your terminal uses utf-8 encoding. Note that the larger the number at the end of the filename the rarer the words in it, so if you want to avoid rare words, manually cat the lowest few files into wordfile, until you have enough words. """ word_file = sys.argv[1] analysis_files = list( itertools.chain.from_iterable([glob.glob(p) for p in sys.argv[2:]])) with open(word_file, 'r') as infile: word_list = [w.decode('utf-8') for w in infile.read().split()] # First pass: Capture all words that need translating. translation_capturer = TranslationCapturer(word_list, strict=True) for analysis_file in analysis_files: analysis = ZincAnalysisParser().parse_from_path(analysis_file) analysis.translate(translation_capturer) translation_capturer.convert(os.path.basename(analysis_file)) translation_capturer.check_for_comprehensiveness() # Second pass: Actually translate, in order-preserving fashion. anonymizer = translation_capturer.get_order_preserving_anonymizer() for analysis_file in analysis_files: analysis = ZincAnalysisParser().parse_from_path(analysis_file) analysis.translate(anonymizer) output_dir = os.path.join(os.path.dirname(analysis_file), 'anon') safe_mkdir(output_dir) anonymized_filename = anonymizer.convert( os.path.basename(analysis_file)) analysis.write_to_path(os.path.join(output_dir, anonymized_filename)) anonymizer.check_for_comprehensiveness()
def main(): """Anonymize a set of analysis files using the same replacements in all of them. This maintains enough consistency to make splitting/merging tests realistic. In particular, it preserves dictionary order, so that representative class selection is consistent after anonymization. To run: ./pants run src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis -- \ <wordfile> <analysis file glob 1> <analysis file glob 2> ... Output will be in a directory called 'anon' under the directory of each input analysis file. An easy way to generate a wordfile is to download SCOWL (http://wordlist.aspell.net/) and look at final/english-words.*. A good wordfile can be had thus: for f in english-words.*; do cat $f >> wordfile; done egrep '^[a-z]{4}[a-z]*$' wordfile > wordfile.filtered To throw some non-ASCII characters into the mix, try e.g., cat wordfile.filtered | tr a ā > wordfile.filtered.utf8 If you copy-paste the command above into an OS X terminal, it'll do the right thing, assuming your terminal uses utf-8 encoding. Note that the larger the number at the end of the filename the rarer the words in it, so if you want to avoid rare words, manually cat the lowest few files into wordfile, until you have enough words. """ word_file = sys.argv[1] analysis_files = list(itertools.chain.from_iterable([glob.glob(p) for p in sys.argv[2:]])) with open(word_file, 'r') as infile: word_list = [w.decode('utf-8') for w in infile.read().split()] # First pass: Capture all words that need translating. translation_capturer = TranslationCapturer(word_list, strict=True) for analysis_file in analysis_files: analysis = ZincAnalysisParser().parse_from_path(analysis_file) analysis.translate(translation_capturer) translation_capturer.convert(os.path.basename(analysis_file)) translation_capturer.check_for_comprehensiveness() # Second pass: Actually translate, in order-preserving fashion. anonymizer = translation_capturer.get_order_preserving_anonymizer() for analysis_file in analysis_files: analysis = ZincAnalysisParser().parse_from_path(analysis_file) analysis.translate(anonymizer) output_dir = os.path.join(os.path.dirname(analysis_file), 'anon') safe_mkdir(output_dir) anonymized_filename = anonymizer.convert(os.path.basename(analysis_file)) analysis.write_to_path(os.path.join(output_dir, anonymized_filename)) anonymizer.check_for_comprehensiveness()
def _generate_testworthy_splits(self):
  """Take some non-canonical analysis files and generate test data from them.

  The resulting files will be "canonical". That is, merging and re-splitting them will
  yield the same files. Therefore the resulting files can be used as test data (after
  eyeballing them to ensure no pathologies).

  An easy way to generate input for this function is to run a scala compile on some targets
  using --strategy=isolated. Then .pants.d/compile/jvm/scala/isolated-analysis/ will contain
  a bunch of per-target analysis files. Those files can be anonymized (see
  anonymize_analysis.py), ideally with some non-ASCII words thrown in (as explained there),
  and then you can point this function to those anonymized files by setting
  ZINC_ANALYSIS_TEST_DATA_SOURCE=<dir> in the environment and running this test.

  Note: Yes, it's slightly problematic that we're using the very code we're testing to
  generate the test inputs. Hence the need to spot-check for obvious pathologies.
  """
  original_splits_dir = os.environ.get(_TEST_DATA_SOURCE_ENV_VAR)
  canonical_dir = os.path.join(original_splits_dir, 'canonical')
  safe_rmtree(canonical_dir)
  os.mkdir(canonical_dir)

  original_split_filenames = [f.decode('utf-8') for f in os.listdir(original_splits_dir)]
  original_splits_files = [os.path.join(original_splits_dir, f)
                           for f in original_split_filenames if f.endswith('.analysis')]

  parser = ZincAnalysisParser()
  original_split_analyses = [parser.parse_from_path(f) for f in original_splits_files]

  merged_analysis = ZincAnalysis.merge(original_split_analyses)
  merged_analysis.write_to_path(os.path.join(canonical_dir, 'all.merged.analysis'))

  # Split the merged analysis back to individual analyses.
  sources_per_analysis = [a.underlying_analysis.stamps.sources.keys()
                          for a in original_split_analyses]
  split_analyses = merged_analysis.split(sources_per_analysis)

  for original_split_file, split_analysis in zip(original_splits_files, split_analyses):
    outpath = os.path.join(canonical_dir, os.path.basename(original_split_file))
    split_analysis.write_to_path(outpath)

  print('Wrote canonical analysis data to {}'.format(canonical_dir))
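# A hedged usage sketch for the function above: the docstring says to point the test at a
# directory of anonymized per-target analysis files via the environment, along these lines
# (the directory path and test target are illustrative placeholders, not repo paths):
#
#   ZINC_ANALYSIS_TEST_DATA_SOURCE=/path/to/anonymized/analyses ./pants test <this test's target>
#
# The generated canonical files then land in /path/to/anonymized/analyses/canonical, per the
# canonical_dir computed above.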
def main(): """Anonymize a set of analysis files using the same replacements in all of them. This maintains enough consistency to make splitting/merging tests realistic. To run: ./pants py src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis \ <wordfile> <classes dir in analysis files> <analysis file glob 1> <analysis file glob 2> ... """ word_file = sys.argv[1] classes_dir = sys.argv[2] analysis_files = list( itertools.chain.from_iterable([glob.glob(p) for p in sys.argv[3:]])) with open(word_file, 'r') as infile: word_list = infile.read().split() anonymizer = Anonymizer(word_list) for analysis_file in analysis_files: analysis = ZincAnalysisParser(classes_dir).parse_from_path( analysis_file) analysis.anonymize(anonymizer) output_dir = os.path.join(os.path.dirname(analysis_file), 'anon') safe_mkdir(output_dir) anonymized_filename = anonymizer.convert( os.path.basename(analysis_file)) analysis.write_to_path(os.path.join(output_dir, anonymized_filename)) anonymizer.check_for_comprehensiveness()
def test_analysis_files(self):
  classes_dir = '/Users/kermit/src/acme.web/.pants.d/scalac/classes/'
  parser = ZincAnalysisParser(classes_dir)

  with temporary_dir() as tmpdir:
    # Extract analysis files from tarball.
    analysis_tarball = os.path.join(os.path.dirname(__file__), 'testdata', 'analysis.tar.bz2')
    analysis_dir = os.path.join(tmpdir, 'orig')
    print('Extracting %s to %s' % (analysis_tarball, analysis_dir))
    os.mkdir(analysis_dir)
    with contextlib.closing(tarfile.open(analysis_tarball, 'r:bz2')) as tar:
      tar.extractall(analysis_dir)

    # Parse them.
    analysis_files = [os.path.join(analysis_dir, f)
                      for f in os.listdir(analysis_dir) if f.endswith('.analysis')]
    num_analyses = len(analysis_files)

    def parse(f):
      return parser.parse_from_path(f)

    analyses = self._time(lambda: [parse(f) for f in analysis_files],
                          'Parsed %d files' % num_analyses)

    # Write them back out individually.
    writeout_dir = os.path.join(tmpdir, 'write')
    os.mkdir(writeout_dir)

    def write(file_name, analysis):
      outpath = os.path.join(writeout_dir, file_name)
      analysis.write_to_path(outpath)

    def _write_all():
      for analysis_file, analysis in zip(analysis_files, analyses):
        write(os.path.basename(analysis_file), analysis)

    self._time(_write_all, 'Wrote %d files' % num_analyses)

    # Merge them.
    merged_analysis = self._time(lambda: ZincAnalysis.merge(analyses),
                                 'Merged %d files' % num_analyses)

    # Write merged analysis to file.
    merged_analysis_path = os.path.join(tmpdir, 'analysis.merged')
    self._time(lambda: merged_analysis.write_to_path(merged_analysis_path),
               'Wrote merged analysis to %s' % merged_analysis_path)

    # Read merged analysis from file.
    merged_analysis2 = self._time(lambda: parser.parse_from_path(merged_analysis_path),
                                  'Read merged analysis from %s' % merged_analysis_path)

    # Split the merged analysis back to individual analyses.
    sources_per_analysis = [a.stamps.sources.keys() for a in analyses]
    split_analyses = self._time(lambda: merged_analysis2.split(sources_per_analysis, catchall=True),
                                'Split back into %d analyses' % num_analyses)

    self.assertEquals(num_analyses + 1, len(split_analyses))  # +1 for the catchall.
    catchall_analysis = split_analyses[-1]

    # We expect an empty catchall.
    self.assertEquals(0, len(catchall_analysis.stamps.sources))

    # Diff the original analyses and the split ones.
    # Write the split to the tmpdir, for ease of debugging on failure.
    splits_dir = os.path.join(tmpdir, 'splits')
    os.mkdir(splits_dir)
    for analysis_file, analysis, split_analysis in zip(analysis_files, analyses, split_analyses):
      outfile_path = os.path.join(splits_dir, os.path.basename(analysis_file))
      split_analysis.write_to_path(outfile_path)
      diffs = analysis.diff(split_analysis)
      self.assertEquals(analysis, split_analysis, ''.join([str(diff) for diff in diffs]))

  print('Total time: %f seconds' % self.total_time)
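# The tests in this file call a self._time(work, msg) helper and read self.total_time, but
# the helper itself is not shown in this section. A minimal sketch of what such a helper
# might look like, assuming it simply runs the callable, reports elapsed wall-clock time,
# and accumulates it (the exact message format is a guess, and 'time' must be imported):
def _time(self, work, msg):
  start = time.time()
  result = work()
  elapsed = time.time() - start
  print('%s in %f seconds.' % (msg, elapsed))
  self.total_time += elapsed
  return result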
def create_analysis_tools(self):
  return AnalysisTools(self.context, ZincAnalysisParser(self._classes_dir), ZincAnalysis)
def test_analysis_files(self):
  if os.environ.get(_TEST_DATA_SOURCE_ENV_VAR):
    print('\n>>>>>>>>> {} set: skipping test, generating canonical test data instead.'.format(
        _TEST_DATA_SOURCE_ENV_VAR))
    self._generate_testworthy_splits()
    return

  parser = ZincAnalysisParser()

  with temporary_dir() as tmpdir:
    analysis_dir = os.path.join(os.path.dirname(__file__), 'testdata', 'complex')

    # Parse analysis files.
    analysis_files = [os.path.join(analysis_dir, f) for f in os.listdir(analysis_dir)
                      if f.endswith('.analysis') and not f.endswith('.merged.analysis')]
    num_analyses = len(analysis_files)

    def parse(f):
      return parser.parse_from_path(f)

    analyses = self._time(lambda: [parse(f) for f in analysis_files],
                          'Parsed %d files' % num_analyses)

    # Get the right exception on a busted file.
    truncated_dir = os.path.join(tmpdir, 'truncated')
    os.mkdir(truncated_dir)
    f = os.path.join(truncated_dir, os.path.basename(analysis_files[0]))
    shutil.copy(analysis_files[0], f)
    with open(f, 'r+b') as truncated:
      truncated.seek(-150, os.SEEK_END)
      truncated.truncate()
    with self.assertRaises(ParseError):
      parse(f)

    # Write them back out individually.
    writeout_dir = os.path.join(tmpdir, 'write')
    os.mkdir(writeout_dir)

    def write(file_name, analysis):
      outpath = os.path.join(writeout_dir, file_name)
      analysis.write_to_path(outpath)

    def _write_all():
      for analysis_file, analysis in zip(analysis_files, analyses):
        write(os.path.basename(analysis_file), analysis)

    self._time(_write_all, 'Wrote %d files' % num_analyses)

    # Merge them.
    merged_analysis = self._time(lambda: ZincAnalysis.merge(analyses),
                                 'Merged %d files' % num_analyses)

    # Write merged analysis to file.
    merged_analysis_path = os.path.join(tmpdir, 'merged.analysis')
    self._time(lambda: merged_analysis.write_to_path(merged_analysis_path),
               'Wrote merged analysis to %s' % merged_analysis_path)

    # Read merged analysis from file.
    merged_analysis2 = self._time(lambda: parser.parse_from_path(merged_analysis_path),
                                  'Read merged analysis from %s' % merged_analysis_path)

    # Read the expected merged analysis from file.
    expected_merged_analysis_path = os.path.join(analysis_dir, 'all.merged.analysis')
    expected_merged_analysis = self._time(
        lambda: parser.parse_from_path(expected_merged_analysis_path),
        'Read expected merged analysis from %s' % expected_merged_analysis_path)

    # Compare the merge result with the re-read one.
    diffs = merged_analysis.diff(merged_analysis2)
    self.assertEquals(merged_analysis, merged_analysis2,
                      ''.join([unicode(diff) for diff in diffs]))

    # Compare the merge result with the expected.
    diffs = expected_merged_analysis.diff(merged_analysis2)
    self.assertEquals(expected_merged_analysis, merged_analysis2,
                      ''.join([unicode(diff) for diff in diffs]))

    # Split the merged analysis back to individual analyses.
    sources_per_analysis = [a.sources() for a in analyses]
    split_analyses = self._time(lambda: merged_analysis2.split(sources_per_analysis, catchall=True),
                                'Split back into %d analyses' % num_analyses)

    self.assertEquals(num_analyses + 1, len(split_analyses))  # +1 for the catchall.
    catchall_analysis = split_analyses[-1]

    # We expect an empty catchall.
    self.assertEquals(0, len(catchall_analysis.underlying_analysis.stamps.sources))

    # Diff the original analyses and the split ones.
    # Write the split to the tmpdir, for ease of debugging on failure.
    splits_dir = os.path.join(tmpdir, 'splits')
    os.mkdir(splits_dir)
    for analysis_file, analysis, split_analysis in zip(analysis_files, analyses, split_analyses):
      outfile_path = os.path.join(splits_dir, os.path.basename(analysis_file))
      split_analysis.write_to_path(outfile_path)
      diffs = analysis.diff(split_analysis)
      # Note that it's not true in general that merging splits and then splitting them back out
      # yields the exact same analyses. Some small differences can arise. For example, splitA may
      # have an external src->class dependency on a class from a source file in splitB; when
      # merging, that becomes a src->src dependency; and when splitting back out, that src
      # dependency becomes a dependency on a representative class, which may not be the original
      # class splitA depended on.
      #
      # This comparison works here only because we've taken care to prepare test data for which
      # it should hold. See _generate_testworthy_splits below for how to do so.
      self.assertEquals(analysis, split_analysis, ''.join([unicode(diff) for diff in diffs]))

  print('Total time: %f seconds' % self.total_time)
def create_analysis_tools(self):
  return AnalysisTools(self.context.java_home, self.ivy_cache_dir,
                       ZincAnalysisParser(self._classes_dir), ZincAnalysis)
def test_analysis_files(self):
  classes_dir = '/Users/kermit/src/acme.web/.pants.d/scalac/classes/'
  parser = ZincAnalysisParser(classes_dir)

  with temporary_dir() as tmpdir:
    # Extract analysis files from tarball.
    analysis_tarball = os.path.join(os.path.dirname(__file__), 'testdata', 'analysis.tar.bz2')
    analysis_dir = os.path.join(tmpdir, 'orig')
    print('Extracting %s to %s' % (analysis_tarball, analysis_dir))
    os.mkdir(analysis_dir)
    with contextlib.closing(tarfile.open(analysis_tarball, 'r:bz2')) as tar:
      tar.extractall(analysis_dir)

    # Parse them.
    analysis_files = [os.path.join(analysis_dir, f)
                      for f in os.listdir(analysis_dir) if f.endswith('.analysis')]
    num_analyses = len(analysis_files)

    def parse(f):
      return parser.parse_from_path(f)

    analyses = self._time(lambda: [parse(f) for f in analysis_files],
                          'Parsed %d files' % num_analyses)

    # Write them back out individually.
    writeout_dir = os.path.join(tmpdir, 'write')
    os.mkdir(writeout_dir)

    def write(file_name, analysis):
      outpath = os.path.join(writeout_dir, file_name)
      analysis.write_to_path(outpath)

    def _write_all():
      for analysis_file, analysis in zip(analysis_files, analyses):
        write(os.path.basename(analysis_file), analysis)

    self._time(_write_all, 'Wrote %d files' % num_analyses)

    # Merge them.
    merged_analysis = self._time(lambda: ZincAnalysis.merge(analyses),
                                 'Merged %d files' % num_analyses)

    # Write merged analysis to file.
    merged_analysis_path = os.path.join(tmpdir, 'analysis.merged')
    self._time(lambda: merged_analysis.write_to_path(merged_analysis_path),
               'Wrote merged analysis to %s' % merged_analysis_path)

    # Read merged analysis from file.
    merged_analysis2 = self._time(lambda: parser.parse_from_path(merged_analysis_path),
                                  'Read merged analysis from %s' % merged_analysis_path)

    # Split the merged analysis back to individual analyses.
    sources_per_analysis = [a.stamps.sources.keys() for a in analyses]
    split_analyses = self._time(lambda: merged_analysis2.split(sources_per_analysis, catchall=True),
                                'Split back into %d analyses' % num_analyses)

    self.assertEquals(num_analyses + 1, len(split_analyses))  # +1 for the catchall.
    catchall_analysis = split_analyses[-1]

    # We expect an empty catchall.
    self.assertEquals(0, len(catchall_analysis.stamps.sources))

    # Diff the original analyses and the split ones.
    # Write the split to the tmpdir, for ease of debugging on failure.
    splits_dir = os.path.join(tmpdir, 'splits')
    os.mkdir(splits_dir)
    for analysis_file, analysis, split_analysis in zip(analysis_files, analyses, split_analyses):
      outfile_path = os.path.join(splits_dir, os.path.basename(analysis_file))
      split_analysis.write_to_path(outfile_path)
      diffs = analysis.diff(split_analysis)
      self.assertEquals(analysis, split_analysis, ''.join(diffs))

  print('Total time: %f seconds' % self.total_time)
def create_analysis_tools(self):
  return AnalysisTools(self.context.java_home, ZincAnalysisParser(), ZincAnalysis)
def create_analysis_tools(self):
  return AnalysisTools(DistributionLocator.cached().real_home, ZincAnalysisParser(), ZincAnalysis)