예제 #1
0
def main():
    r"""Anonymize a set of analysis files using the same replacements in all of them.

    This maintains enough consistency to make splitting/merging tests realistic.
    In particular, it preserves dictionary order, so that representative class selection
    is consistent after anonymization.

    Arguments are read from ``sys.argv``: the first is the path to a UTF-8 encoded wordfile
    (whitespace-separated words) and every remaining one is a glob matching analysis files.

    To run:

    ./pants run src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis -- \
      <wordfile> <analysis file glob 1> <analysis file glob 2> ...

    Output will be in a directory called 'anon' under the directory of each input analysis file.

    An easy way to generate a wordfile is to download SCOWL (http://wordlist.aspell.net/) and look
    at final/english-words.*.  A good wordfile can be had thus:

    for f in english-words.*; do cat $f >> wordfile; done
    egrep '^[a-z]{4}[a-z]*$' wordfile > wordfile.filtered

    To throw some non-ASCII characters into the mix, try e.g.,

    cat wordfile.filtered | tr a ā > wordfile.filtered.utf8

    If you copy-paste the command above into an OS X terminal, it'll do the right thing, assuming
    your terminal uses utf-8 encoding.

    Note that the larger the number at the end of the filename the rarer the words in it, so if you
    want to avoid rare words, manually cat the lowest few files into wordfile, until you have enough
    words.
    """
    # Imported locally so this function does not rely on a module-level import of io.
    import io

    word_file = sys.argv[1]
    # Materialized as a list because the files are walked twice, once per pass.
    analysis_files = list(
        itertools.chain.from_iterable(glob.glob(p) for p in sys.argv[2:]))

    # io.open() returns decoded text on both Python 2 and 3. The builtin open() already yields
    # str on Python 3, where calling .decode('utf-8') on each word raises AttributeError.
    with io.open(word_file, 'r', encoding='utf-8') as infile:
        word_list = infile.read().split()

    # First pass: Capture all words that need translating.
    translation_capturer = TranslationCapturer(word_list, strict=True)
    for analysis_file in analysis_files:
        analysis = ZincAnalysisParser().parse_from_path(analysis_file)
        analysis.translate(translation_capturer)
        # The file's base name goes through the capturer too, since the second pass converts
        # it to produce the output file name.
        translation_capturer.convert(os.path.basename(analysis_file))
    translation_capturer.check_for_comprehensiveness()

    # Second pass: Actually translate, in order-preserving fashion.
    anonymizer = translation_capturer.get_order_preserving_anonymizer()
    for analysis_file in analysis_files:
        # Re-parse from disk and translate with the anonymizer this time.
        analysis = ZincAnalysisParser().parse_from_path(analysis_file)
        analysis.translate(anonymizer)
        output_dir = os.path.join(os.path.dirname(analysis_file), 'anon')
        safe_mkdir(output_dir)
        anonymized_filename = anonymizer.convert(
            os.path.basename(analysis_file))
        analysis.write_to_path(os.path.join(output_dir, anonymized_filename))
    anonymizer.check_for_comprehensiveness()
예제 #2
0
def main():
  r"""Anonymize a set of analysis files using the same replacements in all of them.

  This maintains enough consistency to make splitting/merging tests realistic.
  In particular, it preserves dictionary order, so that representative class selection
  is consistent after anonymization.

  Arguments are read from ``sys.argv``: the first is the path to a UTF-8 encoded wordfile
  (whitespace-separated words) and every remaining one is a glob matching analysis files.

  To run:

  ./pants run src/python/pants/backend/jvm/tasks/jvm_compile:anonymize_zinc_analysis -- \
    <wordfile> <analysis file glob 1> <analysis file glob 2> ...

  Output will be in a directory called 'anon' under the directory of each input analysis file.

  An easy way to generate a wordfile is to download SCOWL (http://wordlist.aspell.net/) and look
  at final/english-words.*.  A good wordfile can be had thus:

  for f in english-words.*; do cat $f >> wordfile; done
  egrep '^[a-z]{4}[a-z]*$' wordfile > wordfile.filtered

  To throw some non-ASCII characters into the mix, try e.g.,

  cat wordfile.filtered | tr a ā > wordfile.filtered.utf8

  If you copy-paste the command above into an OS X terminal, it'll do the right thing, assuming
  your terminal uses utf-8 encoding.

  Note that the larger the number at the end of the filename the rarer the words in it, so if you
  want to avoid rare words, manually cat the lowest few files into wordfile, until you have enough
  words.
  """
  # Imported locally so this function does not rely on a module-level import of io.
  import io

  word_file = sys.argv[1]
  # Materialized as a list because the files are walked twice, once per pass.
  analysis_files = list(itertools.chain.from_iterable(glob.glob(p) for p in sys.argv[2:]))

  # io.open() returns decoded text on both Python 2 and 3. The builtin open() already yields
  # str on Python 3, where calling .decode('utf-8') on each word raises AttributeError.
  with io.open(word_file, 'r', encoding='utf-8') as infile:
    word_list = infile.read().split()

  # First pass: Capture all words that need translating.
  translation_capturer = TranslationCapturer(word_list, strict=True)
  for analysis_file in analysis_files:
    analysis = ZincAnalysisParser().parse_from_path(analysis_file)
    analysis.translate(translation_capturer)
    # The file's base name goes through the capturer too, since the second pass converts
    # it to produce the output file name.
    translation_capturer.convert(os.path.basename(analysis_file))
  translation_capturer.check_for_comprehensiveness()

  # Second pass: Actually translate, in order-preserving fashion.
  anonymizer = translation_capturer.get_order_preserving_anonymizer()
  for analysis_file in analysis_files:
    # Re-parse from disk and translate with the anonymizer this time.
    analysis = ZincAnalysisParser().parse_from_path(analysis_file)
    analysis.translate(anonymizer)
    output_dir = os.path.join(os.path.dirname(analysis_file), 'anon')
    safe_mkdir(output_dir)
    anonymized_filename = anonymizer.convert(os.path.basename(analysis_file))
    analysis.write_to_path(os.path.join(output_dir, anonymized_filename))
  anonymizer.check_for_comprehensiveness()