示例#1
0
def evaluate_and_save_stats(input_file, file_name, output_dir, helpers,
                            includeSemanticSimilarity):
    """Evaluate one obfuscation result and write its measures to disk.

    The JSON document at ``input_file`` is loaded and turned into an
    (original, obfuscated) pair of texts by ``merge_texts``. A
    ``DocumentStats`` object is built for each text, and for every compared
    statistic the result of ``measure_rate_change(original, obfuscated)`` is
    recorded under a ``safety_*`` key. The ``similarity.simple_distance`` of
    the two texts is recorded as ``safety_sequenceDistance``.

    Each measure is rendered as a ``measure { key / value }`` text record
    (value rounded to 4 decimals, absolute value taken) and handed to
    ``output_writer.write_text_to_file`` for
    ``<output_dir>/<file_name>/evaluation_result.txt``.

    Args:
        input_file: Path of the JSON file to evaluate.
        file_name: Name of the sub-directory of ``output_dir`` that receives
            ``evaluation_result.txt``.
        output_dir: Directory under which the result directory is created.
        helpers: Mapping that provides the ``'stopwords'`` and
            ``'spellcheck'`` helper objects passed to ``DocumentStats``.
        includeSemanticSimilarity: When truthy, also record
            ``soundness_semanticSimilarity`` as computed by
            ``average_semantic_similarity`` on the loaded data.
    """
    with open(input_file) as data_file:
        data = json.load(data_file)

    stopwords = helpers['stopwords']
    spellcheck = helpers['spellcheck']

    texts = merge_texts(data)
    original_text = texts[0]
    obfuscated_text = texts[1]
    original_stats = DocumentStats(original_text, spellcheck, stopwords)
    obfuscated_stats = DocumentStats(obfuscated_text, spellcheck, stopwords)

    # (measure key, DocumentStats attribute) pairs; order defines the order
    # in which the measures are written.
    rate_change_measures = (
        ('safety_sentenceLengthChange', 'average_sentence_length'),
        ('safety_nounRateChange', 'average_noun_rate'),
        ('safety_verbRateChange', 'average_verb_rate'),
        ('safety_adjectivesRateChange', 'average_adj_rate'),
        ('safety_adverbsRateChange', 'average_adv_rate'),
        ('safety_punctuationRateChange', 'average_punct_rate'),
        ('safety_stopWordsRateChange', 'stop_words_ratio'),
        ('safety_uniqueWordsRateChange', 'unique_words_ratio'),
        ('safety_capitalizedWordsRateChange',
         'words_all_capital_letters_ratio'),
    )

    measures = {}
    for measure_key, attr_name in rate_change_measures:
        measures[measure_key] = measure_rate_change(
            getattr(original_stats, attr_name),
            getattr(obfuscated_stats, attr_name))
    measures['safety_sequenceDistance'] = similarity.simple_distance(
        original_text, obfuscated_text)

    if includeSemanticSimilarity:
        measures['soundness_semanticSimilarity'] = average_semantic_similarity(
            data)

    # The target directory and file do not depend on the measure, so they are
    # resolved (and the directory created) once instead of once per measure.
    result_dir = output_dir + '/' + file_name
    result_path = result_dir + '/evaluation_result.txt'
    output_writer.ensure_directory_exists(result_dir)

    # NOTE(review): one write call is issued per measure, as before. Whether
    # write_text_to_file appends or overwrites is not visible here; if it
    # overwrites, only the last measure survives -- confirm.
    for key, value in measures.items():
        measure = (
            'measure {{\n'
            '  key  : "{}"\n'
            '  value: "{}"\n'
            '}}\n'
        ).format(key, math.fabs(round(value, 4)))
        output_writer.write_text_to_file(measure, result_path)
示例#2
0
def save_stats_for_all_files(input_dir, output_dir):
    """Write per-file statistics for every entry of ``input_dir`` to a CSV.

    For each entry returned by ``os.listdir(input_dir)`` the pair
    (original stats, obfuscation stats) is obtained from
    ``get_document_stats``. For every attribute listed in
    ``AverageStats.EVALUATION_MEASURES`` one row is written to
    ``<output_dir>/stats.csv`` with the columns
    ``filename, attr_name, original, obfuscated, change, change_rate,
    average``, where ``average`` is the value stored for that attribute in
    ``AverageStats.EVALUATION_MEASURES``.

    Args:
        input_dir: Directory whose entries are evaluated.
        output_dir: Directory that receives ``stats.csv``; it is created via
            ``output_writer.ensure_directory_exists`` if needed.
    """
    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)
    # Create helper objects once so they are not rebuilt for every file
    helpers = {'spellcheck': Spellcheck(), 'stopwords': StopWords()}

    # Build the path portably: the previous '\stats.csv' literal only worked
    # on Windows and relied on an invalid escape sequence.
    stats_path = os.path.join(output_dir, 'stats.csv')

    # Read all files in the given directory and save stats in one file
    with open(stats_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'filename', 'attr_name', 'original', 'obfuscated', 'change',
            'change_rate', 'average'
        ])
        for file_name in os.listdir(input_dir):
            print(file_name)
            doc_stats = get_document_stats(input_dir + '/' + file_name,
                                           helpers)
            original_stats = doc_stats[0]
            obfuscation_stats = doc_stats[1]
            for attr_name, average_value in (
                    AverageStats.EVALUATION_MEASURES.items()):
                orig = getattr(original_stats, attr_name)
                obf = getattr(obfuscation_stats, attr_name)
                change = obf - orig
                # A relative change is undefined when the original value is
                # zero; record 'nan' instead of aborting the whole run with a
                # ZeroDivisionError.
                change_rate = change / orig if orig else float('nan')
                csv_writer.writerow([
                    file_name, attr_name,
                    str(orig),
                    str(obf),
                    str(change),
                    str(change_rate),
                    str(average_value)
                ])
示例#3
0
def evaluate_and_save_stats_all_files(input_dir, output_dir,
                                      includeSemanticSimilarity):
    """Run ``evaluate_and_save_stats`` for every entry of ``input_dir``.

    Each non-hidden entry ``<name>`` of ``input_dir`` is evaluated using
    ``<input_dir>/<name>/obfuscation.json`` as input, with ``<name>`` passed
    on as the result sub-directory name.

    Args:
        input_dir: Directory whose entries each contain an
            ``obfuscation.json`` file.
        output_dir: Directory that receives the evaluation results; it is
            created via ``output_writer.ensure_directory_exists`` if needed.
        includeSemanticSimilarity: Forwarded unchanged to
            ``evaluate_and_save_stats``.
    """
    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)
    # Create helper objects once so they are not rebuilt for every file
    helpers = {'spellcheck': Spellcheck(), 'stopwords': StopWords()}
    for file_name in os.listdir(input_dir):
        # Skip hidden files and folders (e.g. '.DS_Store'), matching what
        # obfuscate_and_save_all_files does when producing the inputs.
        if file_name.startswith('.'):
            continue
        evaluate_and_save_stats(
            input_dir + '/' + file_name + '/obfuscation.json', file_name,
            output_dir, helpers, includeSemanticSimilarity)
def obfuscate_and_save_file(input_file_path, dir_name, output_dir, helpers):
    """Obfuscate a single input file and store the result as JSON.

    The text is read with ``input_reader.read_input_file``, split with
    ``text_utils.split_text``, passed through ``obfuscator.obfuscate_all``
    and written by ``output_writer.write_output_file`` to
    ``<output_dir>/<dir_name>/obfuscation.json``.

    Args:
        input_file_path: Path of the text file to obfuscate.
        dir_name: Name of the sub-directory of ``output_dir`` to write into.
        output_dir: Base directory for the results.
        helpers: Helper objects forwarded to ``obfuscator.obfuscate_all``.
    """
    print('Obfuscating file ' + input_file_path)

    # Load the text, cut it into parts, then obfuscate those parts.
    source_text = input_reader.read_input_file(input_file_path)
    obfuscated_parts = obfuscator.obfuscate_all(
        source_text, text_utils.split_text(source_text), helpers)

    # Results live in a per-input sub-directory of the output directory.
    target_dir = output_dir + '/' + dir_name
    output_writer.ensure_directory_exists(target_dir)
    target_file = target_dir + '/obfuscation.json'
    print('Output: ' + target_file)
    output_writer.write_output_file(target_file, obfuscated_parts)
def obfuscate_and_save_all_files(input_dir, output_dir):
    """Obfuscate ``original.txt`` of every visible entry in ``input_dir``.

    The helper objects are built once and shared across all files. For each
    entry ``<name>`` of ``input_dir`` that does not start with a dot,
    ``obfuscate_and_save_file`` is called on
    ``<input_dir>/<name>/original.txt`` with ``<name>`` as the result
    sub-directory.

    Args:
        input_dir: Directory whose entries each contain an ``original.txt``.
        output_dir: Directory that receives the results; it is created via
            ``output_writer.ensure_directory_exists`` if needed.
    """
    print('Parsing input...')

    # Helpers are constructed in this order, once, and reused for every file.
    helpers = dict(
        stopwords=StopWords(),
        # It seems there is a name clash with another module, so the
        # spellcheck class needed to be renamed for this file.
        spellcheck=SpellChecker(),
        errorCreator=ErrorCreator(),
        punctuation=Punctuation(),
        britishToAmerican=BritishAmericanNormalization(),
        fillerWords=FillerWords(),
        symbolReplacement=SymbolReplacement(),
        paraphraseCorpus=ParaphraseCorpus(),
    )

    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)

    # Walk the input directory and obfuscate every visible entry.
    for entry in os.listdir(input_dir):
        if entry[0] == '.':
            # Hidden files and folders are skipped.
            continue
        print(entry)
        source_path = input_dir + '/' + entry + '/original.txt'
        obfuscate_and_save_file(source_path, entry, output_dir, helpers)