def evaluate_and_save_stats(input_file, file_name, output_dir, helpers,
                            includeSemanticSimilarity):
    """Compute evaluation measures for one obfuscated document and write
    them to <output_dir>/<file_name>/evaluation_result.txt.

    Parameters:
        input_file: path to the obfuscation JSON produced by the obfuscator.
        file_name: name of the per-document output sub-directory.
        output_dir: root directory for evaluation results.
        helpers: dict providing pre-built 'stopwords' and 'spellcheck'
            helper objects (built once by the caller, reused per file).
        includeSemanticSimilarity: when truthy, additionally compute the
            semantic-similarity measure (presumably expensive — it is
            opt-in; confirm against average_semantic_similarity).
    """
    # Plain dict suffices: every key is assigned a scalar directly; the
    # original defaultdict(dict) default factory was never exercised.
    measures = {}
    with open(input_file) as data_file:
        data = json.load(data_file)

    stopwords = helpers['stopwords']
    spellcheck = helpers['spellcheck']

    texts = merge_texts(data)
    original_text = texts[0]
    obfuscated_text = texts[1]
    stats_original = DocumentStats(original_text, spellcheck, stopwords)
    stats_obfuscated = DocumentStats(obfuscated_text, spellcheck, stopwords)

    # Table of (measure key, DocumentStats attribute) pairs replaces nine
    # near-identical measure_rate_change call sites.
    rate_change_attrs = [
        ('safety_sentenceLengthChange', 'average_sentence_length'),
        ('safety_nounRateChange', 'average_noun_rate'),
        ('safety_verbRateChange', 'average_verb_rate'),
        ('safety_adjectivesRateChange', 'average_adj_rate'),
        ('safety_adverbsRateChange', 'average_adv_rate'),
        ('safety_punctuationRateChange', 'average_punct_rate'),
        ('safety_stopWordsRateChange', 'stop_words_ratio'),
        ('safety_uniqueWordsRateChange', 'unique_words_ratio'),
        ('safety_capitalizedWordsRateChange',
         'words_all_capital_letters_ratio'),
    ]
    for key, attr in rate_change_attrs:
        measures[key] = measure_rate_change(
            getattr(stats_original, attr), getattr(stats_obfuscated, attr))

    measures['safety_sequenceDistance'] = similarity.simple_distance(
        original_text, obfuscated_text)

    if includeSemanticSimilarity:
        measures['soundness_semanticSimilarity'] = \
            average_semantic_similarity(data)

    # The directory only needs to exist once — hoisted out of the
    # per-measure loop where the original re-checked it every iteration.
    output_writer.ensure_directory_exists(output_dir + '/' + file_name)
    result_path = output_dir + '/' + file_name + '/evaluation_result.txt'
    for key, value in measures.items():
        measure = \
            'measure {\n' + \
            ' key : "' + str(key) + '"\n' + \
            ' value: "' + str(math.fabs(round(value, 4))) + '"\n' + \
            '}\n'
        output_writer.write_text_to_file(measure, result_path)
def save_stats_for_all_files(input_dir, output_dir):
    """Collect per-document statistics for every file in input_dir and
    write them all into <output_dir>/stats.csv.

    Parameters:
        input_dir: directory whose entries are passed to get_document_stats.
        output_dir: directory receiving the aggregated stats.csv.
    """
    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)
    # Create helper objects once instead of once per file
    helpers = {'spellcheck': Spellcheck(), 'stopwords': StopWords()}
    # BUG FIX: the path was built as output_dir + '\stats.csv' — a literal
    # backslash ('\s' is not an escape sequence), which produces a broken
    # path on POSIX systems. os.path.join is portable.
    stats_path = os.path.join(output_dir, 'stats.csv')
    with open(stats_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'filename', 'attr_name', 'original', 'obfuscated', 'change',
            'change_rate', 'average'
        ])
        for file_name in os.listdir(input_dir):
            # Skip hidden files (e.g. .DS_Store) — consistent with
            # obfuscate_and_save_all_files, which filters them too.
            if file_name.startswith('.'):
                continue
            print(file_name)
            doc_stats = get_document_stats(
                input_dir + '/' + file_name, helpers)
            original_stats = doc_stats[0]
            obfuscation_stats = doc_stats[1]
            for attr_name, average_value in \
                    AverageStats.EVALUATION_MEASURES.items():
                orig = getattr(original_stats, attr_name)
                obf = getattr(obfuscation_stats, attr_name)
                change = obf - orig
                # Robustness: a zero baseline previously raised
                # ZeroDivisionError and aborted the whole run.
                change_rate = change / orig if orig else 0.0
                csv_writer.writerow([
                    file_name, attr_name,
                    str(orig), str(obf), str(change),
                    str(change_rate), str(average_value)
                ])
def evaluate_and_save_stats_all_files(input_dir, output_dir,
                                      includeSemanticSimilarity):
    """Run evaluate_and_save_stats for every document directory in input_dir.

    Parameters:
        input_dir: directory containing one sub-directory per document,
            each holding an obfuscation.json.
        output_dir: root directory for evaluation results.
        includeSemanticSimilarity: forwarded to evaluate_and_save_stats.
    """
    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)
    # Create helper objects once instead of once per file
    helpers = {'spellcheck': Spellcheck(), 'stopwords': StopWords()}
    for file_name in os.listdir(input_dir):
        # Skip hidden entries (e.g. .DS_Store) — consistent with
        # obfuscate_and_save_all_files; without this, the open() inside
        # evaluate_and_save_stats would fail on such entries.
        if file_name.startswith('.'):
            continue
        evaluate_and_save_stats(
            input_dir + '/' + file_name + '/obfuscation.json',
            file_name, output_dir, helpers, includeSemanticSimilarity)
def obfuscate_and_save_file(input_file_path, dir_name, output_dir, helpers):
    """Obfuscate one input text file and save the result as JSON.

    Parameters:
        input_file_path: path of the original text file to obfuscate.
        dir_name: name of the per-document output sub-directory.
        output_dir: root directory for obfuscation results.
        helpers: dict of shared helper objects consumed by the obfuscator.
    """
    print('Obfuscating file ' + input_file_path)
    source_text = input_reader.read_input_file(input_file_path)
    # Split the text into parts, then obfuscate all of them in one pass.
    parts = text_utils.split_text(source_text)
    parts = obfuscator.obfuscate_all(source_text, parts, helpers)
    target_dir = output_dir + '/' + dir_name
    output_writer.ensure_directory_exists(target_dir)
    target_file = target_dir + '/obfuscation.json'
    print('Output: ' + target_file)
    output_writer.write_output_file(target_file, parts)
def obfuscate_and_save_all_files(input_dir, output_dir):
    """Obfuscate every document directory found in input_dir.

    Parameters:
        input_dir: directory with one sub-directory per document, each
            containing an original.txt.
        output_dir: root directory for obfuscation results.
    """
    print('Parsing input...')
    # Shared helper objects, built once and reused across all files.
    helpers = {
        'stopwords': StopWords(),
        # Imported under a different name here because of a name clash
        # with another module's Spellcheck in this file.
        'spellcheck': SpellChecker(),
        'errorCreator': ErrorCreator(),
        'punctuation': Punctuation(),
        'britishToAmerican': BritishAmericanNormalization(),
        'fillerWords': FillerWords(),
        'symbolReplacement': SymbolReplacement(),
        'paraphraseCorpus': ParaphraseCorpus(),
    }
    # Make sure the given output directory exists
    output_writer.ensure_directory_exists(output_dir)
    # Walk the input directory and obfuscate each document folder.
    for dir_name in os.listdir(input_dir):
        if dir_name[0] == '.':
            # Hidden files and folders (e.g. .DS_Store) are not documents.
            continue
        print(dir_name)
        obfuscate_and_save_file(
            input_dir + '/' + dir_name + '/original.txt',
            dir_name, output_dir, helpers)