def __preprocess_input(input_file_path, whether_score):
    """Preprocess the input sentences to fit the format of lasertagger input.

    Args:
        input_file_path: the absolute path to the input file
        whether_score: whether scoring is needed. If scoring is needed, two
            columns are expected in the input file.

    Returns:
        sentences: a list of input sentences
        summaries: a list of summaries

    Raises:
        Exception: If scoring is required, but target is not found in the input file
    """
    if not os.path.isfile(os.path.expanduser(input_file_path)):
        __clean_up()
        raise Exception("The input file does not exist")

    print("-------Cleaning inputs-------")
    tsv_file = open(input_file_path)
    read_tsv = csv.reader(tsv_file, delimiter="\t")

    sentences = []
    summaries = []
    for row in read_tsv:
        sentences.append(row[0])
        if whether_score:
            try:
                summaries.append(row[1])
            except IndexError:
                tsv_file.close()
                __clean_up()
                raise Exception(
                    "Whether_score is true. Expected target but only found one column in the input."
                )
    tsv_file.close()

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    if whether_score:
        cleaned_summaries = preprocess_utils.text_strip(summaries)
    else:
        cleaned_summaries = cleaned_sentences

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    if whether_score:
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)
    else:
        spaced_summaries = spaced_sentences
    preprocess_utils.delete_empty_entry(spaced_sentences, spaced_summaries)

    return spaced_sentences, spaced_summaries
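# Illustrative usage only (not part of the original script): __preprocess_input is a
# module-private helper, so the call below assumes it is invoked from elsewhere in this
# file, e.g. its main(). The path "/path/to/input.tsv" is a made-up placeholder for a
# TSV file with one sentence per row and, when whether_score is True, a target summary
# in the second column.
#
#   spaced_sentences, spaced_summaries = __preprocess_input(
#       "/path/to/input.tsv", whether_score=True)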
def main(args):
    """Preprocess the Reddit dataset.

    Args:
        args: Command line arguments

    Raises:
        ValueError: when the number of samples is specified to be negative
    """
    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        ds = tfds.load('reddit_tifu', split='train', shuffle_files=True)
        sentences = []
        summaries = []
        for row in ds:
            summary = row["title"]
            sentence = row["tldr"]
            sentences.append(sentence.numpy().decode('UTF-8'))
            summaries.append(summary.numpy().decode('UTF-8'))

        cleaned_sentences = preprocess_utils.text_strip(sentences)
        cleaned_summaries = preprocess_utils.text_strip(summaries)

        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists. Now splitting dataset.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
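# A minimal argument-parser sketch (an assumption, not the original entry point) that
# would produce the two attributes read in main() above: args.num_of_tuning and
# args.num_of_validation. The real script's flag names, help text, and defaults may
# differ; this only illustrates how main() could be wired up.
def __build_arg_parser_sketch():
    import argparse  # local import keeps this sketch self-contained

    parser = argparse.ArgumentParser(
        description="Preprocess the reddit_tifu dataset for LaserTagger.")
    parser.add_argument("num_of_tuning", type=int,
                        help="number of samples reserved for the tuning set")
    parser.add_argument("num_of_validation", type=int,
                        help="number of samples reserved for the validation set")
    return parser

# Hypothetical usage: main(__build_arg_parser_sketch().parse_args(["100", "100"]))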
def test_text_strip(self):
    # test removing escape characters
    test_case = ["\t", "\r", "\n", "\t" * 3, "\r" * 3, "\n" * 7]
    correct_ans = [" "] * 6
    result = preprocess_utils.text_strip(test_case)
    self.assertEqual(result, correct_ans)

    # test removing redundant special characters
    special_chars = ["_", "+", "-", "~", ":", "."]
    test_case = []
    correct_ans = []
    for char in special_chars:
        test_case.append("text" + char + "text")
        test_case.append("text" + char * 2 + "text")
        correct_ans += ["text" + char + "text"] * 2
    result = preprocess_utils.text_strip(test_case)
    self.assertEqual(result, correct_ans)

    # test removing -, :, and _ at the end or beginning of a string (not in the middle)
    special_chars = ["-", ":", "_"]
    test_case = []
    correct_ans = []
    for char in special_chars:
        test_case.append("text" + char + "text")
        test_case.append(char + "text")
        test_case.append("text" + char)
        correct_ans += ["text" + char + "text"]
        correct_ans += ["text"] * 2
    result = preprocess_utils.text_strip(test_case)
    self.assertEqual(result, correct_ans)

    # test removing ~ at the end of a string (not in the middle or beginning)
    test_case = ["text~", "text ~ text", "text~text", "~text"]
    correct_ans = ['text', 'text ~ text', 'text~text', '~text']
    result = preprocess_utils.text_strip(test_case)
    self.assertEqual(result, correct_ans)

    # test removing . at the beginning of a string (not in the middle or end)
    test_case = ["text.", "text . text", "text.text", ".text"]
    correct_ans = ['text.', 'text . text', 'text.text', 'text']
    result = preprocess_utils.text_strip(test_case)
    self.assertEqual(result, correct_ans)
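# The assertions above pin down the observable behavior of preprocess_utils.text_strip.
# The helper below is not the real implementation (which lives in preprocess_utils); it
# is a minimal regex sketch that satisfies exactly the cases exercised in this test,
# included only to make the expected behavior concrete.
def _text_strip_sketch(texts):
    import re  # local import keeps the sketch self-contained

    cleaned = []
    for text in texts:
        # Collapse runs of escape characters into a single space.
        text = re.sub(r"[\t\r\n]+", " ", text)
        # Collapse repeated special characters into a single occurrence.
        for char in ["_", "+", "-", "~", ":", "."]:
            text = re.sub(re.escape(char) + "{2,}", char, text)
        # Strip -, : and _ at the beginning or end of the string.
        text = re.sub(r"^[-:_]+", "", text)
        text = re.sub(r"[-:_]+$", "", text)
        # Strip ~ only at the end and . only at the beginning.
        text = re.sub(r"~+$", "", text)
        text = re.sub(r"^\.+", "", text)
        cleaned.append(text)
    return cleaned

# e.g. _text_strip_sketch(["text--text", ".text"]) == ["text-text", "text"]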
def main(args):
    """Preprocess the news dataset.

    Args:
        args: command line arguments

    Raises:
        ValueError: when the dataset cannot be found in the path provided or
            when the number of samples is negative
    """
    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    data_file_1 = args.news_summary_path
    data_file_2 = args.news_summary_more_path

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        if not os.path.isfile(os.path.expanduser(data_file_1)):
            raise ValueError(
                "Cannot find " + os.path.expanduser(data_file_1) +
                ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
            )
        if not os.path.isfile(os.path.expanduser(data_file_2)):
            raise ValueError(
                "Cannot find " + os.path.expanduser(data_file_2) +
                ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
            )

        dataset1 = (pd.read_csv(data_file_1,
                                encoding='iso-8859-1')).iloc[:, 0:6].copy()
        dataset2 = (pd.read_csv(data_file_2,
                                encoding='iso-8859-1')).iloc[:, 0:2].copy()

        dataset = pd.DataFrame()
        dataset['sentences'] = pd.concat([dataset1['text'], dataset2['text']],
                                         ignore_index=True)
        dataset['summaries'] = pd.concat(
            [dataset1['headlines'], dataset2['headlines']], ignore_index=True)

        cleaned_sentences = preprocess_utils.text_strip(dataset['sentences'])
        cleaned_summaries = preprocess_utils.text_strip(dataset['summaries'])

        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists. Now splitting dataset.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise Exception("Number of samples must be non-negative integers")

    train_data_file = data_dir + "/train.tsv"
    train_sentences, train_summaries, train_grammar, train_meaning = __process_file(
        train_data_file)

    test_data_file = data_dir + "/test.tsv"
    test_sentences, test_summaries, test_grammar, test_meaning = __process_file(
        test_data_file)

    valid_data_file = data_dir + "/valid.tsv"
    valid_sentences, valid_summaries, valid_grammar, valid_meaning = __process_file(
        valid_data_file)

    tot_sentences = train_sentences + test_sentences + valid_sentences
    tot_summaries = train_summaries + test_summaries + valid_summaries
    tot_grammar = train_grammar + test_grammar + valid_grammar
    tot_meaning = train_meaning + test_meaning + valid_meaning

    cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
    cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for i in range(len(spaced_sentences)):
            tsv_writer.writerow([
                spaced_sentences[i], spaced_summaries[i], tot_grammar[i],
                tot_meaning[i]
            ])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH, "-------")

    print("-------Now splitting dataset.-------")
    if num_of_tuning_sam + num_of_valid_sam > len(spaced_sentences):
        raise Exception(
            "The number of tuning and validation samples together exceeds the total sample size of "
            + str(len(spaced_sentences)))

    sentence_shuffled = []
    summary_shuffled = []
    grammar_shuffled = []
    meaning_shuffled = []

    # Shuffle indices within the tuning, validation, and training ranges separately.
    tune_shuffled = list(range(num_of_tuning_sam))
    random.shuffle(tune_shuffled)
    valid_shuffled = list(
        range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam))
    random.shuffle(valid_shuffled)
    train_shuffled = list(
        range(num_of_tuning_sam + num_of_valid_sam, len(spaced_sentences)))
    random.shuffle(train_shuffled)
    index_shuffled = tune_shuffled + valid_shuffled + train_shuffled

    for i in index_shuffled:
        sentence_shuffled.append(spaced_sentences[i])
        summary_shuffled.append(spaced_summaries[i])
        grammar_shuffled.append(tot_grammar[i])
        meaning_shuffled.append(tot_meaning[i])

    tuning_range = range(num_of_tuning_sam)
    valid_range = range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam)
    training_range = range(num_of_tuning_sam + num_of_valid_sam,
                           len(summary_shuffled))

    output_for_grammar_files = [summary_shuffled, grammar_shuffled]
    __write_to_file(TUNE_FILE_PATH_GRAMMAR, tuning_range,
                    output_for_grammar_files)
    __write_to_file(VALID_FILE_PATH_GRAMMAR, valid_range,
                    output_for_grammar_files)
    __write_to_file(TRAIN_FILE_PATH_GRAMMAR, training_range,
                    output_for_grammar_files)

    output_for_meaning_files = [
        sentence_shuffled, summary_shuffled, meaning_shuffled
    ]
    __write_to_file(TUNE_FILE_PATH_MEANING, tuning_range,
                    output_for_meaning_files)
    __write_to_file(VALID_FILE_PATH_MEANING, valid_range,
                    output_for_meaning_files)
    __write_to_file(TRAIN_FILE_PATH_MEANING, training_range,
                    output_for_meaning_files)
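# __write_to_file is defined elsewhere in this script; the sketch below is an assumption
# about its behavior, inferred only from the call sites above: it writes one TSV row per
# index in index_range, taking the i-th element from each parallel list in columns.
def __write_to_file_sketch(file_path, index_range, columns):
    with open(os.path.expanduser(file_path), "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter="\t")
        for i in index_range:
            tsv_writer.writerow([column[i] for column in columns])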
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.

    Raises:
        ValueError: when the number of samples is negative.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        train_data_file = data_dir + "/train.tsv"
        train_sentences, train_summaries, train_ratings, train_excluded = __process_file(
            train_data_file)

        test_data_file = data_dir + "/test.tsv"
        test_sentences, test_summaries, test_ratings, test_excluded = __process_file(
            test_data_file)

        valid_data_file = data_dir + "/valid.tsv"
        valid_sentences, valid_summaries, valid_ratings, valid_excluded = __process_file(
            valid_data_file)

        tot_sentences = train_sentences + test_sentences + valid_sentences
        tot_summaries = train_summaries + test_summaries + valid_summaries
        tot_ratings = train_ratings + test_ratings + valid_ratings
        tot_excluded = train_excluded + test_excluded + valid_excluded

        cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
        cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        print("Total number of excluded samples is", tot_excluded)
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(
            cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(
            cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH),
                  'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists. Now splitting dataset.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
def __format_data():
    """Format the dataset and clean up special characters.

    Returns:
        cleaned_sentences: a list of cleaned input sentences
        cleaned_summaries: a list of cleaned summaries corresponding to the input sentences
    """
    print("-------Processing original sentences-------")
    # Extract the '"sentence":' lines from each training shard and from the eval file
    # into temporary text files under the user's home directory.
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"sentence":\' > ~/' + TEMP_FOLDER_NAME +
                        '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))
    subprocess.call('cat comp-data.eval.json | grep \'"sentence":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    sentences = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        # Keep only every other '"sentence":' line, slicing off the JSON key prefix
        # and the trailing quote, comma, and newline.
        odd_line = True
        for line in f:
            if odd_line:
                sentences.append(line[17:-3])
            odd_line = not odd_line
        f.close()
    cleaned_sentences = preprocess_utils.text_strip(sentences)

    print("-------Processing summaries-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"headline":\' > ~/' + TEMP_FOLDER_NAME +
                        '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))
    subprocess.call('cat comp-data.eval.json | grep \'"headline":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    summaries = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        for line in f:
            summaries.append(line[15:-3])
        f.close()
    cleaned_summaries = preprocess_utils.text_strip(summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    return cleaned_sentences, cleaned_summaries
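# Hypothetical pure-Python equivalent of the grep pipelines above, shown only to make
# the extraction step explicit: it keeps the lines containing a given JSON key, just as
# the shell command does, without writing intermediate files. The real script uses the
# subprocess calls in __format_data; this sketch is not part of it.
def __grep_key_lines_sketch(json_file_path, key):
    matches = []
    with open(os.path.expanduser(json_file_path), "r") as f:
        for line in f:
            if '"' + key + '":' in line:
                matches.append(line)
    return matches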
def main(args):
    """Preprocess the CoLA grammar dataset.

    Args:
        args: command line arguments.
    """
    data_file = os.path.expanduser(args.raw_data_file)
    if not os.path.isfile(data_file):
        raise Exception("Data file not found.")

    sentences_positive = []
    sentences_negative = []
    with open(data_file) as tsv_file:
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        for line in read_tsv:
            if int(line[1]) == 1:
                sentences_positive.append(line[3])
            else:
                sentences_negative.append(line[3])

    cleaned_sentences_positive = preprocess_utils.text_strip(
        sentences_positive)
    cleaned_sentences_negative = preprocess_utils.text_strip(
        sentences_negative)

    print("Number of samples is",
          len(cleaned_sentences_positive) + len(cleaned_sentences_negative))
    print("Number of incorrect samples is", len(cleaned_sentences_negative),
          "and number of correct samples is", len(cleaned_sentences_positive))

    spaced_sentences_positive = preprocess_utils.tokenize_with_space(
        cleaned_sentences_positive)
    spaced_sentences_negative = preprocess_utils.tokenize_with_space(
        cleaned_sentences_negative)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for positive_sentence in spaced_sentences_positive:
            tsv_writer.writerow([positive_sentence, "1"])
        for negative_sentence in spaced_sentences_negative:
            tsv_writer.writerow([negative_sentence, "0"])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH, "-------")

    print("-------Now mixing dataset with the MS dataset.-------")
    MS_data_file = os.path.expanduser(args.MS_data_file)
    if not os.path.isfile(MS_data_file):
        raise Exception("Microsoft data file not found.")

    MS_sentences = []
    MS_ratings = []
    number_of_MS_samples_in_each_category = [0, 0]
    with open(MS_data_file) as tsv_file:
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        for line in read_tsv:
            MS_sentences.append(line[0])
            MS_ratings.append(int(line[1]))
            number_of_MS_samples_in_each_category[int(line[1])] += 1

    max_negative_rate = (number_of_MS_samples_in_each_category[0] +
                         len(cleaned_sentences_negative)) / \
                        (sum(number_of_MS_samples_in_each_category) +
                         len(cleaned_sentences_negative))
    min_negative_rate = (number_of_MS_samples_in_each_category[0] +
                         len(cleaned_sentences_negative)) / \
                        (sum(number_of_MS_samples_in_each_category) +
                         len(cleaned_sentences_positive) +
                         len(cleaned_sentences_negative))

    goal_percentage = args.goal_percentage_of_neg_samples
    if goal_percentage is None:
        number_of_pos_sample_to_include = 0
    else:
        if goal_percentage > max_negative_rate:
            raise Exception(
                "The goal negative sample percentage is greater than the largest "
                "possible value {:.2f}".format(max_negative_rate))
        if goal_percentage < min_negative_rate:
            raise Exception(
                "The goal negative sample percentage is smaller than the smallest "
                "possible value {:.2f}".format(min_negative_rate))
        number_of_pos_sample_to_include = int(
            (1 - goal_percentage) / goal_percentage *
            (len(cleaned_sentences_negative) +
             number_of_MS_samples_in_each_category[0]) -
            number_of_MS_samples_in_each_category[1])

    print("------- Including", number_of_pos_sample_to_include,
          "samples from the CoLA dataset.")
    MS_sentences = MS_sentences + \
        spaced_sentences_positive[0:number_of_pos_sample_to_include] + \
        spaced_sentences_negative
    MS_ratings = MS_ratings + [1] * number_of_pos_sample_to_include + \
        [0] * len(spaced_sentences_negative)

    actual_negative_rate = (number_of_MS_samples_in_each_category[0] +
                            len(spaced_sentences_negative)) / \
                           (sum(number_of_MS_samples_in_each_category) +
                            len(spaced_sentences_negative) +
                            number_of_pos_sample_to_include)
    print("-------The percentage of negative samples is",
          "{:.2f}".format(actual_negative_rate), "-------")

    shuffled_index = list(range(len(MS_sentences)))
    random.shuffle(shuffled_index)
    with open(os.path.expanduser(MIXED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for index in shuffled_index:
            tsv_writer.writerow([MS_sentences[index], MS_ratings[index]])
    print("-------", len(MS_sentences), "samples saved to", MIXED_FILE_PATH,
          "-------")
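# Worked check of the mixing arithmetic above (illustrative numbers only). Let p be the
# goal fraction of negative samples, N the total number of negative samples
# (MS negatives plus all CoLA negatives), and M the number of MS positives. Requiring
# N / (N + M + x) == p for x extra CoLA positives gives x = (1 - p) / p * N - M, which
# is exactly the expression used for number_of_pos_sample_to_include. For example, with
# p = 0.5, N = 100 and M = 80: x = 100 - 80 = 20, and 100 / (100 + 80 + 20) = 0.5.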