def test_delete_empty_entry(self):
    sentences = []
    summaries = []
    for i in range(3):
        sentences.append("text " + str(i))
        summaries.append("sum " + str(i))
    # Two additional pairs with blank summaries should be dropped.
    sentences += ["text 3", "text 4"]
    summaries += [" "] * 2
    sentences_cleaned, summaries_cleaned = preprocess_utils.delete_empty_entry(
        sentences, summaries)
    self.assertEqual(len(sentences_cleaned), 3)
    self.assertEqual(len(summaries_cleaned), 3)
    try:
        preprocess_utils.validate_dataset(sentences_cleaned, summaries_cleaned)
    except Exception:
        self.fail("validate_dataset raised Exception unexpectedly!")
    # Each surviving sentence must still be paired with its original summary.
    for sentence in sentences_cleaned:
        self.assertEqual(
            summaries[sentences.index(sentence)],
            summaries_cleaned[sentences_cleaned.index(sentence)])
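# NOTE: The helper below is a minimal illustrative sketch of the behavior this
# test exercises, inferred from the test's expectations. It is not the actual
# preprocess_utils.delete_empty_entry implementation, and the name is
# hypothetical.
def _delete_empty_entry_sketch(sentences, summaries):
    """Drop every (sentence, summary) pair in which either side is blank."""
    kept = [(sent, summ) for sent, summ in zip(sentences, summaries)
            if sent.strip() and summ.strip()]
    cleaned_sentences = [sent for sent, _ in kept]
    cleaned_summaries = [summ for _, summ in kept]
    return cleaned_sentences, cleaned_summaries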
def main(args):
    """Preprocess the Reddit dataset.

    Args:
        args: command line arguments.

    Raises:
        ValueError: when the number of samples is specified to be negative.
    """
    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        ds = tfds.load('reddit_tifu', split='train', shuffle_files=True)
        sentences = []
        summaries = []
        for row in ds:
            summary = row["title"]
            sentence = row["tldr"]
            sentences.append(sentence.numpy().decode('UTF-8'))
            summaries.append(summary.numpy().decode('UTF-8'))

        cleaned_sentences = preprocess_utils.text_strip(sentences)
        cleaned_summaries = preprocess_utils.text_strip(summaries)
        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
def __preprocess_input(input_file_path, whether_score):
    """Preprocess the input sentences to fit the format of the LaserTagger input.

    Args:
        input_file_path: the absolute path to the input file.
        whether_score: whether scoring is needed. If scoring is needed, two
            columns are expected in the input file.

    Returns:
        sentences: a list of input sentences.
        summaries: a list of summaries.

    Raises:
        Exception: if scoring is required but the target is not found in the
            input file.
    """
    if not os.path.isfile(os.path.expanduser(input_file_path)):
        __clean_up()
        raise Exception("The input file does not exist")

    print("-------Cleaning inputs-------")
    tsv_file = open(os.path.expanduser(input_file_path))
    read_tsv = csv.reader(tsv_file, delimiter="\t")

    sentences = []
    summaries = []
    for row in read_tsv:
        sentences.append(row[0])
        if whether_score:
            try:
                summaries.append(row[1])
            except IndexError:
                tsv_file.close()
                __clean_up()
                raise Exception(
                    "Whether_score is true. Expected a target but only found "
                    "one column in the input.")
    tsv_file.close()

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    if whether_score:
        cleaned_summaries = preprocess_utils.text_strip(summaries)
    else:
        cleaned_summaries = cleaned_sentences

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    if whether_score:
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)
    else:
        spaced_summaries = spaced_sentences

    # Capture the returned lists; discarding them (as before) made this final
    # cleanup a no-op.
    spaced_sentences, spaced_summaries = preprocess_utils.delete_empty_entry(
        spaced_sentences, spaced_summaries)
    return spaced_sentences, spaced_summaries
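# Illustrative only: a sketch of writing an input file that __preprocess_input
# can consume when whether_score is True (two tab-separated columns:
# sentence <TAB> target summary). With whether_score False, a single column
# suffices. The helper name and path are hypothetical.
import csv
import os


def _write_sample_scoring_input(path="~/sample_scoring_input.tsv"):
    expanded = os.path.expanduser(path)
    with open(expanded, "wt") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["This is an input sentence.", "A target summary."])
        writer.writerow(["Another input sentence.", "Another target summary."])
    return expanded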
def test_validate_dataset_unequal(self):
    sentences = ["text"] * 5
    summaries = ["text"] * 7
    with self.assertRaises(Exception):
        preprocess_utils.validate_dataset(sentences, summaries)

    sentences = ["text"] * 7
    summaries = ["text"] * 7
    try:
        preprocess_utils.validate_dataset(sentences, summaries)
    except Exception:
        self.fail("validate_dataset raised Exception unexpectedly!")
def test_preprocess_input_without_scoring_and_only_one_column(self):
    with open(TEMP_TESTING_FILE, "wt") as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        for i in range(10):
            tsv_writer.writerow(["Sample" + str(i)])

    sentences, summaries = preprocess_input(
        input_file_path=TEMP_TESTING_FILE, whether_score=False)
    preprocess_utils.validate_dataset(sentences, summaries)
    # Without scoring, each summary defaults to its corresponding sentence.
    for i in range(10):
        self.assertEqual(sentences[i], "Sample" + str(i))
        self.assertEqual(summaries[i], "Sample" + str(i))
def test_validate_dataset_empty(self):
    sentences = [" "] * 5
    summaries = ["text"] * 5
    with self.assertRaises(Exception):
        preprocess_utils.validate_dataset(sentences, summaries)

    sentences = ["text"] * 5
    summaries = [" "] * 5
    with self.assertRaises(Exception):
        preprocess_utils.validate_dataset(sentences, summaries)

    sentences = ["text"] * 5
    summaries = ["text"] * 3 + [" "] * 2
    with self.assertRaises(Exception):
        preprocess_utils.validate_dataset(sentences, summaries)
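# NOTE: A minimal sketch of the checks these tests imply, assuming
# validate_dataset rejects mismatched lengths and blank entries. This is not
# the actual preprocess_utils.validate_dataset, and the name is hypothetical.
def _validate_dataset_sketch(sentences, summaries):
    """Raise if the lists differ in length or contain blank entries."""
    if len(sentences) != len(summaries):
        raise Exception("The number of sentences and summaries must match.")
    for sent, summ in zip(sentences, summaries):
        if not sent.strip() or not summ.strip():
            raise Exception("Empty entries are not allowed in the dataset.")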
def main(args):
    """Preprocess the news dataset.

    Args:
        args: command line arguments.

    Raises:
        ValueError: when the number of samples is negative, or when the
            dataset cannot be found in the path provided.
    """
    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    data_file_1 = args.news_summary_path
    data_file_2 = args.news_summary_more_path

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        if not os.path.isfile(os.path.expanduser(data_file_1)):
            raise ValueError(
                "Cannot find " + os.path.expanduser(data_file_1) +
                ". If necessary, please download from "
                "https://www.kaggle.com/sunnysai12345/news-summary")
        if not os.path.isfile(os.path.expanduser(data_file_2)):
            raise ValueError(
                "Cannot find " + os.path.expanduser(data_file_2) +
                ". If necessary, please download from "
                "https://www.kaggle.com/sunnysai12345/news-summary")

        dataset1 = (pd.read_csv(data_file_1,
                                encoding='iso-8859-1')).iloc[:, 0:6].copy()
        dataset2 = (pd.read_csv(data_file_2,
                                encoding='iso-8859-1')).iloc[:, 0:2].copy()

        dataset = pd.DataFrame()
        dataset['sentences'] = pd.concat([dataset1['text'], dataset2['text']],
                                         ignore_index=True)
        dataset['summaries'] = pd.concat(
            [dataset1['headlines'], dataset2['headlines']], ignore_index=True)

        cleaned_sentences = preprocess_utils.text_strip(dataset['sentences'])
        cleaned_summaries = preprocess_utils.text_strip(dataset['summaries'])
        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.

    Raises:
        ValueError: when the number of samples is negative.
        Exception: when the data directory cannot be found.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation
    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    train_data_file = data_dir + "/train.tsv"
    train_sentences, train_summaries, train_grammar, train_meaning = __process_file(
        train_data_file)
    test_data_file = data_dir + "/test.tsv"
    test_sentences, test_summaries, test_grammar, test_meaning = __process_file(
        test_data_file)
    valid_data_file = data_dir + "/valid.tsv"
    valid_sentences, valid_summaries, valid_grammar, valid_meaning = __process_file(
        valid_data_file)

    tot_sentences = train_sentences + test_sentences + valid_sentences
    tot_summaries = train_summaries + test_summaries + valid_summaries
    tot_grammar = train_grammar + test_grammar + valid_grammar
    tot_meaning = train_meaning + test_meaning + valid_meaning

    cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
    cleaned_summaries = preprocess_utils.text_strip(tot_summaries)
    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for i in range(len(spaced_sentences)):
            tsv_writer.writerow([
                spaced_sentences[i], spaced_summaries[i], tot_grammar[i],
                tot_meaning[i]
            ])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")

    print("-------Now splitting dataset.-------")
    if num_of_tuning_sam + num_of_valid_sam > len(spaced_sentences):
        raise Exception(
            "The number of tuning and validation samples together exceeds the "
            "total sample size of " + str(len(spaced_sentences)))

    # Shuffle indices within each partition so the tuning, validation, and
    # training splits keep their sizes but their internal order is random.
    sentence_shuffled = []
    summary_shuffled = []
    grammar_shuffled = []
    meaning_shuffled = []

    tune_shuffled = list(range(num_of_tuning_sam))
    random.shuffle(tune_shuffled)
    valid_shuffled = list(
        range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam))
    random.shuffle(valid_shuffled)
    train_shuffled = list(
        range(num_of_tuning_sam + num_of_valid_sam, len(spaced_sentences)))
    random.shuffle(train_shuffled)
    index_shuffled = tune_shuffled + valid_shuffled + train_shuffled

    for i in index_shuffled:
        sentence_shuffled.append(spaced_sentences[i])
        summary_shuffled.append(spaced_summaries[i])
        grammar_shuffled.append(tot_grammar[i])
        meaning_shuffled.append(tot_meaning[i])

    tuning_range = range(num_of_tuning_sam)
    valid_range = range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam)
    training_range = range(num_of_tuning_sam + num_of_valid_sam,
                           len(summary_shuffled))

    output_for_grammar_files = [summary_shuffled, grammar_shuffled]
    __write_to_file(TUNE_FILE_PATH_GRAMMAR, tuning_range,
                    output_for_grammar_files)
    __write_to_file(VALID_FILE_PATH_GRAMMAR, valid_range,
                    output_for_grammar_files)
    __write_to_file(TRAIN_FILE_PATH_GRAMMAR, training_range,
                    output_for_grammar_files)

    output_for_meaning_files = [
        sentence_shuffled, summary_shuffled, meaning_shuffled
    ]
    __write_to_file(TUNE_FILE_PATH_MEANING, tuning_range,
                    output_for_meaning_files)
    __write_to_file(VALID_FILE_PATH_MEANING, valid_range,
                    output_for_meaning_files)
    __write_to_file(TRAIN_FILE_PATH_MEANING, training_range,
                    output_for_meaning_files)
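# Illustrative only: the per-partition index-shuffling scheme used above,
# reduced to a tiny, self-contained helper. The function name, seed, and
# example sizes are made up for demonstration.
import random


def _split_indices(total, num_tune, num_valid, seed=0):
    """Shuffle indices within each partition while keeping partition sizes."""
    rng = random.Random(seed)
    tune = list(range(num_tune))
    valid = list(range(num_tune, num_tune + num_valid))
    train = list(range(num_tune + num_valid, total))
    for part in (tune, valid, train):
        rng.shuffle(part)
    return tune + valid + train


# e.g. _split_indices(10, 2, 3) keeps indices 0-1 in the tuning slice,
# 2-4 in the validation slice, and 5-9 in the training slice, each in a
# random internal order.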
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.

    Raises:
        ValueError: when the number of samples is negative.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation
    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        train_data_file = data_dir + "/train.tsv"
        train_sentences, train_summaries, train_ratings, train_excluded = __process_file(
            train_data_file)
        test_data_file = data_dir + "/test.tsv"
        test_sentences, test_summaries, test_ratings, test_excluded = __process_file(
            test_data_file)
        valid_data_file = data_dir + "/valid.tsv"
        valid_sentences, valid_summaries, valid_ratings, valid_excluded = __process_file(
            valid_data_file)

        tot_sentences = train_sentences + test_sentences + valid_sentences
        tot_summaries = train_summaries + test_summaries + valid_summaries
        tot_ratings = train_ratings + test_ratings + valid_ratings
        tot_excluded = train_excluded + test_excluded + valid_excluded

        cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
        cleaned_summaries = preprocess_utils.text_strip(tot_summaries)
        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        print("Total number of excluded samples is", tot_excluded)
        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)

        spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists.-------")

    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
def __format_data():
    """Format the dataset and clean up special characters.

    Returns:
        cleaned_sentences: a list of cleaned input sentences.
        cleaned_summaries: a list of cleaned summaries corresponding to the
            input sentences.
    """
    print("-------Processing original sentences-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"sentence":\' > ~/' + TEMP_FOLDER_NAME +
                        '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))
    subprocess.call('cat comp-data.eval.json | grep \'"sentence":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    sentences = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        # Each record matches the '"sentence":' pattern twice, so keep every
        # other grep'd line.
        odd_line = True
        for line in f:
            if odd_line:
                sentences.append(line[17:-3])
            odd_line = not odd_line
        f.close()
    cleaned_sentences = preprocess_utils.text_strip(sentences)

    print("-------Processing summaries-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"headline":\' > ~/' + TEMP_FOLDER_NAME +
                        '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))
    subprocess.call('cat comp-data.eval.json | grep \'"headline":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    summaries = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        for line in f:
            summaries.append(line[15:-3])
        f.close()
    cleaned_summaries = preprocess_utils.text_strip(summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))
    return cleaned_sentences, cleaned_summaries
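# Illustrative only: why the slice offsets above recover just the quoted text.
# The sample lines below assume the dataset's JSON files indent '"sentence":'
# lines by four spaces and '"headline":' lines by two; that indentation is an
# assumption, not verified from the data.
sample_sentence_line = '    "sentence": "An example sentence.",\n'
assert sample_sentence_line[17:-3] == "An example sentence."

sample_headline_line = '  "headline": "An example headline.",\n'
assert sample_headline_line[15:-3] == "An example headline."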