def n_grams_train(name: str, file_name: Optional[str] = None,
                  clean_data: Optional[pd.DataFrame] = None,
                  n_grams: int = default_n_grams,
                  fill_in_blank: bool = False) -> NGramsModel:
    """
    n-grams training

    get a dictionary of grams to a dictionary of subsequent words and their counts
    """
    if file_name is None and clean_data is None:
        raise ValueError('no file name or tokens provided')

    # get training data
    if clean_data is None:
        file_path = file_path_relative(f'{clean_data_folder}/{file_name}')
        logger.info(f'reading data from {file_path}')
        clean_data = pd.read_csv(file_path, converters={
            sentences_key: literal_eval})

    tokens: List[List[str]] = clean_data[sentences_key]
    average_sentence_len = np.average([len(sentence) for sentence in tokens])
    if average_sentence_len < n_grams:
        raise ValueError(
            f'n-grams of {n_grams} is greater than average sentence ' +
            f'length of {average_sentence_len} in training data')

    if fill_in_blank and n_grams > 1:
        n_grams -= 1

    # create n-gram model
    n_grams_res = NGramsModel(n_grams)

    # train model with sliding window
    for sentence in tokens:
        for i in range(len(sentence) - n_grams):
            sequence = ' '.join(sentence[i: i + n_grams])
            if sequence not in n_grams_res.model:
                n_grams_res.model[sequence] = NGramsSequence(sequence)
            next_word = sentence[i + n_grams]
            n_grams_res.model[sequence].add_grams(next_word)

    # create aggregate objects
    n_grams_res.generate_aggregates()

    # save to disk
    json_file_path = file_path_relative(f'{models_folder}/{name}.json')
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(n_grams_res, file, ensure_ascii=False,
                  indent=4, sort_keys=True)

    return n_grams_res
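# Hedged usage sketch (an assumption, not part of the original module): the
# model name and CSV file below are hypothetical placeholders; n_grams_train
# only needs a cleaned CSV with the sentences column (or an in-memory DataFrame).
trained_model = n_grams_train(
    name='shakespeare_bigrams',          # hypothetical output model name
    file_name='shakespeare_clean.csv',   # hypothetical cleaned training data
    n_grams=2)
# each key in trained_model.model is a space-joined n-gram mapping to the
# counts of words observed immediately after it in the training sentences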
def read_data_attention(
    strategy: tf.distribute.TPUStrategy,
    max_len: int,
) -> Tuple[np.array, np.array, np.array, np.array,
           tf.data.Dataset, tf.data.Dataset, tf.data.Dataset, int]:
    """
    read data for attention models
    """
    logger.info('reading data for attention models')

    # batch with number of tpu's
    batch_size = 16 * strategy.num_replicas_in_sync
    auto = tf.data.experimental.AUTOTUNE

    # First load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')

    train = pd.read_csv(file_path_relative('jigsaw-toxic-comment-train.csv'))
    valid = pd.read_csv(file_path_relative('validation.csv'))
    test = pd.read_csv(file_path_relative('test.csv'))

    x_train = _run_encode(train['comment_text'].astype(str),
                          tokenizer, maxlen=max_len)
    x_valid = _run_encode(valid['comment_text'].astype(str),
                          tokenizer, maxlen=max_len)
    x_test = _run_encode(test['content'].astype(str),
                         tokenizer, maxlen=max_len)

    y_train = train['toxic'].values
    y_valid = valid['toxic'].values

    train_dataset = (tf.data.Dataset.from_tensor_slices((x_train, y_train))
                     .repeat()
                     .shuffle(2048)
                     .batch(batch_size)
                     .prefetch(auto))
    valid_dataset = (tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
                     .batch(batch_size)
                     .cache()
                     .prefetch(auto))
    test_dataset = (tf.data.Dataset.from_tensor_slices(x_test)
                    .batch(batch_size))

    # return all datasets
    return x_train, x_valid, y_train, y_valid, train_dataset, valid_dataset, \
        test_dataset, batch_size
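# _run_encode is referenced above but defined elsewhere; this is a hedged
# sketch of one plausible implementation (an assumption, not the original
# helper), reusing the module's numpy import and the Hugging Face
# batch_encode_plus API to return a padded matrix of input ids.
def _run_encode_sketch(texts, tokenizer, maxlen: int) -> np.array:
    encoded = tokenizer.batch_encode_plus(
        list(texts),
        padding='max_length',   # pad every sequence to maxlen
        truncation=True,        # clip longer comments
        max_length=maxlen)
    return np.array(encoded['input_ids'])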
def build_embeddings(embedding_size_y: int,
                     word_indexes: Dict[str, int]) -> np.array:
    """
    build embeddings to be used with basic models
    """
    logger.info('build glove embeddings')

    embeddings_indexes: Dict[str, np.array] = {}

    # open glove 840B file
    with open(file_path_relative(
            f'glove.840B.{embedding_size_y}d.txt',
            base_folder=default_base_folder if not IN_KAGGLE else 'glove840b300dtxt'),
            encoding='utf-8') as glove_file:
        for line in tqdm(glove_file):
            words = line.rstrip().split(' ')
            # some glove.840B entries contain spaces in the key itself,
            # so read the coefficients from the end of the line
            word = ' '.join(words[:-embedding_size_y])
            coefficients = np.asarray(
                [float(val) for val in words[-embedding_size_y:]])
            embeddings_indexes[word] = coefficients
    logger.info(f'Found {len(embeddings_indexes)} word vectors.')

    embedding_size_x: int = len(word_indexes) + 1
    embeddings_output = np.zeros((embedding_size_x, embedding_size_y))
    for word, i in tqdm(word_indexes.items()):
        word_embedding = embeddings_indexes.get(word)
        if word_embedding is not None:
            embeddings_output[i] = word_embedding

    return embeddings_output
def read_data(
) -> Tuple[np.array, np.array, np.array, np.array, int, Dict[str, int]]:
    """
    read data from raw data, convert to dataframes
    """
    logger.info('reading data')

    train = pd.read_csv(file_path_relative('jigsaw-toxic-comment-train.csv'))

    # drop unused columns
    train.drop(
        ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        axis=1, inplace=True)

    # only use first n rows
    train = train.loc[:NUM_ROWS_TRAIN, :]
    logger.info(f'shape of training data: {train.shape}')

    max_len = train['comment_text'].apply(
        lambda x: len(str(x).split())).max()
    logger.info(f'max len: {max_len}')

    # train test split
    x_train, x_valid, y_train, y_valid = train_test_split(
        train['comment_text'].values, train['toxic'].values,
        stratify=train['toxic'].values,
        test_size=TEST_RATIO, shuffle=True)

    tokens = tf.keras.preprocessing.text.Tokenizer(num_words=None)
    all_data: List[str] = list(x_train)
    all_data.extend(list(x_valid))
    tokens.fit_on_texts(all_data)
    x_train_sequences = tokens.texts_to_sequences(x_train)
    x_valid_sequences = tokens.texts_to_sequences(x_valid)

    # pad the data with zeros
    x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
        x_train_sequences, maxlen=max_len)
    x_valid_padded = tf.keras.preprocessing.sequence.pad_sequences(
        x_valid_sequences, maxlen=max_len)
    word_indexes = tokens.word_index

    return x_train_padded, x_valid_padded, y_train, y_valid, max_len, word_indexes
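# Hedged usage sketch: chain read_data with build_embeddings above; 300 is
# assumed to match the glove.840B.300d.txt file name pattern, and the variable
# names are illustrative only.
x_train, x_valid, y_train, y_valid, max_len, word_indexes = read_data()
embedding_matrix = build_embeddings(300, word_indexes)
# embedding_matrix has shape (len(word_indexes) + 1, 300); row i holds the
# GloVe vector for the word with index i, or zeros if the word is out of vocabulary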
def _plot_attention(attention, sentence, predicted_sentence, name: str):
    """
    plot the attention weights
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    if IN_NOTEBOOK:
        plt.show()
    else:
        file_path = file_path_relative(f'{output_folder}/attention_{name}.jpg')
        plt.savefig(file_path)
def _plot_train_val_loss(training_loss: List[float], model_name: str) -> None:
    """
    plots the training loss over epochs
    """
    plt.figure()
    num_epochs = len(training_loss)
    nums = range(1, num_epochs + 1)
    plt.plot(nums, training_loss, label="train")
    plt.title(f"{model_name} Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    if IN_NOTEBOOK:
        plt.show()
    else:
        file_path = file_path_relative(
            f'{output_folder}/{model_name}.jpg')
        plt.savefig(file_path)
def on_epoch_end(self, epoch, logs=None):
    """
    runs on end of each epoch
    """
    if logs is None:
        return
    self.logs.append(logs)
    self.losses.append(logs.get('loss'))
    self.accuracy.append(logs.get('accuracy'))
    if len(self.losses) > 1:
        nums = np.arange(0, len(self.losses))
        plt.style.use("seaborn")
        plt.figure()
        plt.plot(nums, self.losses, label="train loss")
        plt.plot(nums, self.accuracy, label="train accuracy")
        plt.title(f"Training Loss and Accuracy for Epoch {epoch}")
        plt.xlabel("Epoch #")
        plt.ylabel("Loss & Accuracy")
        plt.legend()
        file_path = file_path_relative(
            f'{output_folder}/rnn_{self.name}_train_epoch_{epoch}.png')
        plt.savefig(file_path)
def n_grams_predict_next(name: str, file_name: Optional[str] = None,
                         model: Optional[NGramsModel] = None,
                         clean_input_file: Optional[str] = None,
                         clean_input_data: Optional[pd.DataFrame] = None,
                         num_lines_predict: Optional[int] = None,
                         n_grams: int = default_n_grams,
                         num_predict: int = 1,
                         smoothing: SmoothingType = SmoothingType.basic) -> None:
    """
    predict the next word(s) in the set
    """
    logger.success(f'predicting with {smoothing.name} for {name}')
    if file_name is None and model is None:
        raise ValueError('no file name or model provided')
    if clean_input_file is None and clean_input_data is None:
        raise ValueError('no input file name or data provided')

    # read n-gram model from disk if not provided
    if model is None:
        json_file_path = file_path_relative(f'{models_folder}/{file_name}')
        logger.info(f'reading data from {json_file_path}')
        with open(json_file_path, 'r') as file:
            model = NGramsModel.from_json(json.load(file))

    # get testing data
    if clean_input_data is None:
        file_path = file_path_relative(
            f'{clean_data_folder}/{clean_input_file}')
        logger.info(f'reading data from {file_path}')
        clean_input_data = pd.read_csv(file_path, converters={
            sentences_key: literal_eval})

    predict_sentences: List[List[str]] = clean_input_data[sentences_key]
    if num_lines_predict is not None:
        predict_sentences = predict_sentences[:num_lines_predict]

    check_probability_smoothing: List[SmoothingType] = [SmoothingType.basic]

    logger.success('[[<words>]] = predicted words:')

    sum_probability_log: float = 0.
    count_all_predict: int = 0

    # iterate over testing data
    for i, sentence in enumerate(predict_sentences):
        full_sentence = sentence.copy()
        for _ in range(num_predict):
            last_words = full_sentence[-n_grams:]
            sequence = ' '.join(last_words)
            probabilities = model.get_probabilities(sequence, smoothing)
            sum_probability = sum(elem[1] for elem in probabilities)
            # logger.info(f'probabilities: sum: {sum_probability}, all: {probabilities}')
            if smoothing in check_probability_smoothing:
                # for not-unseen outputs, check to
                # make sure sum is approximately 1
                assert np.isclose(sum_probability, 1), \
                    f'probability of {sum_probability} is not close to 1'
            current_output, prob = probabilities[0]
            full_sentence.append(current_output)
            # if not unseen, add to perplexity calculation
            if current_output != unseen_output:
                sum_probability_log += np.log(prob)
                count_all_predict += 1
        logger.info(
            f"{i + 1}. {' '.join(sentence)} [[{' '.join(full_sentence[len(sentence):])}]]")

    if count_all_predict == 0:
        logger.info('no predictions, no perplexity')
    else:
        total_loss = -1 * sum_probability_log
        perplexity: float = np.exp(total_loss / count_all_predict)
        logger.info(f"perplexity: {perplexity}")
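# Hedged usage sketch: run prediction with the in-memory model returned by
# n_grams_train above; the test CSV name is a hypothetical placeholder.
n_grams_predict_next(
    name='shakespeare_bigrams',
    model=trained_model,                    # NGramsModel from n_grams_train
    clean_input_file='shakespeare_test.csv',
    num_lines_predict=10,
    n_grams=2,
    num_predict=1,
    smoothing=SmoothingType.basic)
# perplexity is reported as exp(-mean log probability) over the predicted words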
def rnn_predict_next(
        name: str,
        text_vectorization_model: Optional[tf.keras.models.Sequential] = None,
        clean_input_file: Optional[str] = None,
        clean_input_data: Optional[pd.DataFrame] = None,
        num_lines_predict: Optional[int] = None,
        num_predict: int = 1) -> None:
    """
    predict next word(s) with given input
    """
    logger.success(f'running predictions for {name}')
    if clean_input_file is None and clean_input_data is None:
        raise ValueError('no input file name or data provided')

    # build model and load trained weights from disk
    model = build_model(1)
    rnn_filepath = file_path_relative(f'{rnn_folder}/{name}/{rnn_file_name}')
    model.load_weights(rnn_filepath)
    model.build(tf.TensorShape([1, None]))
    model.summary()

    # get text vectorizer
    if text_vectorization_model is None:
        text_vectorization_filepath = file_path_relative(
            f'{models_folder}/{name}/vectorization')
        text_vectorization_model = tf.keras.models.load_model(
            text_vectorization_filepath)

    # get testing data
    if clean_input_data is None:
        file_path = file_path_relative(
            f'{clean_data_folder}/{clean_input_file}')
        logger.info(f'reading data from {file_path}')
        clean_input_data = pd.read_csv(
            file_path, converters={sentences_key: literal_eval})

    predict_sentences: List[List[str]] = clean_input_data[sentences_key]
    if num_lines_predict is not None:
        predict_sentences = predict_sentences[:num_lines_predict]

    # vectorize testing data
    vectorize_layer: TextVectorization = text_vectorization_model.layers[0]
    vocabulary = vectorize_layer.get_vocabulary()
    # logger.info(f'vocabulary: {vocabulary}')

    # reset model, get ready for predict
    model.reset_states()

    logger.success('[[<words>]] = predicted words:')

    sum_probability_log: float = 0.
    count_all_predict: int = 0

    # iterate over all input sentences
    for i, sentence in enumerate(predict_sentences):
        full_sentence = sentence.copy()
        for _ in range(num_predict):
            vectorized_sentence = flatten_input(
                text_vectorization_model.predict(
                    full_sentence[-window_size:], batch_size=batch_size))
            input_eval = tf.expand_dims(vectorized_sentence, 0)
            predictions = model.predict(input_eval)
            # remove batch dimension, get probabilities of last word
            probabilities = tf.squeeze(predictions, 0)[-1]
            # get the index of the prediction based on the max probability
            predicted_index = np.argmax(probabilities)
            predicted_word = vocabulary[predicted_index]
            full_sentence.append(predicted_word)
            sum_probability_log += np.log(probabilities[predicted_index])
            count_all_predict += 1
        logger.info(
            f"{i + 1}. {' '.join(sentence)} [[{' '.join(full_sentence[len(sentence):])}]]")

    if count_all_predict == 0:
        logger.info('no predictions, no perplexity')
    else:
        total_loss = -1 * sum_probability_log
        perplexity: float = np.exp(total_loss / count_all_predict)
        logger.info(f"perplexity: {perplexity}")
def rnn_train(
        name: str, file_name: Optional[str] = None,
        clean_data: Optional[pd.DataFrame] = None) -> tf.keras.models.Sequential:
    """
    rnn training

    creates the tensorflow rnn model for word prediction
    """
    logger.info(f'run rnn training for {name}')
    if file_name is None and clean_data is None:
        raise ValueError('no file name or tokens provided')

    # get training data
    if clean_data is None:
        file_path = file_path_relative(f'{clean_data_folder}/{file_name}')
        logger.info(f'reading data from {file_path}')
        clean_data = pd.read_csv(file_path,
                                 converters={sentences_key: literal_eval})

    tokens: List[List[str]] = clean_data[sentences_key]
    flattened_tokens: List[str] = flatten_input(tokens)
    dataset_all_tokens = tf.data.Dataset.from_tensor_slices(flattened_tokens)
    logger.success('created all tokens text dataset')

    # get text vectorization model
    text_vectorization_filepath = file_path_relative(
        f'{models_folder}/{name}/{text_vectorization_folder}')
    text_vectorization_model = create_text_vectorization_model(
        text_vectorization_filepath, dataset_all_tokens)
    vectorized_tokens: List[int] = flatten_input(
        text_vectorization_model.predict(flattened_tokens,
                                         batch_size=batch_size))

    # create vectorized dataset
    vectorized_tokens_dataset = tf.data.Dataset.from_tensor_slices(
        vectorized_tokens)

    # create sliding window
    batched_vectorized_tokens = vectorized_tokens_dataset.batch(
        window_size + 1, drop_remainder=True)

    def split_train_test(batch: List[int]):
        input_text = batch[:-1]
        target_text = batch[1:]
        return input_text, target_text

    # create input and target sequences
    training_dataset = batched_vectorized_tokens.map(split_train_test)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(20):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    # buffer size is used to shuffle the dataset
    buffer_size = 10000

    # create batches
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    logger.info(f'training dataset shape: {training_dataset}')

    model = build_model()

    # use sequence loss in training
    def loss(targets, logits):
        """
        return loss for given iteration
        """
        return tfa.seq2seq.sequence_loss(
            logits, targets, tf.ones([batch_size, window_size]))

    # use adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    logger.success('model compiled')

    rnn_filepath = file_path_relative(f'{rnn_folder}/{name}/{rnn_file_name}')

    # save checkpoints to disk
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=rnn_filepath, save_weights_only=True)

    # create visualizations
    plot_callback = PlotTrain(name)

    history = model.fit(training_dataset, epochs=epochs,
                        callbacks=[checkpoint_callback, plot_callback])
    model.summary()

    last_loss = plot_callback.losses[-1]
    logger.info(f'model loss: {last_loss}')

    return text_vectorization_model
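# Hedged usage sketch: reuse the text-vectorization model returned by rnn_train
# so rnn_predict_next shares the same vocabulary; the file names are hypothetical.
vectorizer = rnn_train(name='shakespeare_rnn',
                       file_name='shakespeare_clean.csv')
rnn_predict_next(
    name='shakespeare_rnn',
    text_vectorization_model=vectorizer,
    clean_input_file='shakespeare_test.csv',
    num_lines_predict=10,
    num_predict=1)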
def main() -> None:
    """
    main entry point for program
    """
    strategy = initialize()

    dataset_size: int = 30000
    input_tensor_train, target_tensor_train, input_language, target_language, \
        max_length_target, max_length_input, input_vals, target_vals = read_data(
            dataset_size)

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64 * strategy.num_replicas_in_sync
    EPOCHS = 15
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_input_size = len(input_language.word_index) + 1
    vocab_target_size = len(target_language.word_index) + 1

    model_name: str = 'model_1'
    checkpoint_dir = file_path_relative(f'{model_folder}/{model_name}')
    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam()
        encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
        decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)
        checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                         encoder=encoder,
                                         decoder=decoder)

    run_train(input_tensor_train, target_tensor_train, target_language,
              checkpoint, checkpoint_dir, encoder, optimizer, decoder,
              steps_per_epoch, BUFFER_SIZE, BATCH_SIZE, EPOCHS, model_name)

    # restoring the latest checkpoint in checkpoint_dir
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # run tests and get score
    run_tests(max_length_target, max_length_input, input_language,
              target_language, units, encoder, decoder, input_vals,
              target_vals, model_name)

    # second model
    embedding_dim = 512
    units = 2048
    model_name: str = 'model_2'
    checkpoint_dir = file_path_relative(f'{model_folder}/{model_name}')
    with strategy.scope():
        optimizer_2 = tf.keras.optimizers.Adam()
        encoder_2 = Encoder(vocab_input_size, embedding_dim,
                            units, BATCH_SIZE, gru=True)
        decoder_2 = Decoder(vocab_target_size, embedding_dim,
                            units, BATCH_SIZE, gru=True)
        checkpoint_2 = tf.train.Checkpoint(optimizer=optimizer_2,
                                           encoder=encoder_2,
                                           decoder=decoder_2)

    run_train(input_tensor_train, target_tensor_train, target_language,
              checkpoint_2, checkpoint_dir, encoder_2, optimizer_2, decoder_2,
              steps_per_epoch, BUFFER_SIZE, BATCH_SIZE, EPOCHS, model_name)

    # restoring the latest checkpoint in checkpoint_dir
    checkpoint_2.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # run tests and get score
    run_tests(max_length_target, max_length_input, input_language,
              target_language, units, encoder_2, decoder_2, input_vals,
              target_vals, model_name)
def cnn_train(name: str,
              clean_data: pd.DataFrame) -> tf.keras.models.Sequential:
    """
    cnn training

    creates the tensorflow cnn model for paragraph classification
    """
    logger.info(f'run cnn training for {name}')

    all_paragraphs: List[List[str]] = clean_data[paragraph_key]
    all_sentences: List[str] = flatten_input(all_paragraphs)
    all_tokens: List[str] = flatten_input(
        [get_tokens(sentence) for sentence in all_sentences])
    dataset_all_tokens = tf.data.Dataset.from_tensor_slices(all_tokens)
    logger.success('created all tokens text dataset')

    # get text vectorization model
    text_vectorization_filepath = file_path_relative(
        f'{text_vectorization_folder}/{name}')
    text_vectorization_model = create_text_vectorization_model(
        text_vectorization_filepath, dataset_all_tokens)

    logger.info('get vectorized tokens')
    vectorized_paragraphs_file = file_path_relative(
        f'{clean_data_folder}/documents_vectorized.yml')
    vectorized_paragraphs: Optional[List[List[int]]] = None
    if exists(vectorized_paragraphs_file):
        logger.info('found vectorized paragraphs file')
        with open(vectorized_paragraphs_file, 'r') as yaml_file:
            vectorized_paragraphs = yaml.load(yaml_file,
                                              Loader=yaml.FullLoader)
    else:
        vectorized_paragraphs = [
            flatten_input(
                text_vectorization_model.predict(
                    get_tokens(' '.join(paragraph))))
            for paragraph in all_paragraphs
        ]
        with open(vectorized_paragraphs_file, 'w') as yaml_file:
            yaml.dump(vectorized_paragraphs, yaml_file)

    # labels: List[int] = np.vstack(clean_data[class_key].to_numpy())
    labels: np.array = clean_data[class_key].to_numpy()
    logger.info(f'labels shape: {labels.shape}')

    # create dataset
    length_vectorized_list = len(max(vectorized_paragraphs, key=len))
    vectorized_tokens_rectangular = [
        pad_zeros(paragraph, length_vectorized_list)
        for paragraph in vectorized_paragraphs
    ]
    complete_dataset = tf.data.Dataset.from_tensor_slices(
        (vectorized_tokens_rectangular, labels))
    logger.info('created complete dataset')

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    training_dataset = complete_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    logger.info('batched dataset')

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")
    logger.info(f'training dataset shape: {training_dataset}')

    model = build_cnn_model(length_vectorized_list)
    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy'])
    logger.success('model compiled')

    cnn_filepath = file_path_relative(f'{cnn_folder}/{name}/{cnn_file_name}')

    # save checkpoints to disk
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=cnn_filepath, save_weights_only=True)

    _history = model.fit(training_dataset, epochs=epochs,
                         callbacks=[checkpoint_callback])
    model.summary()

    return text_vectorization_model
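# Hedged usage sketch: cnn_train consumes the paragraph DataFrame produced by
# the book-cleaning step defined below; the model name is a placeholder.
book_data, book_labels = clean()
cnn_vectorizer = cnn_train('book_classifier', book_data)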
def clean(
    clean_data_basename: Optional[str] = default_file_name
) -> Tuple[pd.DataFrame, List[BookType]]:
    """
    data cleaning
    """
    class_count: int = 0
    label_list: List[BookType] = []

    get_from_disk = clean_data_basename is not None
    if not get_from_disk:
        clean_data_basename = default_file_name
    clean_data_path = file_path_relative(clean_data_basename)
    classes_path = file_path_relative(classes_file_name)

    if get_from_disk and exists(clean_data_path) and exists(classes_path):
        logger.info(f'reading data from {clean_data_path}')
        data = pd.read_csv(clean_data_path,
                           converters={paragraph_key: literal_eval})
        label_list_enum: Optional[List[BookType]] = None
        with open(classes_path) as classes_file:
            label_list = yaml.load(classes_file, Loader=yaml.FullLoader)
            label_list_enum = [BookType(elem) for elem in label_list]
        return data, label_list_enum

    data: pd.DataFrame = pd.DataFrame()

    # preprocess data and construct examples
    found_files: bool = False
    for file_path in get_glob(f'{part_1_data_folder}/*.txt'):
        found_files = True
        file_name: str = basename(splitext(file_path)[0])
        logger.info(f'processing {file_name}')

        title: Optional[str] = None
        book_key: Optional[BookType] = None
        book_started: bool = False
        paragraphs: List[List[str]] = []
        num_newline_count: int = 0
        line_number: int = 0

        with open(file_path, 'r') as current_file:
            while True:
                line = current_file.readline()
                line_number += 1
                line_trim: Optional[str] = None
                if line:
                    line_trim = line.strip()

                if not book_started and \
                    ((line_trim is not None and line_trim.startswith(start_book))
                     or (book_key is not None
                         and line_number >= start_end_map[book_key].start)):
                    book_started = True

                if line_trim is None or line_trim.startswith(end_book) \
                    or line_trim == the_end or \
                        (book_key is not None
                         and line_number >= start_end_map[book_key].end):
                    # done with reading the file
                    break

                if not book_started:
                    if title is None and line_trim.startswith(title_split):
                        title = line_trim.split(title_split)[1]
                        logger.info(f'title: {title}')
                    if book_key is None and line_trim.startswith(author_split):
                        author: str = line_trim.split(author_split)[1]
                        logger.info(f'author: {author}')
                        book_key = BookType(author.split(' ')[-1])
                else:
                    if len(line_trim) < min_line_len or \
                            line.startswith(chapter):
                        num_newline_count += 1
                    else:
                        # quotes spanning multiple lines stay in one paragraph
                        multi_line_quotes = len(paragraphs) > 0 \
                            and line_trim.startswith(multi_quote_identifier) \
                            and paragraphs[-1][0].startswith(multi_quote_identifier)
                        if len(paragraphs) == 0 or \
                                (num_newline_count > 0 and not multi_line_quotes):
                            paragraphs.append([])
                            num_newline_count = 0
                        paragraphs[-1].append(line_trim)

        if book_key is None:
            raise RuntimeError('no book key found')

        class_name = class_map[book_key]
        logger.info(
            f'number of paragraphs in class "{class_name}": {len(paragraphs)}')
        paragraphs = [[normalize_sentence(sentence) for sentence in paragraph]
                      for paragraph in paragraphs]
        data = pd.concat([
            data,
            pd.DataFrame({
                paragraph_key: paragraphs,
                label_key: [class_name] * len(paragraphs),
                class_key: class_count
            })
        ], ignore_index=True)
        label_list.append(book_key)
        class_count += 1

    if not found_files:
        raise RuntimeError('no files found')

    data.to_csv(clean_data_path, index=False)
    with open(classes_path, 'w') as classes_file:
        label_list_str = [elem.name for elem in label_list]
        yaml.dump(label_list_str, classes_file)

    return data, label_list
def clean(
    clean_data_basename: Optional[str] = default_file_name
) -> pd.DataFrame:
    """
    data cleaning
    """
    get_from_disk = clean_data_basename is not None
    if not get_from_disk:
        clean_data_basename = default_file_name
    clean_data_path = file_path_relative(clean_data_basename)

    if get_from_disk and exists(clean_data_path):
        logger.info(f'reading data from {clean_data_path}')
        data = pd.read_csv(clean_data_path)
        return data

    data: pd.DataFrame = pd.DataFrame()

    # iterate over the files
    for class_val, file_path in enumerate([
        file_path_relative(f'{part_2_data_folder}/negative.review'),
        file_path_relative(f'{part_2_data_folder}/positive.review')
    ]):
        root: Optional[etree._Element] = None
        with open(file_path, 'rb') as current_file:
            parser = etree.XMLParser(recover=True)
            # parse the xml
            root = etree.fromstring(
                f'<?xml version="1.0"?><root_elem>{current_file.read()}</root_elem>',
                parser=parser)

        reviews: List[str] = []
        # find all of the review_text tags recursively
        for elem in root.findall('.//review_text'):
            cast_elem: etree._Element = cast(etree._Element, elem)
            decoded_text: str = literal_eval(f"'{cast_elem.text}'")
            trimmed_text = decoded_text.strip()
            reviews.append(trimmed_text)

        class_name: str = reviews_class_map[class_val]
        logger.info(
            f'number of reviews in class "{class_name}": {len(reviews)}')

        # create dataframe
        data = pd.concat([
            data,
            pd.DataFrame({
                review_key: reviews,
                label_key: class_name,
                class_key: class_val
            })
        ], ignore_index=True)

    data.to_csv(clean_data_path, index=False)
    return data
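# Hedged usage sketch: build (or load from disk) the cleaned review DataFrame;
# each row holds the review text, a readable label, and an integer class id.
review_data = clean()
logger.info(f'cleaned reviews:\n{review_data.head()}')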