Example #1
def n_grams_train(name: str, file_name: Optional[str] = None,
                  clean_data: Optional[pd.DataFrame] = None,
                  n_grams: int = default_n_grams,
                  fill_in_blank: bool = False) -> NGramsModel:
    """
    n-grams training
    get a dictionary of grams to a dictionary of subsequent words and their counts
    """
    if file_name is None and clean_data is None:
        raise ValueError('no file name or tokens provided')

    # get training data
    if clean_data is None:
        file_path = file_path_relative(f'{clean_data_folder}/{file_name}')
        logger.info(f'reading data from {file_path}')
        clean_data = pd.read_csv(file_path, converters={
            sentences_key: literal_eval})

    tokens: List[List[str]] = clean_data[sentences_key]
    average_sentence_len = np.average([len(sentence) for sentence in tokens])
    if average_sentence_len < n_grams:
        raise ValueError(
            f'n-grams of {n_grams} is greater than average sentence ' +
            f'length of {average_sentence_len} in training data')

    if fill_in_blank and n_grams > 1:
        n_grams -= 1

    # create n-gram model
    n_grams_res = NGramsModel(n_grams)

    # train model with sliding window
    for sentence in tokens:
        for i in range(len(sentence) - n_grams):
            sequence = ' '.join(sentence[i: i + n_grams])
            if sequence not in n_grams_res.model:
                n_grams_res.model[sequence] = NGramsSequence(sequence)

            next_word = sentence[i + n_grams]
            n_grams_res.model[sequence].add_grams(next_word)

    # create aggregate objects
    n_grams_res.generate_aggregates()

    # save to disk
    json_file_path = file_path_relative(f'{models_folder}/{name}.json')
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(n_grams_res, file, ensure_ascii=False,
                  indent=4, sort_keys=True)

    return n_grams_res
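
A minimal, self-contained sketch of the sliding-window counting the training loop above performs, using plain dicts in place of the project's NGramsModel/NGramsSequence classes (which are assumed to do the same bookkeeping):

from collections import Counter, defaultdict
from typing import Dict, List

def count_n_grams(sentences: List[List[str]], n_grams: int) -> Dict[str, Counter]:
    """Map each n-gram prefix to a Counter of the words that follow it."""
    model: Dict[str, Counter] = defaultdict(Counter)
    for sentence in sentences:
        for i in range(len(sentence) - n_grams):
            sequence = ' '.join(sentence[i: i + n_grams])
            model[sequence][sentence[i + n_grams]] += 1
    return model

print(count_n_grams([['the', 'cat', 'sat', 'on', 'the', 'mat']], 2)['the cat'])
# Counter({'sat': 1})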
Example #2
def read_data_attention(
    strategy: tf.distribute.TPUStrategy,
    max_len: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, tf.data.Dataset,
           tf.data.Dataset, tf.data.Dataset, int]:
    """
    read data from attention models
    """
    logger.info('reading data for attention models')

    # scale the batch size by the number of TPU replicas
    batch_size = 16 * strategy.num_replicas_in_sync
    auto = tf.data.experimental.AUTOTUNE

    # First load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')

    train = pd.read_csv(file_path_relative('jigsaw-toxic-comment-train.csv'))
    valid = pd.read_csv(file_path_relative('validation.csv'))
    test = pd.read_csv(file_path_relative('test.csv'))

    x_train = _run_encode(train['comment_text'].astype(str),
                          tokenizer,
                          maxlen=max_len)
    x_valid = _run_encode(valid['comment_text'].astype(str),
                          tokenizer,
                          maxlen=max_len)
    x_test = _run_encode(test['content'].astype(str),
                         tokenizer,
                         maxlen=max_len)

    y_train = train['toxic'].values
    y_valid = valid['toxic'].values

    train_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_train,
         y_train)).repeat().shuffle(2048).batch(batch_size).prefetch(auto))

    valid_dataset = (tf.data.Dataset.from_tensor_slices(
        (x_valid, y_valid)).batch(batch_size).cache().prefetch(auto))

    test_dataset = (
        tf.data.Dataset.from_tensor_slices(x_test).batch(batch_size))

    # return all datasets
    return x_train, x_valid, y_train, y_valid, train_dataset, valid_dataset, \
        test_dataset, batch_size
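
A minimal sketch of the same repeat/shuffle/batch/prefetch pipeline on dummy arrays, in case the dataset wiring is the part of interest (the shapes and batch size below are placeholders, not the values used above):

import numpy as np
import tensorflow as tf

x = np.random.randint(0, 1000, size=(256, 32))  # stand-in for encoded comments
y = np.random.randint(0, 2, size=(256,))        # stand-in for toxicity labels

train_ds = (tf.data.Dataset.from_tensor_slices((x, y))
            .repeat()       # loop forever; pair with steps_per_epoch in fit()
            .shuffle(2048)  # shuffle within a 2048-element buffer
            .batch(64)
            .prefetch(tf.data.experimental.AUTOTUNE))

for batch_x, batch_y in train_ds.take(1):
    print(batch_x.shape, batch_y.shape)  # (64, 32) (64,)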
Example #3
def build_embeddings(embedding_size_y: int,
                     word_indexes: Dict[str, int]) -> np.ndarray:
    """
    build embeddings to be used with basic models
    """
    logger.info('build glove embeddings')

    embeddings_indexes: Dict[str, np.array] = {}
    # open glove 840 file
    with open(file_path_relative(f'glove.840B.{embedding_size_y}d.txt',
                                 base_folder=default_base_folder
                                 if not IN_KAGGLE else 'glove840b300dtxt'),
              encoding='utf-8') as glove_file:
        for line in tqdm(glove_file):
            words = line.split(' ')
            word = words[0]
            coefficients = np.asarray([float(val) for val in words[1:]])
            embeddings_indexes[word] = coefficients

    logger.info(f'Found {len(embeddings_indexes)} word vectors.')

    embedding_size_x: int = len(word_indexes) + 1

    embeddings_output = np.zeros((embedding_size_x, embedding_size_y))
    for word, i in tqdm(word_indexes.items()):
        word_embedding = embeddings_indexes.get(word)
        if word_embedding is not None:
            embeddings_output[i] = word_embedding

    return embeddings_output
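
A hedged usage sketch showing how the returned matrix would typically feed a Keras Embedding layer; word_indexes is assumed to come from a fitted Tokenizer (as in Example #4) and the layer itself is illustrative, not part of the project code:

import tensorflow as tf

# assumes word_indexes: Dict[str, int] is available from the tokenizer
embedding_matrix = build_embeddings(300, word_indexes)

embedding_layer = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],   # len(word_indexes) + 1
    output_dim=embedding_matrix.shape[1],  # 300 for glove.840B.300d
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)                       # keep the pretrained vectors frozen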
Example #4
def read_data(
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Dict[str, int]]:
    """
    read data from raw data, convert to dataframes
    """
    logger.info('reading data')

    train = pd.read_csv(file_path_relative('jigsaw-toxic-comment-train.csv'))

    # drop unused columns
    train.drop(
        ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        axis=1,
        inplace=True)

    # only use first n rows
    train = train.loc[:NUM_ROWS_TRAIN, :]
    logger.info(f'shape of training data: {train.shape}')

    max_len = train['comment_text'].apply(lambda x: len(str(x).split())).max()
    logger.info(f'max len: {max_len}')

    # train test split
    x_train, x_valid, y_train, y_valid = train_test_split(
        train['comment_text'].values,
        train['toxic'].values,
        stratify=train['toxic'].values,
        test_size=TEST_RATIO,
        shuffle=True)

    tokens = tf.keras.preprocessing.text.Tokenizer(num_words=None)

    all_data: List[str] = list(x_train)
    all_data.extend(list(x_valid))
    tokens.fit_on_texts(all_data)
    x_train_sequences = tokens.texts_to_sequences(x_train)
    x_valid_sequences = tokens.texts_to_sequences(x_valid)

    # pad the data with zeros
    x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
        x_train_sequences, maxlen=max_len)
    x_valid_padded = tf.keras.preprocessing.sequence.pad_sequences(
        x_valid_sequences, maxlen=max_len)

    word_indexes = tokens.word_index

    return x_train_padded, x_valid_padded, y_train, y_valid, max_len, word_indexes
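
For reference, a tiny self-contained example of what texts_to_sequences plus pad_sequences produce; note that pad_sequences prepends zeros ("pre" padding) by default:

import tensorflow as tf

tokens = tf.keras.preprocessing.text.Tokenizer(num_words=None)
tokens.fit_on_texts(['the cat sat', 'the cat sat on the mat'])

sequences = tokens.texts_to_sequences(['the cat sat on the mat'])
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=8)
print(padded)  # [[0 0 1 2 3 4 1 5]]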
Example #5
def _plot_attention(attention, sentence, predicted_sentence, name: str):
    """
    plot the attention weights
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    if IN_NOTEBOOK:
        plt.show()
    else:
        file_path = file_path_relative(f'{output_folder}/attention_{name}.jpg')
        plt.savefig(file_path)
Example #6
def _plot_train_val_loss(training_loss: List[float], model_name: str) -> None:
    """
    plots the training and validation loss given history
    """

    plt.figure()

    num_epochs = len(training_loss)
    nums = range(1, num_epochs + 1)

    plt.plot(nums, training_loss, label="train")
    plt.title(f"{model_name} Training and Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    if IN_NOTEBOOK:
        plt.show()
    else:
        file_path = file_path_relative(
            f'{output_folder}/{model_name}.jpg')
        plt.savefig(file_path)
Example #7
    def on_epoch_end(self, epoch, logs=None):
        """
        runs on end of each epoch
        """
        if logs is None:
            return
        self.logs.append(logs)
        self.losses.append(logs.get('loss'))
        self.accuracy.append(logs.get('accuracy'))

        if len(self.losses) > 1:
            nums = np.arange(0, len(self.losses))
            plt.style.use("seaborn")

            plt.figure()
            plt.plot(nums, self.losses, label="train loss")
            plt.plot(nums, self.accuracy, label="train accuracy")
            plt.title(f"Training Loss and Accuracy for Epoch {epoch}")
            plt.xlabel("Epoch #")
            plt.ylabel("Loss & Accuracy")
            plt.legend()
            file_path = file_path_relative(
                f'{output_folder}/rnn_{self.name}_train_epoch_{epoch}.png')
            plt.savefig(file_path)
Example #8
def n_grams_predict_next(name: str,
                         file_name: Optional[str] = None,
                         model: Optional[NGramsModel] = None,
                         clean_input_file: Optional[str] = None,
                         clean_input_data: Optional[pd.DataFrame] = None,
                         num_lines_predict: Optional[int] = None,
                         n_grams: int = default_n_grams,
                         num_predict: int = 1,
                         smoothing: SmoothingType = SmoothingType.basic) -> None:
    """
    predict the next word(s) in the set
    """

    logger.success(f'predicting with {smoothing.name} for {name}')

    if file_name is None and model is None:
        raise ValueError('no file name or model provided')

    if clean_input_file is None and clean_input_data is None:
        raise ValueError('no input file name or data provided')

    # create n-gram model if not provided
    if model is None:
        json_file_path = file_path_relative(f'{models_folder}/{file_name}')
        logger.info(f'reading data from {json_file_path}')
        with open(json_file_path, 'r') as file:
            model = NGramsModel.from_json(json.load(file))

    # get testing data
    if clean_input_data is None:
        file_path = file_path_relative(
            f'{clean_data_folder}/{clean_input_file}')
        logger.info(f'reading data from {file_path}')
        clean_input_data = pd.read_csv(file_path, converters={
            sentences_key: literal_eval})

    predict_sentences: List[List[str]] = clean_input_data[sentences_key]
    if num_lines_predict is not None:
        predict_sentences = predict_sentences[:num_lines_predict]

    check_probability_smoothing: List[SmoothingType] = [SmoothingType.basic]

    logger.success('[[<words>]] = predicted words:')

    sum_probability_log: float = 0.
    count_all_predict: int = 0

    # iterate over testing data
    for i, sentence in enumerate(predict_sentences):
        full_sentence = sentence.copy()
        for _ in range(num_predict):
            last_words = full_sentence[-n_grams:]
            sequence = ' '.join(last_words)

            probabilities = model.get_probabilities(
                sequence, smoothing)
            sum_probability = sum(elem[1] for elem in probabilities)
            # logger.info(f'probabilities: sum: {sum_probability}, all: {probabilities}')
            if smoothing in check_probability_smoothing:
                # for smoothing types without unseen outputs, check that
                # the probability distribution sums to approximately 1
                assert np.isclose(
                    sum_probability, 1), f'probability of {sum_probability} is not close to 1'

            current_output, prob = probabilities[0]
            full_sentence.append(current_output)
            # if not unseen, add to perplexity calculation
            if current_output != unseen_output:
                sum_probability_log += np.log(prob)
                count_all_predict += 1

        logger.info(
            f"{i + 1}. {' '.join(sentence)} [[{' '.join(full_sentence[len(sentence):])}]]")

    if count_all_predict == 0:
        logger.info('no predictions, no perplexity')
    else:
        total_loss = -1 * sum_probability_log
        perplexity: float = np.exp(total_loss / count_all_predict)
        logger.info(f"perplexity: {perplexity}")
Example #9
def rnn_predict_next(
        name: str,
        text_vectorization_model: Optional[tf.keras.models.Sequential] = None,
        clean_input_file: Optional[str] = None,
        clean_input_data: Optional[pd.DataFrame] = None,
        num_lines_predict: Optional[int] = None,
        num_predict: int = 1) -> None:
    """
    predict next word(s) with given input
    """

    logger.success(f'running predictions for {name}')

    if clean_input_file is None and clean_input_data is None:
        raise ValueError('no input file name or data provided')

    # create model from disk
    model = build_model(1)
    rnn_filepath = file_path_relative(f'{rnn_folder}/{name}/{rnn_file_name}')
    model.load_weights(rnn_filepath)
    model.build(tf.TensorShape([1, None]))
    model.summary()

    # get text vectorizer
    if text_vectorization_model is None:
        text_vectorization_filepath = file_path_relative(
            f'{models_folder}/{name}/vectorization')
        text_vectorization_model = tf.keras.models.load_model(
            text_vectorization_filepath)

    # get testing data
    if clean_input_data is None:
        file_path = file_path_relative(
            f'{clean_data_folder}/{clean_input_file}')
        logger.info(f'reading data from {file_path}')
        clean_input_data = pd.read_csv(
            file_path, converters={sentences_key: literal_eval})

    predict_sentences: List[List[str]] = clean_input_data[sentences_key]
    if num_lines_predict is not None:
        predict_sentences = predict_sentences[:num_lines_predict]

    # vectorize testing data
    vectorize_layer: TextVectorization = text_vectorization_model.layers[0]
    vocabulary = vectorize_layer.get_vocabulary()
    # logger.info(f'vocabulary: {vocabulary}')

    # reset model, get ready for predict
    model.reset_states()

    logger.success('[[<words>]] = predicted words:')

    sum_probability_log: float = 0.
    count_all_predict: int = 0

    # iterate over all input sentences
    for i, sentence in enumerate(predict_sentences):
        full_sentence = sentence.copy()
        for _ in range(num_predict):
            vectorized_sentence = flatten_input(
                text_vectorization_model.predict(full_sentence[-window_size:],
                                                 batch_size=batch_size))
            input_eval = tf.expand_dims(vectorized_sentence, 0)
            predictions = model.predict(input_eval)
            # remove batch dimension, get probabilities of last word
            probabilities = tf.squeeze(predictions, 0)[-1]

            # get the index of the prediction based on the max probability
            predicted_index = np.argmax(probabilities)

            predicted_word = vocabulary[predicted_index]
            full_sentence.append(predicted_word)

            sum_probability_log += np.log(probabilities[predicted_index])
            count_all_predict += 1

        logger.info(
            f"{i + 1}. {' '.join(sentence)} [[{' '.join(full_sentence[len(sentence):])}]]"
        )

    if count_all_predict == 0:
        logger.info('no predictions, no perplexity')
    else:
        total_loss = -1 * sum_probability_log
        perplexity: float = np.exp(total_loss / count_all_predict)
        logger.info(f"perplexity: {perplexity}")
Example #10
def rnn_train(
        name: str,
        file_name: Optional[str] = None,
        clean_data: Optional[pd.DataFrame] = None
) -> tf.keras.models.Sequential:
    """
    rnn training
    creates the tensorflow rnn model for word prediction
    """
    logger.info(f'run rnn training for {name}')

    if file_name is None and clean_data is None:
        raise ValueError('no file name or tokens provided')

    # get training data
    if clean_data is None:
        file_path = file_path_relative(f'{clean_data_folder}/{file_name}')
        logger.info(f'reading data from {file_path}')
        clean_data = pd.read_csv(file_path,
                                 converters={sentences_key: literal_eval})

    tokens: List[List[str]] = clean_data[sentences_key]
    flattened_tokens: List[str] = flatten_input(tokens)
    dataset_all_tokens = tf.data.Dataset.from_tensor_slices(flattened_tokens)
    logger.success('created all tokens text dataset')

    # get text vectorization model
    text_vectorization_filepath = file_path_relative(
        f'{models_folder}/{name}/{text_vectorization_folder}')

    text_vectorization_model = create_text_vectorization_model(
        text_vectorization_filepath, dataset_all_tokens)
    vectorized_tokens: List[int] = flatten_input(
        text_vectorization_model.predict(flattened_tokens,
                                         batch_size=batch_size))

    # create vectorized dataset
    vectorized_tokens_dataset = tf.data.Dataset.from_tensor_slices(
        vectorized_tokens)
    # create sliding window
    batched_vectorized_tokens = vectorized_tokens_dataset.batch(
        window_size + 1, drop_remainder=True)

    def split_train_test(batch: List[int]):
        input_text = batch[:-1]
        target_text = batch[1:]
        return input_text, target_text

    # create train and test
    training_dataset = batched_vectorized_tokens.map(split_train_test)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(20):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    # create batches
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    logger.info(f'training dataset shape: {training_dataset}')

    model = build_model()

    # use sequence loss in training
    def loss(targets, logits):
        """
        return loss for given iteration
        """
        return tfa.seq2seq.sequence_loss(logits, targets,
                                         tf.ones([batch_size, window_size]))

    # use adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    logger.success('model compiled')

    rnn_filepath = file_path_relative(f'{rnn_folder}/{name}/{rnn_file_name}')

    # save checkpoints to disk
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=rnn_filepath, save_weights_only=True)

    # create visualizations
    plot_callback = PlotTrain(name)
    history = model.fit(training_dataset,
                        epochs=epochs,
                        callbacks=[checkpoint_callback, plot_callback])
    model.summary()
    last_loss = plot_callback.losses[-1]
    logger.info(f'model loss: {last_loss}')
    return text_vectorization_model
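
To make the sliding-window step above concrete, here is a small stand-alone sketch of batching a token stream into (input, target) pairs where the target is the input shifted by one (the window_size of 4 is just for illustration):

import tensorflow as tf

window_size = 4
token_ids = list(range(10))  # stand-in for the vectorized token stream

windows = tf.data.Dataset.from_tensor_slices(token_ids).batch(
    window_size + 1, drop_remainder=True)

def split_train_test(batch):
    return batch[:-1], batch[1:]

for inp, target in windows.map(split_train_test):
    print(inp.numpy(), target.numpy())
# [0 1 2 3] [1 2 3 4]
# [5 6 7 8] [6 7 8 9]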
Example #11
def main() -> None:
    """
    main entry point for program
    """
    strategy = initialize()

    dataset_size: int = 30000
    (input_tensor_train, target_tensor_train, input_language, target_language,
     max_length_target, max_length_input, input_vals,
     target_vals) = read_data(dataset_size)

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64 * strategy.num_replicas_in_sync
    EPOCHS = 15
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_input_size = len(input_language.word_index) + 1
    vocab_target_size = len(target_language.word_index) + 1

    model_name: str = 'model_1'

    checkpoint_dir = file_path_relative(f'{model_folder}/{model_name}')

    with strategy.scope():
        optimizer = tf.keras.optimizers.Adam()
        encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
        decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)

        checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                         encoder=encoder,
                                         decoder=decoder)

        run_train(input_tensor_train, target_tensor_train, target_language,
                  checkpoint, checkpoint_dir, encoder, optimizer, decoder,
                  steps_per_epoch, BUFFER_SIZE, BATCH_SIZE, EPOCHS, model_name)

    # restoring the latest checkpoint in checkpoint_dir
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # run tests and get score
    run_tests(max_length_target, max_length_input, input_language,
              target_language, units, encoder, decoder, input_vals,
              target_vals, model_name)

    # second model
    embedding_dim = 512
    units = 2048

    model_name: str = 'model_2'

    checkpoint_dir = file_path_relative(f'{model_folder}/{model_name}')

    with strategy.scope():
        optimizer_2 = tf.keras.optimizers.Adam()
        encoder_2 = Encoder(vocab_input_size,
                            embedding_dim,
                            units,
                            BATCH_SIZE,
                            gru=True)
        decoder_2 = Decoder(vocab_target_size,
                            embedding_dim,
                            units,
                            BATCH_SIZE,
                            gru=True)

        checkpoint_2 = tf.train.Checkpoint(optimizer=optimizer_2,
                                           encoder=encoder_2,
                                           decoder=decoder_2)

        run_train(input_tensor_train, target_tensor_train, target_language,
                  checkpoint_2, checkpoint_dir, encoder_2, optimizer_2,
                  decoder_2, steps_per_epoch, BUFFER_SIZE, BATCH_SIZE, EPOCHS,
                  model_name)

    # restoring the latest checkpoint in checkpoint_dir
    checkpoint_2.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # run tests and get score
    run_tests(max_length_target, max_length_input, input_language,
              target_language, units, encoder_2, decoder_2, input_vals,
              target_vals, model_name)
Example #12
def cnn_train(name: str,
              clean_data: pd.DataFrame) -> tf.keras.models.Sequential:
    """
    cnn training
    creates the tensorflow cnn model for word prediction
    """
    logger.info(f'run cnn training for {name}')

    all_paragraphs: List[List[str]] = clean_data[paragraph_key]
    all_sentences: List[str] = flatten_input(all_paragraphs)
    all_tokens: List[str] = flatten_input(
        [get_tokens(sentence) for sentence in all_sentences])
    dataset_all_tokens = tf.data.Dataset.from_tensor_slices(all_tokens)
    logger.success('created all tokens text dataset')

    # get text vectorization model
    text_vectorization_filepath = file_path_relative(
        f'{text_vectorization_folder}/{name}')

    text_vectorization_model = create_text_vectorization_model(
        text_vectorization_filepath, dataset_all_tokens)

    logger.info('get vectorized tokens')
    vectorized_paragraphs_file = file_path_relative(
        f'{clean_data_folder}/documents_vectorized.yml')
    vectorized_paragraphs: Optional[List[List[int]]] = None
    if exists(vectorized_paragraphs_file):
        logger.info('found vectorized paragraphs file')
        with open(vectorized_paragraphs_file, 'r') as yaml_file:
            vectorized_paragraphs = yaml.load(yaml_file,
                                              Loader=yaml.FullLoader)
    else:
        vectorized_paragraphs = [
            flatten_input(
                text_vectorization_model.predict(
                    get_tokens(' '.join(paragraph))))
            for paragraph in all_paragraphs
        ]
        with open(vectorized_paragraphs_file, 'w') as yaml_file:
            yaml.dump(vectorized_paragraphs, yaml_file)

    # labels: List[int] = np.vstack(clean_data[class_key].to_numpy())
    labels: np.ndarray = clean_data[class_key].to_numpy()
    logger.info(f'labels shape: {labels.shape}')

    # create dataset
    length_vectorized_list = len(max(vectorized_paragraphs, key=len))
    vectorized_tokens_rectangular = [
        pad_zeros(paragraph, length_vectorized_list)
        for paragraph in vectorized_paragraphs
    ]
    complete_dataset = tf.data.Dataset.from_tensor_slices(
        (vectorized_tokens_rectangular, labels))
    logger.info('created complete dataset')

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    training_dataset = complete_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    logger.info('batched dataset')

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    logger.info(f'training dataset shape: {training_dataset}')

    model = build_cnn_model(length_vectorized_list)

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy'])
    logger.success('model compiled')

    cnn_filepath = file_path_relative(f'{cnn_folder}/{name}/{cnn_file_name}')

    # save checkpoints to disk
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=cnn_filepath, save_weights_only=True)

    # create visualizations
    _history = model.fit(training_dataset,
                         epochs=epochs,
                         callbacks=[checkpoint_callback])
    model.summary()
    return text_vectorization_model
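
pad_zeros is a project helper; a plausible stand-in (hypothetical, the real implementation may differ) that shows how the ragged paragraph vectors become a rectangular array for from_tensor_slices:

from typing import List

def pad_zeros(tokens: List[int], target_len: int) -> List[int]:
    """Right-pad a token-id list with zeros to a fixed length."""
    return tokens + [0] * (target_len - len(tokens))

paragraphs = [[4, 8, 15], [16, 23], [42]]
max_len = len(max(paragraphs, key=len))
print([pad_zeros(p, max_len) for p in paragraphs])
# [[4, 8, 15], [16, 23, 0], [42, 0, 0]]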
Example #13
def clean(
    clean_data_basename: Optional[str] = default_file_name
) -> Tuple[pd.DataFrame, List[BookType]]:
    """
    data cleaning
    """
    class_count: int = 0
    label_list: List[BookType] = []

    get_from_disk = clean_data_basename is not None

    if not get_from_disk:
        clean_data_basename = default_file_name

    clean_data_path = file_path_relative(clean_data_basename)
    classes_path = file_path_relative(classes_file_name)

    if get_from_disk and exists(clean_data_path) and exists(classes_path):
        logger.info(f'reading data from {clean_data_path}')
        data = pd.read_csv(clean_data_path,
                           converters={paragraph_key: literal_eval})
        label_list_enum: Optional[List[BookType]] = None
        with open(classes_path) as classes_file:
            label_list = yaml.load(classes_file, Loader=yaml.FullLoader)
            label_list_enum = [BookType(elem) for elem in label_list]
        return data, label_list_enum

    data: pd.DataFrame = pd.DataFrame()

    # preprocess data and construct examples
    found_files: bool = False
    for file_path in get_glob(f'{part_1_data_folder}/*.txt'):
        found_files = True
        file_name: str = basename(splitext(file_path)[0])
        logger.info(f'processing {file_name}')
        title: Optional[str] = None
        book_key: Optional[BookType] = None
        book_started: bool = False
        paragraphs: List[List[str]] = []
        num_newline_count: int = 0
        line_number: int = 0
        with open(file_path, 'r') as current_file:
            while True:
                line = current_file.readline()
                line_number += 1
                line_trim: Optional[str] = None
                if line:
                    line_trim = line.strip()
                if not book_started and \
                    ((line_trim is not None and line_trim.startswith(start_book))
                     or (book_key is not None and line_number >= start_end_map[book_key].start)):
                    book_started = True
                if line_trim is None or line_trim.startswith(end_book) \
                        or line_trim == the_end or \
                        (book_key is not None and line_number >= start_end_map[book_key].end):
                    # done with reading the file
                    break
                if not book_started:
                    if title is None and line_trim.startswith(title_split):
                        title = line_trim.split(title_split)[1]
                        logger.info(f'title: {title}')
                    if book_key is None and line_trim.startswith(author_split):
                        author: str = line_trim.split(author_split)[1]
                        logger.info(f'author: {author}')
                        book_key = BookType(author.split(' ')[-1])
                else:
                    if len(line_trim) < min_line_len or \
                            line.startswith(chapter):
                        num_newline_count += 1
                    else:
                        multi_line_quotes = len(paragraphs) > 0 \
                            and line_trim.startswith(multi_quote_identifier) \
                            and paragraphs[-1][0].startswith(multi_quote_identifier)
                        if len(paragraphs) == 0 or \
                                (num_newline_count > 0 and not multi_line_quotes):
                            paragraphs.append([])
                        num_newline_count = 0
                        paragraphs[-1].append(line_trim)
        if book_key is None:
            raise RuntimeError('no book key found')
        class_name = class_map[book_key]
        logger.info(
            f'number of paragraphs in class "{class_name}": {len(paragraphs)}')
        paragraphs = [[normalize_sentence(sentence) for sentence in paragraph]
                      for paragraph in paragraphs]
        data = pd.concat([
            data,
            pd.DataFrame({
                paragraph_key: paragraphs,
                label_key: [class_name] * len(paragraphs),
                class_key: class_count
            })
        ],
                         ignore_index=True)
        label_list.append(book_key)
        class_count += 1

    if not found_files:
        raise RuntimeError('no files found')

    data.to_csv(clean_data_path, index=False)
    with open(classes_path, 'w') as classes_file:
        label_list_str = [elem.name for elem in label_list]
        yaml.dump(label_list_str, classes_file)

    return data, label_list
Example #14
def clean(
        clean_data_basename: Optional[str] = default_file_name
) -> pd.DataFrame:
    """
    data cleaning
    """

    get_from_disk = clean_data_basename is not None

    if not get_from_disk:
        clean_data_basename = default_file_name

    clean_data_path = file_path_relative(clean_data_basename)

    if get_from_disk and exists(clean_data_path):
        logger.info(f'reading data from {clean_data_path}')
        data = pd.read_csv(clean_data_path)
        return data

    data: pd.DataFrame = pd.DataFrame()

    # iterate over the files
    for class_val, file_path in enumerate([
            file_path_relative(f'{part_2_data_folder}/negative.review'),
            file_path_relative(f'{part_2_data_folder}/positive.review')
    ]):
        root: Optional[etree._Element] = None
        with open(file_path, 'rb') as current_file:
            parser = etree.XMLParser(recover=True)
            # parse the xml
            root = etree.fromstring(
                f'<?xml version="1.0"?><root_elem>{current_file.read()}</root_elem>',
                parser=parser)

        reviews: List[str] = []

        # find all of the review_text tags recursively
        for elem in root.findall('.//review_text'):
            cast_elem: etree._Element = cast(etree._Element, elem)
            decoded_text: str = literal_eval(f"'{cast_elem.text}'")
            trimmed_text = decoded_text.strip()
            reviews.append(trimmed_text)

        class_name: str = reviews_class_map[class_val]

        logger.info(
            f'number of reviews in class "{class_name}": {len(reviews)}')

        # create dataframe
        data = pd.concat([
            data,
            pd.DataFrame({
                review_key: reviews,
                label_key: class_name,
                class_key: class_val
            })
        ],
                         ignore_index=True)

    data.to_csv(clean_data_path, index=False)

    return data
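
The .review files are not a single well-formed XML document, hence the synthetic root_elem wrapper and recover=True; a self-contained sketch of that parsing pattern on toy data (the <review>/<review_text> layout here is illustrative):

from lxml import etree

raw = (b'<review><review_text>Great product, works as advertised.</review_text></review>'
       b'<review><review_text>Stopped working after a week.</review_text></review>')

parser = etree.XMLParser(recover=True)
root = etree.fromstring(b'<?xml version="1.0"?><root_elem>' + raw + b'</root_elem>',
                        parser=parser)

print([elem.text.strip() for elem in root.findall('.//review_text')])
# ['Great product, works as advertised.', 'Stopped working after a week.']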