Example #1
    def predict(self, lyrics: str) -> Optional[Tuple[np.ndarray, np.ndarray]]:
        lyrics_fragments = fragmentize_text(lyrics)
        lyrics_fragments = [
            preprocess(fragment,
                       remove_punctuation=True,
                       remove_text_in_brackets=True)
            for fragment in lyrics_fragments
        ]
        if self._removing_stop_words:
            lyrics_fragments = [
                remove_stop_words(fragment) for fragment in lyrics_fragments
            ]
        if self._lemmatization:
            lyrics_fragments = [
                lemmatize_text(fragment) for fragment in lyrics_fragments
            ]
        remove_empty_fragments(lyrics_fragments)  # mutates the list in place; return value is unused

        if not lyrics_fragments:
            return None
        else:
            x, x_lens = self._get_embeddings(lyrics_fragments)
            result = self(x, x_lens)
            probs = torch.squeeze(result)
            label = probs.argmax(dim=-1, keepdim=True)
            return label.data.numpy(), probs.data.numpy()
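For context, a call site for the fragment-level predict above might look like the following sketch; the `model` instance, its construction, the input lyrics string, and the index-to-quadrant mapping are assumptions, not taken from the snippet.

import torch

# Hypothetical usage sketch; `model` is assumed to be a trained instance of the
# classifier that owns predict(), and the label order is an assumption.
EMOTION_QUADRANTS = ['Q1', 'Q2', 'Q3', 'Q4']

lyrics = 'some raw song lyrics here'
model.eval()
with torch.no_grad():
    prediction = model.predict(lyrics)
if prediction is None:
    print('lyrics were empty after preprocessing')
else:
    label, probs = prediction
    print(EMOTION_QUADRANTS[int(label)], probs)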
Example #2
    @staticmethod  # no self parameter; assumed to be a static method of FragmentizedLyricsDataset
    def _get_fragmentized_lyrics_and_emotion_data(
            song_df: pd.DataFrame, removing_stop_words: bool,
            lemmatization: bool) -> Tuple[np.ndarray, List[List[str]]]:
        emotion_labels = list(song_df['emotion_4Q'].values)
        lyrics_data = song_df['lyrics'].values
        lyrics_data = [fragmentize_text(lyrics) for lyrics in lyrics_data]
        lyrics_data = [[
            preprocess(fragment,
                       remove_punctuation=True,
                       remove_text_in_brackets=True) for fragment in fragments
        ] for fragments in lyrics_data]
        if removing_stop_words:
            lyrics_data = [[
                remove_stop_words(fragment) for fragment in fragments
            ] for fragments in lyrics_data]
        if lemmatization:
            lyrics_data = [[
                lemmatize_text(fragment) for fragment in fragments
            ] for fragments in lyrics_data]
        for fragments in lyrics_data:
            remove_empty_fragments(fragments)
        FragmentizedLyricsDataset._remove_records_without_fragments(
            emotion_labels, lyrics_data)

        emotion_data = label_encoder.transform(emotion_labels)
        return emotion_data, lyrics_data
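The helpers remove_empty_fragments and _remove_records_without_fragments are called above but not shown; a minimal in-place sketch consistent with those call sites (both lists are mutated rather than returned, and the second helper is a static method of FragmentizedLyricsDataset in the snippet) could look like this.

from typing import List


def remove_empty_fragments(fragments: List[str]) -> None:
    # Drop fragments that became empty after preprocessing; mutate in place
    # because callers ignore the return value.
    fragments[:] = [fragment for fragment in fragments if fragment.strip()]


def _remove_records_without_fragments(emotion_labels: List[str],
                                      lyrics_data: List[List[str]]) -> None:
    # Keep labels and lyrics aligned while dropping songs that lost every fragment.
    keep = [bool(fragments) for fragments in lyrics_data]
    emotion_labels[:] = [label for label, kept in zip(emotion_labels, keep) if kept]
    lyrics_data[:] = [fragments for fragments, kept in zip(lyrics_data, keep) if kept]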
Example #3
    def _preprocess_lyrics_in_df(self, song_df: pd.DataFrame,
                                 lemmatization: bool,
                                 removing_stop_words: bool) -> pd.DataFrame:
        song_df['lyrics'] = song_df['lyrics'].apply(
            lambda x: preprocess(x, remove_punctuation=True, remove_text_in_brackets=True))

        if removing_stop_words:
            song_df['lyrics'] = song_df['lyrics'].apply(lambda x: remove_stop_words(x))

        if lemmatization:
            song_df['lyrics'] = song_df['lyrics'].apply(lambda x: lemmatize_text(x))

        song_df = song_df[song_df['lyrics'] != '']
        return song_df
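Note that the empty-lyrics filter on the last two lines only takes effect through the returned frame, so callers must reassign it. A hedged usage sketch, where the `dataset` instance and the toy data are assumptions:

import pandas as pd

# Toy stand-in for the real train_dataset.csv rows.
song_df = pd.DataFrame({'lyrics': ['Some lyrics (live version)', ''],
                        'emotion_4Q': ['Q1', 'Q3']})

# `dataset` is assumed to be an instance of the class this method belongs to.
song_df = dataset._preprocess_lyrics_in_df(song_df,
                                            lemmatization=False,
                                            removing_stop_words=True)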
Example #4
    def predict(self, lyrics: str) -> Optional[Tuple[np.ndarray, np.ndarray]]:
        lyrics = preprocess(lyrics, remove_punctuation=True, remove_text_in_brackets=True)
        if self._removing_stop_words:
            lyrics = remove_stop_words(lyrics)
        if self._lemmatization:
            lyrics = lemmatize_text(lyrics)

        if lyrics == '':
            return None
        else:
            padded_embeddings, length = self._get_padded_embeddings_sequence_and_length(lyrics)
            res = torch.squeeze(self(padded_embeddings, length))
            probs = torch.softmax(res, dim=-1)
            label = probs.argmax(dim=-1, keepdim=True)
            return label.data.numpy(), probs.data.numpy()
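The _get_padded_embeddings_sequence_and_length helper is not shown above; one plausible sketch, assuming the classifier keeps a loaded fastText model in a self._embedding_model attribute (an assumption, not confirmed by the snippet), is:

import numpy as np
import torch


def _get_padded_embeddings_sequence_and_length(self, lyrics: str):
    # Sketch of a method meant to live on the classifier class; self._embedding_model
    # is assumed to be a fastText word-embedding model.
    words = lyrics.split()
    vectors = np.array(
        [self._embedding_model.get_word_vector(word) for word in words])
    # Batch of one sequence: shape (1, seq_len, embedding_dim), plus its true length.
    padded_embeddings = torch.from_numpy(vectors).float().unsqueeze(0)
    length = torch.tensor([len(words)])
    return padded_embeddings, length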
Example #5
def create_fasttext_model(large_dataset: bool,
                          remove_stopwords: bool,
                          lemmatization: bool,
                          dim: int = 200) -> None:
    model_filename = 'fasttext_model_' + str(dim)

    if large_dataset:
        train_dataset_filepath = os.path.join(PROJECT_DIR, 'datasets',
                                              'lyrics-data', 'lyrics-data.csv')
        df = pd.read_csv(train_dataset_filepath, index_col=0)
        df = df[df['Idiom'] == 'ENGLISH']
        lyric_column_name = 'Lyric'
        model_filename += '_large'
    else:
        train_dataset_filepath = os.path.join(PROJECT_DIR, 'datasets',
                                              'train_dataset.csv')
        df = pd.read_csv(train_dataset_filepath, index_col=0)
        lyric_column_name = 'lyrics'

    df[lyric_column_name] = df.apply(
        lambda x: preprocess(x[lyric_column_name],
                             remove_punctuation=True,
                             remove_text_in_brackets=True),
        axis=1)
    if remove_stopwords:
        df[lyric_column_name] = df.apply(
            lambda x: remove_stop_words(x[lyric_column_name]), axis=1)
        model_filename += '_stopwords_removed'

    if lemmatization:
        df[lyric_column_name] = df.apply(
            lambda x: lemmatize_text(x[lyric_column_name]), axis=1)
        model_filename += '_lemmatization'

    model_filename += '.bin'
    lyrics_data = df[lyric_column_name].values

    with open(TEMP_LYRICS_FILENAME, 'w', encoding='utf-8') as f:
        for lyric in lyrics_data:
            f.write(lyric + '\n')  # newline keeps consecutive songs from running together

    model = fasttext.train_unsupervised(TEMP_LYRICS_FILENAME, dim=dim, minn=2)
    model_output = os.path.join(PROJECT_DIR, 'models', 'word_embedding',
                                'saved_models', model_filename)
    model.save_model(model_filename)
    shutil.move(model_filename, model_output)
    os.remove(TEMP_LYRICS_FILENAME)
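A hedged usage sketch: train an embedding model on the small curated dataset and load it back through the fastText API; the filename below simply follows the naming logic in create_fasttext_model.

import os
import fasttext

# Stop-word removal on, lemmatization off, trained on the small train_dataset.csv.
create_fasttext_model(large_dataset=False,
                      remove_stopwords=True,
                      lemmatization=False,
                      dim=200)

model_path = os.path.join(PROJECT_DIR, 'models', 'word_embedding', 'saved_models',
                          'fasttext_model_200_stopwords_removed.bin')
model = fasttext.load_model(model_path)
print(model.get_word_vector('love').shape)  # (200,)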