Example #1
import argparse
import os

import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import text_utils

# `data_path`, `stopwords`, `feature_length`, and `seed` are module-level
# settings defined elsewhere in the original project.


def run(clf, output_csv_path):
    parser = argparse.ArgumentParser(description='NDSC Text Classifier')
    parser.add_argument(
        '--mode',
        type=str,
        default='train',
        metavar='N',
        help='train or test (for submission mode) (default: train)')
    args = parser.parse_args()

    print("\nRunning {}...".format(clf.__name__))
    clf = clf()
    ps = PorterStemmer()  # instantiated here but not used in this snippet

    for cat in ['beauty_image', 'fashion_image', 'mobile_image']:
        df = pd.read_csv(os.path.join(data_path, 'train_' + cat + '.csv'))
        # df2 = pd.read_csv('./translations.txt', sep=';')
        # df['title'] = df2['title']

        # Text cleaning
        df['title'] = text_utils.clean_text(df['title'], stopwords)

        # Vector
        vectorizer = TfidfVectorizer(strip_accents='unicode',
                                     analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     lowercase=True,
                                     max_features=feature_length)

        if args.mode == 'train':
            _, _, X_train, y_train, X_val, y_val = text_utils.data_split(
                df, seed)
            train_vectors = vectorizer.fit_transform(X_train)
            val_vectors = vectorizer.transform(X_val)
            print("Feature size:", train_vectors.shape)
            # Train
            clf.fit(train_vectors, y_train)
            predicted = clf.predict(val_vectors)
            print("Accuracy for {}: {:.2f}%".format(
                cat, accuracy_score(y_val, predicted)))

        elif args.mode == 'test':
            # Train on all labelled data, then predict on the unlabelled
            # test set to build a submission file.
            X_train, y_train = df['title'].values, df['Category'].values
            train_vectors = vectorizer.fit_transform(X_train)

            test_df = pd.read_csv('./test_' + cat + '.csv')
            X_test = test_df['title'].values
            test_vectors = vectorizer.transform(X_test)

            predicted = clf.predict(test_vectors)
            print(predicted)
            print(test_df['itemid'].values)
            with open(output_csv_path, 'a') as f:
                for i in range(len(predicted)):
                    row = '{},{}\n'.format(test_df['itemid'][i], predicted[i])
                    f.write(row)
        else:
            raise ValueError("mode must be 'train' or 'test'")
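
For context, run() instantiates the classifier class it receives (see clf = clf() above), so the caller passes the class itself. A minimal, hypothetical driver (the classifier choice and output path are illustrative, not from the original source):

from sklearn.naive_bayes import MultinomialNB

if __name__ == '__main__':
    # Pass the class, not an instance: run() calls clf() itself.
    run(MultinomialNB, 'submission.csv')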
Example #2
def text2seq(text, seq_type='char'):
    """Convert raw text to a sequence of symbol ids, either character- or
    phoneme-based. `symbols`, `clean_text`, `seq2id`, and `g2p` are assumed
    to be module-level objects (e.g. `g2p` a g2p_en.G2p instance)."""
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    clean_char = clean_text(text.rstrip())

    if seq_type == 'char':
        seq = seq2id(clean_char, symbol_to_id)
    else:
        # Phoneme mode: prefix each phoneme with '@' when the prefixed form
        # exists in the symbol table (a common Tacotron-style convention).
        clean_phone = []
        for s in g2p(clean_char.lower()):
            if '@' + s in symbol_to_id:
                clean_phone.append('@' + s)
            else:
                clean_phone.append(s)
        seq = seq2id(clean_phone, symbol_to_id)

    return seq
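
The seq2id helper is not shown above; assuming it simply maps known symbols to their ids (a guess consistent with how it is called), a minimal sketch plus usage:

def seq2id(seq, symbol_to_id):
    # Hypothetical helper: keep only symbols present in the table.
    return [symbol_to_id[s] for s in seq if s in symbol_to_id]

char_ids = text2seq("Hello world.")                     # character-level ids
phone_ids = text2seq("Hello world.", seq_type='phone')  # phoneme-level ids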
Example #3
import numpy as np

import text_utils


def process_song(row):
    """
    Applied row-wise to a DataFrame (e.g. via df.apply(process_song, axis=1))
    to clean lyrics and record word counts before and after cleaning. If
    cleaning fails (e.g. for lyrics in another language), the row's 'text'
    field is set to np.nan so such rows can easily be dropped later.
    """
    try:
        lyrics = row['text']
        cleaned_lyrics = text_utils.clean_text(lyrics)
        row['cleaned_lyrics'] = cleaned_lyrics
        row['old_word_count'] = len(lyrics.strip().split())
        row['new_word_count'] = len(cleaned_lyrics.strip().split())
    except Exception as e:
        # Cleaning failed (e.g. non-English lyrics): flag the row for removal.
        print(e)
        row['text'] = np.nan
    return row
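
A usage sketch, assuming a DataFrame with a 'text' column of lyrics:

df = df.apply(process_song, axis=1)
df = df.dropna(subset=['text'])  # drop songs whose cleaning failed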
Example #4
import codecs
import os
import pickle as pkl
import re

from g2p_en import G2p

# `symbols`, `clean_text`, and `seq2id` come from the surrounding module.


def precompute_char_phone(path):
    """Precompute character and phoneme id sequences for every utterance
    listed in an LJSpeech-style metadata.csv and pickle them to disk."""
    metadata_file = os.path.join(path, 'metadata.csv')
    char_folder = os.path.join(path, 'chars')
    phone_folder = os.path.join(path, 'phones')
    if not os.path.isdir(char_folder):
        os.makedirs(char_folder)
    if not os.path.isdir(phone_folder):
        os.makedirs(phone_folder)
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    g2p = G2p()

    with codecs.open(metadata_file, 'r', 'utf-8') as metadata:
        for line in metadata:
            # Each line is expected as `id|raw_text|normalized_text`.
            utt_id, _, text = line.split("|")
            utt_id = re.sub(r'"', '', utt_id)
            clean_char = clean_text(text.rstrip())
            char_seq = seq2id(clean_char, symbol_to_id)
            clean_phone = []

            for s in g2p(clean_char.lower()):
                if '@' + s in symbol_to_id:
                    clean_phone.append('@' + s)
                else:
                    clean_phone.append(s)
            phone_seq = seq2id(clean_phone, symbol_to_id)

            char = {'char': clean_char, 'char_seq': char_seq}
            char_file = os.path.join(char_folder, utt_id + '.pkl')
            with open(char_file, 'wb') as f:
                pkl.dump(char, f)

            phone = {'phone': clean_phone, 'phone_seq': phone_seq}
            phone_file = os.path.join(phone_folder, utt_id + '.pkl')
            with open(phone_file, 'wb') as f:
                pkl.dump(phone, f)
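
A usage sketch; the dataset path is hypothetical, and the metadata layout (id|raw_text|normalized_text) is inferred from the split above:

precompute_char_phone('./LJSpeech-1.1')
# Writes ./LJSpeech-1.1/chars/<id>.pkl and ./LJSpeech-1.1/phones/<id>.pkl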
Example #5
import text_utils
import tensorflow as tf  # tf.flags is the TF 1.x flags API (removed in TF 2; use absl.flags there)
import emoji

tf.flags.DEFINE_string("input_file", "../Data/text8.txt",
                       "input file to pre-process")
tf.flags.DEFINE_string("output_file", "../Data/text8.txt.clean",
                       "Output file after pre-processing")

FLAGS = tf.flags.FLAGS

# Read with an explicit encoding and close the file once done.
with open(FLAGS.input_file, "r", encoding="utf-8") as f:
    data_samples = f.readlines()
data_samples = [emoji.demojize(s.strip()) for s in data_samples]

x_text = [text_utils.clean_text(sent) for sent in data_samples]

with open(FLAGS.output_file, "w", encoding="utf-8") as file_writer:
    for line in x_text:
        file_writer.write("%s\n" % line)
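
Invocation sketch (the script filename is hypothetical; the two flags are those defined above):

python clean_corpus.py --input_file ../Data/text8.txt \
                       --output_file ../Data/text8.txt.clean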