Example #1
import argparse
import os

import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import text_utils

# `data_path`, `stopwords`, `feature_length`, and `seed` are module-level
# settings defined elsewhere in the original project.


def run(clf, output_csv_path):
    parser = argparse.ArgumentParser(description='NDSC Text Classifier')
    parser.add_argument(
        '--mode',
        type=str,
        default='train',
        metavar='N',
        help='train or test (for submission mode) (default: train)')
    args = parser.parse_args()

    print("\nRunning {}...".format(clf.__name__))
    clf = clf()
    ps = PorterStemmer()  # instantiated here but not used in this snippet

    for cat in ['beauty_image', 'fashion_image', 'mobile_image']:
        df = pd.read_csv(os.path.join(data_path, 'train_' + cat + '.csv'))
        # df2 = pd.read_csv('./translations.txt', sep=';')
        # df['title'] = df2['title']

        # Text cleaning
        df['title'] = text_utils.clean_text(df['title'], stopwords)

        # Vector
        vectorizer = TfidfVectorizer(strip_accents='unicode',
                                     analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     lowercase=True,
                                     max_features=feature_length)

        if args.mode == 'train':
            _, _, X_train, y_train, X_val, y_val = text_utils.data_split(
                df, seed)
            train_vectors = vectorizer.fit_transform(X_train)
            val_vectors = vectorizer.transform(X_val)
            print("Feature size:", train_vectors.shape)
            # Train
            clf.fit(train_vectors, y_train)
            predicted = clf.predict(val_vectors)
            print("Accuracy for {}: {:.2f}%".format(
                cat, accuracy_score(y_val, predicted)))

        elif args.mode == 'test':
            # Train on all labelled data, then predict on the unlabelled
            # test set to build a submission file.
            X_train, y_train = df['title'].values, df['Category'].values
            train_vectors = vectorizer.fit_transform(X_train)

            test_df = pd.read_csv('./test_' + cat + '.csv')
            X_test = test_df['title'].values
            test_vectors = vectorizer.transform(X_test)

            predicted = clf.predict(test_vectors)
            print(predicted)
            print(test_df['itemid'].values)
            with open(output_csv_path, 'a') as f:
                for i in range(len(predicted)):
                    row = '{},{}\n'.format(test_df['itemid'][i], predicted[i])
                    f.write(row)
        else:
            raise ValueError("mode must be 'train' or 'test'")
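
For context, run() instantiates the classifier class it receives (see clf = clf() above), so the caller passes the class itself. A minimal, hypothetical driver (the classifier choice and output path are illustrative, not from the original source):

from sklearn.naive_bayes import MultinomialNB

if __name__ == '__main__':
    # Pass the class, not an instance: run() calls clf() itself.
    run(MultinomialNB, 'submission.csv')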
Example #2
def text2seq(text, seq_type='char'):
    """Convert raw text to a sequence of symbol ids, either character- or
    phoneme-based. `symbols`, `clean_text`, `seq2id`, and `g2p` are assumed
    to be module-level objects (e.g. `g2p` a g2p_en.G2p instance)."""
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    clean_char = clean_text(text.rstrip())

    if seq_type == 'char':
        seq = seq2id(clean_char, symbol_to_id)
    else:
        # Phoneme mode: prefix each phoneme with '@' when the prefixed form
        # exists in the symbol table (a common Tacotron-style convention).
        clean_phone = []
        for s in g2p(clean_char.lower()):
            if '@' + s in symbol_to_id:
                clean_phone.append('@' + s)
            else:
                clean_phone.append(s)
        seq = seq2id(clean_phone, symbol_to_id)

    return seq
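
The seq2id helper is not shown above; assuming it simply maps known symbols to their ids (a guess consistent with how it is called), a minimal sketch plus usage:

def seq2id(seq, symbol_to_id):
    # Hypothetical helper: keep only symbols present in the table.
    return [symbol_to_id[s] for s in seq if s in symbol_to_id]

char_ids = text2seq("Hello world.")                     # character-level ids
phone_ids = text2seq("Hello world.", seq_type='phone')  # phoneme-level ids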
Example #3
import numpy as np

import text_utils


def process_song(row):
    """
    Applied row-wise to a DataFrame (e.g. via df.apply(process_song, axis=1))
    to clean lyrics and record word counts before and after cleaning. If
    cleaning fails (e.g. for lyrics in another language), the row's 'text'
    field is set to np.nan so such rows can easily be dropped later.
    """
    try:
        lyrics = row['text']
        cleaned_lyrics = text_utils.clean_text(lyrics)
        row['cleaned_lyrics'] = cleaned_lyrics
        row['old_word_count'] = len(lyrics.strip().split())
        row['new_word_count'] = len(cleaned_lyrics.strip().split())
    except Exception as e:
        # Cleaning failed (e.g. non-English lyrics): flag the row for removal.
        print(e)
        row['text'] = np.nan
    return row
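
A usage sketch, assuming a DataFrame with a 'text' column of lyrics:

df = df.apply(process_song, axis=1)
df = df.dropna(subset=['text'])  # drop songs whose cleaning failed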
Example #4
import codecs
import os
import pickle as pkl
import re

from g2p_en import G2p

# `symbols`, `clean_text`, and `seq2id` come from the surrounding module.


def precompute_char_phone(path):
    """Precompute character and phoneme id sequences for every utterance
    listed in an LJSpeech-style metadata.csv and pickle them to disk."""
    metadata_file = os.path.join(path, 'metadata.csv')
    char_folder = os.path.join(path, 'chars')
    phone_folder = os.path.join(path, 'phones')
    if not os.path.isdir(char_folder):
        os.makedirs(char_folder)
    if not os.path.isdir(phone_folder):
        os.makedirs(phone_folder)
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    g2p = G2p()

    with codecs.open(metadata_file, 'r', 'utf-8') as metadata:
        for line in metadata:
            # Each line is expected as `id|raw_text|normalized_text`.
            utt_id, _, text = line.split("|")
            utt_id = re.sub(r'"', '', utt_id)
            clean_char = clean_text(text.rstrip())
            char_seq = seq2id(clean_char, symbol_to_id)
            clean_phone = []

            for s in g2p(clean_char.lower()):
                if '@' + s in symbol_to_id:
                    clean_phone.append('@' + s)
                else:
                    clean_phone.append(s)
            phone_seq = seq2id(clean_phone, symbol_to_id)

            char = {'char': clean_char, 'char_seq': char_seq}
            char_file = os.path.join(char_folder, utt_id + '.pkl')
            with open(char_file, 'wb') as f:
                pkl.dump(char, f)

            phone = {'phone': clean_phone, 'phone_seq': phone_seq}
            phone_file = os.path.join(phone_folder, utt_id + '.pkl')
            with open(phone_file, 'wb') as f:
                pkl.dump(phone, f)
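
A usage sketch; the dataset path is hypothetical, and the metadata layout (id|raw_text|normalized_text) is inferred from the split above:

precompute_char_phone('./LJSpeech-1.1')
# Writes ./LJSpeech-1.1/chars/<id>.pkl and ./LJSpeech-1.1/phones/<id>.pkl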
Example #5
import text_utils
import tensorflow as tf  # tf.flags is the TF 1.x flags API (removed in TF 2; use absl.flags there)
import emoji

tf.flags.DEFINE_string("input_file", "../Data/text8.txt",
                       "input file to pre-process")
tf.flags.DEFINE_string("output_file", "../Data/text8.txt.clean",
                       "Output file after pre-processing")

FLAGS = tf.flags.FLAGS

# Read with an explicit encoding and close the file once done.
with open(FLAGS.input_file, "r", encoding="utf-8") as f:
    data_samples = f.readlines()
data_samples = [emoji.demojize(s.strip()) for s in data_samples]

x_text = [text_utils.clean_text(sent) for sent in data_samples]

with open(FLAGS.output_file, "w", encoding="utf-8") as file_writer:
    for line in x_text:
        file_writer.write("%s\n" % line)
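
Invocation sketch (the script filename is hypothetical; the two flags are those defined above):

python clean_corpus.py --input_file ../Data/text8.txt \
                       --output_file ../Data/text8.txt.clean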