Example #1
def main(args):
    print(create_args_str(args))
    lang, audio, trans, keras, ds, ds_alpha, ds_trie, lm, vocab, target_dir, normalize, gpu = setup(args)
    print(f'all artifacts will be saved to {target_dir}')

    lm = load_lm(lm) if lm else None
    vocab = load_vocab(vocab) if vocab else None

    audio_bytes, sample_rate, transcript, language = preprocess(audio, trans, lang, norm_transcript=normalize)
    voiced_segments = vad(audio_bytes, sample_rate)
    df_alignments = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                             language=language,
                             ds_path=ds, ds_alpha_path=ds_alpha, ds_trie_path=ds_trie,
                             keras_path=keras, lm=lm, vocab=vocab,
                             force_realignment=args.force_realignment, align_endings=args.align_endings,
                             target_dir=target_dir)

    df_stats = calculate_stats(df_alignments, ds, transcript)
    create_demo_files(target_dir, audio, transcript, df_alignments, df_stats)

    print()
    print_dataframe(df_stats)
    print()

    stats_csv = join(target_dir, 'stats.csv')
    print(f'Saving stats to {stats_csv}')
    df_stats.to_csv(stats_csv)
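
A minimal sketch of how this entry point could be invoked from the command line. The argparse wiring below is an assumption for illustration: only force_realignment and align_endings appear in the code above; the remaining flag names and the behaviour of setup() are hypothetical.

# Hypothetical CLI wiring; flag names other than --force_realignment and
# --align_endings are assumptions, not the project's documented interface.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Align a transcript with an audio file')
    parser.add_argument('--language', default='en')
    parser.add_argument('--audio', help='path to the audio file')
    parser.add_argument('--transcript', help='path to the transcript file')
    parser.add_argument('--target_dir', help='directory for all generated artifacts')
    parser.add_argument('--force_realignment', action='store_true')
    parser.add_argument('--align_endings', action='store_true')
    args = parser.parse_args()
    main(args)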
Example #2
import pickle

import pandas as pd

# `p` below refers to the project's preprocessing module, assumed to be imported elsewhere


def predict(new_data):
    # Load in pickled model
    with open('model_2.pkl', 'rb') as f:
        model = pickle.load(f)

    # read in sample data and store as dataframe
    df = pd.json_normalize(new_data)

    # preprocess sample
    sample_clean = p.preprocess(df).prep()

    # REMOVE WHEN LIVE WITH API (data will never have label information)
    if 'acct_type' in sample_clean.columns.to_list():
        sample_clean.drop('acct_type', axis=1, inplace=True)

    # REMOVE IF/WHEN WE DEBUG PIPELINE (drops the extra row the pipeline sometimes produces -- likely an upstream bug)
    if sample_clean.shape[0] > 1:
        sample_clean.drop(0, axis=0, inplace=True)

    # schema that model was trained on
    schema = [
        'delivery_method', 'fb_published', 'has_analytics', 'has_header',
        'has_logo', 'name_length', 'num_order', 'num_payouts', 'org_facebook',
        'org_twitter', 'show_map', 'user_type', 'approx_payout_date',
        'body_length', 'channels', 'event_created', 'event_end',
        'event_published', 'event_start', 'gts', 'sale_duration',
        'sale_duration2', 'user_age', 'venue_latitude', 'venue_longitude',
        'currency_CAD', 'currency_EUR', 'currency_GBP', 'currency_MXN',
        'currency_NZD', 'currency_USD', 'listed_y', 'payout_type_ACH',
        'payout_type_CHECK'
    ]

    # create df with sample data with appropriate schema
    final = pd.DataFrame(data=sample_clean, columns=schema)
    # fill NaNs
    final.fillna(0, inplace=True)
    # all remaining columns are features (the label was dropped above)
    X = final.copy()
    return model.predict_proba(X)
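
An illustrative call to predict(), assuming model_2.pkl and the project's preprocessing module p are available; the event fields and values below are made up for demonstration, and class 1 is assumed to be the fraud label.

# Made-up raw event; real payloads contain the full set of raw fields.
sample_event = {
    'name_length': 23,
    'body_length': 1048,
    'user_age': 365,
    'num_payouts': 0,
    'currency': 'USD',
    'listed': 'y',
}
probs = predict(sample_event)
# predict_proba returns one row of class probabilities for the single sample
print(f'Predicted fraud probability: {probs[0][1]:.3f}')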
Example #3
File: learn.py Project: lkq1992yeah/CompQA
import numpy as np


def gen_training_data():
    tables = load_train_data_new()
    prior, wikigraph, entity_context, weight, all_entities = preprocess()
    features = Features(prior, wikigraph, entity_context, weight, all_entities)
    # positive = []
    # negative = []
    count = 0
    count_miss = 0
    f = open('feat.extracted.txt', 'w')
    for table in tables:
        # candidate_generation(table, prior)
        for cell in table.cells:
            if not cell.blank:
                count = count + 1
                golden_feature = None
                negative_features = []
                # print 'surface: ' + cell.surface + '  target: ' + cell.label
                for mention in cell.mentions:
                    ft = features.extract_features(table, cell.pos, mention,
                                                   mention.entity)
                    print(ft)
                    mention.feature = np.asarray(ft)
                    # print mention.entity,
                    if mention.entity == cell.label:
                        golden_feature = mention.feature
                    else:
                        negative_features.append(mention.feature)
                # print '\n'
                if golden_feature is None:
                    count_miss = count_miss + 1
                    print('not found')
                    continue
                f.write('\t'.join([str(s) for s in golden_feature]) + '\n')
                for neg in negative_features:
                    f.write('\t'.join([str(s) for s in neg]) + '\n')
                f.write('\n')

    print(count, count_miss)
    f.close()
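
The file written above groups features per table cell: the first line of each block is the gold entity's feature vector, the following lines are negative candidates, and blocks are separated by blank lines. A small reader sketch (not part of the project) for loading it back:

import numpy as np

def load_feature_blocks(path='feat.extracted.txt'):
    # returns a list of arrays, one per cell; row 0 is the gold entity's features
    blocks, current = [], []
    with open(path) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                if current:
                    blocks.append(np.array(current, dtype=float))
                    current = []
                continue
            current.append([float(x) for x in line.split('\t')])
    if current:
        blocks.append(np.array(current, dtype=float))
    return blocks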
Example #4
import pipeline
import pandas as pd

df = pipeline.read_load('/Users/erhla/Downloads/projects_2012_2013.csv')

df['date_posted'] = pd.to_datetime(df['date_posted'])
df['datefullyfunded'] = pd.to_datetime(df['datefullyfunded'])
df['days_to_fund'] = df['datefullyfunded'] - df['date_posted']
df['funded_within_60_days'] = (df['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)

df = pipeline.preprocess(df, ['students_reached'], ['primary_focus_area', 'resource_type', 'grade_level'])

feature_dict = {'students_reached': 'discretized',
                'total_price_including_optional_support': 'discretized',
                'school_charter': 'dummy',
                'school_magnet': 'dummy',
                'eligible_double_your_impact_match': 'dummy',
                'teacher_prefix': 'dummy',
                'poverty_level': 'dummy',
                'grade_level': 'dummy',
                'primary_focus_area': 'dummy',
                'resource_type': 'dummy'
               }
df, feature_ls = pipeline.generate_features(df, feature_dict, 10)
pipeline.build_models(df, feature_ls, 'funded_within_60_days', ['DT'])
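
The 60-day label construction above can be checked in isolation with plain pandas; the snippet below uses made-up dates and does not touch the project's pipeline module.

import pandas as pd

toy = pd.DataFrame({
    'date_posted':     ['2012-01-01', '2012-02-15'],
    'datefullyfunded': ['2012-01-20', '2012-06-01'],
})
toy['date_posted'] = pd.to_datetime(toy['date_posted'])
toy['datefullyfunded'] = pd.to_datetime(toy['datefullyfunded'])
toy['days_to_fund'] = toy['datefullyfunded'] - toy['date_posted']
toy['funded_within_60_days'] = (toy['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)
print(toy[['days_to_fund', 'funded_within_60_days']])  # 1 for the first row, 0 for the second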
Example #5
def main(args):
    print(create_args_str(args))
    demo_files, target_dir, keras_path, ds_path, ds_alpha, ds_trie, lm_path, vocab_path, normalize, gpu = setup(
        args)
    num_files = len(demo_files)
    print(
        f'Processing {num_files} audio/transcript samples. All results will be written to {target_dir}'
    )

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    stats_keras, stats_ds = [], []
    for i, (audio, transcript) in enumerate(demo_files):
        print(
            '-----------------------------------------------------------------'
        )
        print(f'{i + 1}/{num_files}: Evaluating pipeline on {audio}')
        print(
            '-----------------------------------------------------------------'
        )
        demo_id = splitext(basename(audio))[0]
        target_dir_ds = join(target_dir, demo_id + '_ds')
        target_dir_keras = join(target_dir, demo_id + '_keras')

        audio_bytes, sample_rate, transcript, language = preprocess(
            audio, transcript, 'en', norm_transcript=normalize)
        voiced_segments = vad(audio_bytes, sample_rate)

        df_alignments_ds = pipeline(voiced_segments=voiced_segments,
                                    sample_rate=sample_rate,
                                    transcript=transcript,
                                    language='en',
                                    ds_path=ds_path,
                                    ds_alpha_path=ds_alpha,
                                    ds_trie_path=ds_trie,
                                    lm_path=lm_path,
                                    force_realignment=args.force_realignment,
                                    align_endings=args.align_endings,
                                    target_dir=target_dir_ds)
        df_stats_ds = calculate_stats(df_alignments_ds, ds_path, transcript)

        df_alignments_keras = pipeline(
            voiced_segments=voiced_segments,
            sample_rate=sample_rate,
            transcript=transcript,
            language='en',
            keras_path=keras_path,
            lm=lm,
            vocab=vocab,
            force_realignment=args.force_realignment,
            align_endings=args.align_endings,
            target_dir=target_dir_keras)
        df_stats_keras = calculate_stats(df_alignments_keras, keras_path,
                                         transcript)

        # average similarity between Keras and DeepSpeech alignments
        av_similarity = np.mean([
            levenshtein_similarity(al_keras, al_ds)
            for (al_keras, al_ds) in zip(df_alignments_keras['alignment'],
                                         df_alignments_ds['alignment'])
        ])

        df_stats_ds['similarity'] = av_similarity
        df_stats_keras['similarity'] = av_similarity
        stats_ds.append(df_stats_ds)
        stats_keras.append(df_stats_keras)

        create_demo_files(target_dir_ds, audio, transcript, df_alignments_ds,
                          df_stats_ds)
        create_demo_files(target_dir_keras, audio, transcript,
                          df_alignments_keras, df_stats_keras)

    df_keras = pd.concat(stats_keras)
    csv_keras = join(target_dir, 'performance_keras.csv')
    df_keras.to_csv(csv_keras)

    df_ds = pd.concat(stats_ds)
    csv_ds = join(target_dir, 'performance_ds.csv')
    df_ds.to_csv(csv_ds)
    print(f'summaries saved to {csv_keras} and {csv_ds}')

    visualize_pipeline_performance(csv_keras, csv_ds, silent=True)
    update_index(target_dir,
                 lang='en',
                 num_aligned=len(demo_files),
                 df_keras=df_keras,
                 keras_path=keras_path,
                 df_ds=df_ds,
                 ds_path=ds_path,
                 lm_path=lm_path,
                 vocab_path=vocab_path)

    print(f'Done! Demos have been saved to {target_dir}')
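
The average-similarity step above compares the Keras and DeepSpeech alignments pairwise. A rough, self-contained sketch of such a measure follows; the project's own levenshtein_similarity may be implemented differently, this only illustrates the idea of 1 - edit_distance / max_length averaged over aligned segment pairs.

import numpy as np

def levenshtein_distance(a, b):
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

def similarity(a, b):
    longest = max(len(a), len(b)) or 1
    return 1.0 - levenshtein_distance(a, b) / longest

pairs = [('the quick fox', 'the quick fix'), ('hello world', 'hello word')]
print(np.mean([similarity(x, y) for x, y in pairs]))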
Example #6
y_col = 'funded_within_60_days'

feature_dict = {'students_reached': 'discretized',
                'total_price_including_optional_support': 'discretized',
                'school_charter': 'dummy',
                'school_magnet': 'dummy',
                'eligible_double_your_impact_match': 'dummy',
                'teacher_prefix': 'dummy',
                'poverty_level': 'dummy',
                'grade_level': 'dummy',
                'primary_focus_area': 'dummy',
                'resource_type': 'dummy'
               }

results = []

for train_start in train_start_dates:
    test, train = pipeline.time_split(df, date_col, train_start, test_length, test_train_offset)
    train = pipeline.preprocess(train, cols_to_fill, cols_to_drop_nas)
    test = pipeline.preprocess(test, cols_to_fill, cols_to_drop_nas)
    train, feature_ls = pipeline.generate_features(train, feature_dict, 10)
    test, feature_ls2 = pipeline.generate_features(test, feature_dict, 10)
    x_cols = list(set(feature_ls) & set(feature_ls2)) #include only feature columns which appear in both testing/training
    eval_metrics = pipeline.build_models(test[x_cols], test[y_col], train[x_cols], train[y_col])
    eval_metrics['train_start'] = train_start
    results.append(eval_metrics)
    
total = pd.concat(results)
total.to_excel('results.xlsx')
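
The rolling-window evaluation above relies on pipeline.time_split, which is project code. Below is a toy sketch of a time-based split in the same spirit; the argument semantics are assumptions rather than the real signature.

import pandas as pd

def toy_time_split(df, date_col, train_start, test_length, offset):
    # train on [train_start, train_start + offset), test on the following test_length window
    test_start = train_start + offset
    test_end = test_start + test_length
    train = df[(df[date_col] >= train_start) & (df[date_col] < test_start)]
    test = df[(df[date_col] >= test_start) & (df[date_col] < test_end)]
    return test, train  # same return order as the call in the loop above

dates = pd.DataFrame({'date_posted': pd.date_range('2012-01-01', periods=300, freq='D')})
test, train = toy_time_split(dates, 'date_posted',
                             pd.Timestamp('2012-01-01'),
                             pd.Timedelta('60 days'),
                             pd.Timedelta('120 days'))
print(len(train), len(test))  # 120 rows to train on, 60 to test on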
    
import pipeline

df = pipeline.read_load('/home/erhla/Downloads/credit-data.csv')
pipeline.explore(df, ['PersonID', 'zipcode'])
df = pipeline.preprocess(df, ['PersonID', 'zipcode'])
df = pipeline.generate_features(df, 'SeriousDlqin2yrs', 'dummy')
df = pipeline.generate_features(df, 'MonthlyIncome', 'discretized', 10)

#categorical columns
df['zipcode'] = df['zipcode'].astype('category')

model, x_test, y_test = pipeline.build_classifier(df, 'SeriousDlqin2yrs', 0.2,
                                                  10, 5)
pipeline.evaluate_classifier(model, x_test, y_test)