from os.path import join


def main(args):
    print(create_args_str(args))
    lang, audio, trans, keras, ds, ds_alpha, ds_trie, lm, vocab, target_dir, normalize, gpu = setup(args)
    print(f'all artifacts will be saved to {target_dir}')

    lm = load_lm(lm) if lm else None
    vocab = load_vocab(vocab) if vocab else None

    audio_bytes, sample_rate, transcript, language = preprocess(audio, trans, lang, norm_transcript=normalize)
    voiced_segments = vad(audio_bytes, sample_rate)
    df_alignments = pipeline(voiced_segments=voiced_segments,
                             sample_rate=sample_rate,
                             transcript=transcript,
                             language=language,  # was hard-coded to 'en', ignoring the language returned by preprocess()
                             ds_path=ds,
                             ds_alpha_path=ds_alpha,
                             ds_trie_path=ds_trie,
                             keras_path=keras,
                             lm=lm,
                             vocab=vocab,
                             force_realignment=args.force_realignment,
                             align_endings=args.align_endings,
                             target_dir=target_dir)

    df_stats = calculate_stats(df_alignments, ds, transcript)
    create_demo_files(target_dir, audio, transcript, df_alignments, df_stats)

    print()
    print_dataframe(df_stats)
    print()

    stats_csv = join(target_dir, 'stats.csv')
    print(f'Saving stats to {stats_csv}')
    df_stats.to_csv(stats_csv)  # was df_alignments.to_csv, which contradicted the message and the file name
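# Entry-point sketch: only force_realignment and align_endings are known to
# be read from args by main() above; the remaining flags are assumptions
# about what setup(args) expects.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Align a transcript with an audio file')
    parser.add_argument('--audio', help='path to the audio file (hypothetical flag)')
    parser.add_argument('--transcript', help='path to the transcript file (hypothetical flag)')
    parser.add_argument('--force_realignment', action='store_true')
    parser.add_argument('--align_endings', action='store_true')
    main(parser.parse_args())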
import pickle

import pandas as pd


def predict(new_data):
    # Load the pickled model
    with open('model_2.pkl', 'rb') as f:
        model = pickle.load(f)

    # Read in the sample data and store it as a dataframe
    df = pd.json_normalize(new_data)

    # Preprocess the sample (p is the project's preprocessing module)
    sample_clean = p.preprocess(df).prep()

    # REMOVE WHEN LIVE WITH API (live data will never carry label information)
    if 'acct_type' in sample_clean.columns.to_list():
        sample_clean.drop('acct_type', axis=1, inplace=True)

    # REMOVE IF/WHEN WE DEBUG PIPELINE (drops the extra row the pipeline sometimes produces)
    if sample_clean.shape[0] > 1:
        sample_clean.drop(0, axis=0, inplace=True)

    # Schema the model was trained on
    schema = [
        'delivery_method', 'fb_published', 'has_analytics', 'has_header',
        'has_logo', 'name_length', 'num_order', 'num_payouts', 'org_facebook',
        'org_twitter', 'show_map', 'user_type', 'approx_payout_date',
        'body_length', 'channels', 'event_created', 'event_end',
        'event_published', 'event_start', 'gts', 'sale_duration',
        'sale_duration2', 'user_age', 'venue_latitude', 'venue_longitude',
        'currency_CAD', 'currency_EUR', 'currency_GBP', 'currency_MXN',
        'currency_NZD', 'currency_USD', 'listed_y', 'payout_type_ACH',
        'payout_type_CHECK'
    ]

    # Reindex the sample onto the training schema; columns the sample lacks become NaN
    final = pd.DataFrame(data=sample_clean, columns=schema)

    # Fill NaNs introduced by missing columns
    final.fillna(0, inplace=True)

    # Predict class probabilities for the sample
    X = final.copy()
    return model.predict_proba(X)
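# Usage sketch (hypothetical): call predict() with a single event received as
# a JSON-like dict. The field values below are invented for illustration; any
# training-schema columns the event lacks are reindexed to NaN and filled
# with 0 inside predict().
if __name__ == '__main__':
    sample_event = {
        'delivery_method': 0.0,
        'fb_published': 0,
        'has_analytics': 0,
        'has_logo': 1,
        'name_length': 27,
        'body_length': 1432,
        'user_age': 312,
    }
    probs = predict(sample_event)           # shape: (1, n_classes)
    print(f'P(fraud) = {probs[0][1]:.3f}')  # assumes class 1 == fraud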
import numpy as np


def gen_training_data():
    tables = load_train_data_new()
    prior, wikigraph, entity_context, weight, all_entities = preprocess()
    features = Features(prior, wikigraph, entity_context, weight, all_entities)

    count = 0       # non-blank cells seen
    count_miss = 0  # cells whose gold entity was not among the candidates

    with open('feat.extracted.txt', 'w') as f:
        for table in tables:
            for cell in table.cells:
                if cell.blank:
                    continue
                count += 1
                golden_feature = None
                negative_features = []
                for mention in cell.mentions:
                    ft = features.extract_features(table, cell.pos, mention, mention.entity)
                    print(ft)
                    mention.feature = np.asarray(ft)
                    if mention.entity == cell.label:
                        golden_feature = mention.feature
                    else:
                        negative_features.append(mention.feature)
                # Skip cells where the gold entity was never generated as a candidate
                if golden_feature is None:
                    count_miss += 1
                    print('not found')
                    continue
                # One block per cell: the gold feature vector first, then the
                # negatives, terminated by a blank line
                f.write('\t'.join(str(s) for s in golden_feature) + '\n')
                for neg in negative_features:
                    f.write('\t'.join(str(s) for s in neg) + '\n')
                f.write('\n')
    print(count, count_miss)
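# Reader sketch (hypothetical helper, not part of the original pipeline):
# feat.extracted.txt holds one tab-separated feature vector per line, gold
# vector first, negatives after, with a blank line between cells -- this
# follows from the writer above.
def read_feature_blocks(path='feat.extracted.txt'):
    golden, negatives = None, []
    with open(path) as feats:
        for line in feats:
            line = line.strip()
            if not line:  # blank line closes the current cell's block
                if golden is not None:
                    yield golden, negatives
                golden, negatives = None, []
                continue
            vec = [float(x) for x in line.split('\t')]
            if golden is None:
                golden = vec
            else:
                negatives.append(vec)
    if golden is not None:  # tolerate a file without a trailing blank line
        yield golden, negatives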
import pandas as pd

import pipeline

df = pipeline.read_load('/Users/erhla/Downloads/projects_2012_2013.csv')
df['date_posted'] = pd.to_datetime(df['date_posted'])
df['datefullyfunded'] = pd.to_datetime(df['datefullyfunded'])
df['days_to_fund'] = df['datefullyfunded'] - df['date_posted']
# 1 if the project was funded within 60 days, 0 otherwise (equivalent to the
# original pd.get_dummies(..., drop_first=True), which silently breaks when
# only one value is present)
df['funded_within_60_days'] = (df['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)

df = pipeline.preprocess(df, ['students_reached'],
                         ['primary_focus_area', 'resource_type', 'grade_level'])

feature_dict = {'students_reached': 'discretized',
                'total_price_including_optional_support': 'discretized',
                'school_charter': 'dummy',
                'school_magnet': 'dummy',
                'eligible_double_your_impact_match': 'dummy',
                'teacher_prefix': 'dummy',
                'poverty_level': 'dummy',
                'grade_level': 'dummy',
                'primary_focus_area': 'dummy',
                'resource_type': 'dummy'}

df, feature_ls = pipeline.generate_features(df, feature_dict, 10)
pipeline.build_models(df, feature_ls, 'funded_within_60_days', ['DT'])
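# Sanity check (illustration only): the label construction above on a toy
# frame, using plain pandas semantics -- a 30-day gap counts as funded in
# time, a ~152-day gap does not.
toy = pd.DataFrame({
    'date_posted': pd.to_datetime(['2012-01-01', '2012-01-01']),
    'datefullyfunded': pd.to_datetime(['2012-01-31', '2012-06-01']),
})
toy['days_to_fund'] = toy['datefullyfunded'] - toy['date_posted']
toy['funded_within_60_days'] = (toy['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)
assert toy['funded_within_60_days'].tolist() == [1, 0]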
import numpy as np
import pandas as pd
from os.path import basename, join, splitext


def main(args):
    print(create_args_str(args))
    demo_files, target_dir, keras_path, ds_path, ds_alpha, ds_trie, lm_path, vocab_path, normalize, gpu = setup(args)
    num_files = len(demo_files)
    print(f'Processing {num_files} audio/transcript samples. All results will be written to {target_dir}')

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    stats_keras, stats_ds = [], []
    for i, (audio, transcript) in enumerate(demo_files):
        print('-----------------------------------------------------------------')
        print(f'{i + 1}/{num_files}: Evaluating pipeline on {audio}')
        print('-----------------------------------------------------------------')
        demo_id = splitext(basename(audio))[0]
        target_dir_ds = join(target_dir, demo_id + '_ds')
        target_dir_keras = join(target_dir, demo_id + '_keras')

        audio_bytes, sample_rate, transcript, language = preprocess(audio, transcript, 'en',
                                                                    norm_transcript=normalize)
        voiced_segments = vad(audio_bytes, sample_rate)

        df_alignments_ds = pipeline(voiced_segments=voiced_segments,
                                    sample_rate=sample_rate,
                                    transcript=transcript,
                                    language='en',
                                    ds_path=ds_path,
                                    ds_alpha_path=ds_alpha,
                                    ds_trie_path=ds_trie,
                                    lm_path=lm_path,  # was lm_path=lm, passing the loaded LM where a path is expected
                                    force_realignment=args.force_realignment,
                                    align_endings=args.align_endings,
                                    target_dir=target_dir_ds)
        df_stats_ds = calculate_stats(df_alignments_ds, ds_path, transcript)

        df_alignments_keras = pipeline(voiced_segments=voiced_segments,
                                       sample_rate=sample_rate,
                                       transcript=transcript,
                                       language='en',
                                       keras_path=keras_path,
                                       lm=lm,
                                       vocab=vocab,
                                       force_realignment=args.force_realignment,
                                       align_endings=args.align_endings,
                                       target_dir=target_dir_keras)
        df_stats_keras = calculate_stats(df_alignments_keras, keras_path, transcript)

        # average similarity between Keras and DeepSpeech alignments
        av_similarity = np.mean([levenshtein_similarity(al_keras, al_ds)
                                 for (al_keras, al_ds) in zip(df_alignments_keras['alignment'],
                                                              df_alignments_ds['alignment'])])
        df_stats_ds['similarity'] = av_similarity
        df_stats_keras['similarity'] = av_similarity

        stats_ds.append(df_stats_ds)
        stats_keras.append(df_stats_keras)

        create_demo_files(target_dir_ds, audio, transcript, df_alignments_ds, df_stats_ds)
        create_demo_files(target_dir_keras, audio, transcript, df_alignments_keras, df_stats_keras)

    df_keras = pd.concat(stats_keras)
    csv_keras = join(target_dir, 'performance_keras.csv')
    df_keras.to_csv(csv_keras)

    df_ds = pd.concat(stats_ds)
    csv_ds = join(target_dir, 'performance_ds.csv')
    df_ds.to_csv(csv_ds)
    print(f'summaries saved to {csv_keras} and {csv_ds}')

    visualize_pipeline_performance(csv_keras, csv_ds, silent=True)
    update_index(target_dir, lang='en', num_aligned=len(demo_files),
                 df_keras=df_keras, keras_path=keras_path,
                 df_ds=df_ds, ds_path=ds_path,
                 lm_path=lm_path, vocab_path=vocab_path)
    print(f'Done! Demos have been saved to {target_dir}')
import pandas as pd

import pipeline

y_col = 'funded_within_60_days'
feature_dict = {'students_reached': 'discretized',
                'total_price_including_optional_support': 'discretized',
                'school_charter': 'dummy',
                'school_magnet': 'dummy',
                'eligible_double_your_impact_match': 'dummy',
                'teacher_prefix': 'dummy',
                'poverty_level': 'dummy',
                'grade_level': 'dummy',
                'primary_focus_area': 'dummy',
                'resource_type': 'dummy'}

results = []
for train_start in train_start_dates:
    test, train = pipeline.time_split(df, date_col, train_start, test_length, test_train_offset)
    train = pipeline.preprocess(train, cols_to_fill, cols_to_drop_nas)
    test = pipeline.preprocess(test, cols_to_fill, cols_to_drop_nas)
    train, feature_ls = pipeline.generate_features(train, feature_dict, 10)
    test, feature_ls2 = pipeline.generate_features(test, feature_dict, 10)
    # include only feature columns which appear in both the test and train sets
    x_cols = list(set(feature_ls) & set(feature_ls2))
    eval_metrics = pipeline.build_models(test[x_cols], test[y_col], train[x_cols], train[y_col])
    eval_metrics['train_start'] = train_start
    results.append(eval_metrics)

total = pd.concat(results)
total.to_excel('results.xlsx')
import pipeline

df = pipeline.read_load('/home/erhla/Downloads/credit-data.csv')
pipeline.explore(df, ['PersonID', 'zipcode'])
df = pipeline.preprocess(df, ['PersonID', 'zipcode'])
df = pipeline.generate_features(df, 'SeriousDlqin2yrs', 'dummy')
df = pipeline.generate_features(df, 'MonthlyIncome', 'discretized', 10)

# categorical columns
df['zipcode'] = df['zipcode'].astype('category')

model, x_test, y_test = pipeline.build_classifier(df, 'SeriousDlqin2yrs', 0.2, 10, 5)
pipeline.evaluate_classifier(model, x_test, y_test)