class Pipeline(BaseEstimator, TransformerMixin):
    """Two-stage sklearn-style pipeline: feature generation, then preprocessing.

    Wraps a ``FeatureGenerator`` followed by a ``Preprocessor``. Both are
    constructed lazily inside :meth:`fit_transform` from the column metadata
    given at construction time; :meth:`transform` reuses the fitted pair.

    Parameters
    ----------
    numeric : list
        Names of numeric columns.
    id : str, optional
        Name of the identifier column (parameter name shadows the builtin
        ``id`` but is kept for backward compatibility with callers).
    target : str, optional
        Name of the target column.
    categorical : list, optional
        Names of categorical columns.
    verbose : int
        Verbosity level forwarded to the sub-components and ``Timer``.
    """

    def __init__(self, numeric, id=None, target=None, categorical=None,
                 verbose=0):
        # Returned as-is by get_feature_names(); stays None until something
        # populates it (nothing in this class does — TODO confirm intent).
        self.created_features = None
        self.id = id
        self.target = target
        self.categorical = categorical
        self.numeric = numeric
        self.verbose = verbose
        # Set on the first fit_transform() call; None means "not fitted yet".
        self.feature_generator = None
        self.preprocessor = None

    def fit_transform(self, df, y=None, **fit_params):
        """Fit the feature generator and preprocessor on *df*; return the
        transformed matrix. *y* and *fit_params* are accepted for sklearn
        API compatibility but unused."""
        with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
            self.feature_generator = FeatureGenerator(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            df_features = self.feature_generator.fit_transform(df)
            self.preprocessor = Preprocessor(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            x = self.preprocessor.fit_transform(df_features)
            return x

    def transform(self, df):
        """Transform *df* with the already-fitted components.

        Raises
        ------
        NotFittedError
            If :meth:`fit_transform` has not been called yet.
        """
        with Timer('pipelines.Pipeline.transform:', self.verbose):
            if self.feature_generator is None:
                raise NotFittedError(
                    f'feature_generator = {self.feature_generator}')
            if self.preprocessor is None:
                raise NotFittedError(f'preprocessor = {self.preprocessor}')
            df_features = self.feature_generator.transform(df)
            x = self.preprocessor.transform(df_features)
            return x

    def fit(self, x, y=None, **fit_params):
        """No-op; all fitting happens in fit_transform(). Returns self."""
        return self

    def get_feature_names(self):
        """Return the generated feature names (None until populated)."""
        return self.created_features
def custom_scorer(Y_true, Y_pred, **kwargs):
    """Score a prediction by the Euclidean distance between the feature
    vectors reconstructed from the true and predicted outputs.

    Both outputs are decoded to (note, duration) via argmax sampling and
    re-encoded as features; lower scores mean closer predictions.
    """
    true_note, true_dur = TeacherGenerator.y_to_note_dur(
        Y_true.squeeze(), sampler=TeacherGenerator.take_argmax)
    pred_note, pred_dur = TeacherGenerator.y_to_note_dur(
        Y_pred.squeeze(), sampler=TeacherGenerator.take_argmax)
    true_feat = FeatureGenerator.construct_single_feature(true_note, true_dur)
    pred_feat = FeatureGenerator.construct_single_feature(pred_note, pred_dur)
    # Equivalent to sqrt(sum((true - pred) ** 2)).
    return np.linalg.norm(true_feat - pred_feat)
def fit_transform(self, df, y=None, **fit_params):
    """Build and fit the feature generator and preprocessor on *df*,
    then return the transformed design matrix."""
    component_kwargs = dict(
        id=self.id,
        numeric=self.numeric,
        categorical=self.categorical,
        target=self.target,
        verbose=self.verbose,
    )
    with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
        self.feature_generator = FeatureGenerator(**component_kwargs)
        generated = self.feature_generator.fit_transform(df)
        self.preprocessor = Preprocessor(**component_kwargs)
        return self.preprocessor.fit_transform(generated)
def make_validation_datasets():
    """Build and persist validation datasets for each feature configuration.

    For every configuration, re-reads the raw positions file, builds the
    dataset with a freshly configured ``FeatureGenerator``, and dumps the
    resulting ``(X, yraw, rnds)`` tuple with joblib.
    """
    raw_path = "datasets/positions_chunks.json00"
    # (FeatureGenerator kwargs, output path) per configuration — previously
    # three copy-pasted build-and-dump stanzas.
    configs = [
        ({}, "datasets/positions_1_squares_flags.joblib"),
        ({"attacked_squares": True},
         "datasets/positions_1_squares_flags_attackers.joblib"),
        ({"attacked_squares": True, "pins": True},
         "datasets/positions_1_squares_flags_attackers_pins.joblib"),
    ]
    for fg_kwargs, out_path in configs:
        feature_generator = FeatureGenerator(**fg_kwargs)
        with open(raw_path) as data:
            X, yraw, rnds = make_dataset(data, feature_generator)
        joblib.dump((X, yraw, rnds), out_path, compress=2)
def make_inferences(lr, X, dur_predict, sampler):
    """Autoregressively generate inferences until *dur_predict* is filled.

    Parameters
    ----------
    lr : fitted estimator exposing ``predict``
    X : np.ndarray
        Current input window. Each step drops the oldest 6 rows and appends
        the feature of the newly generated note (assumes one feature spans
        6 rows — TODO confirm against FeatureGenerator).
    dur_predict : numeric
        Total duration of music to generate.
    sampler : callable
        Sampling strategy passed to ``TeacherGenerator.y_to_note_dur``.

    Returns
    -------
    list of Inference
        Generated (note, duration) inferences, in order.
    """
    inferences = []
    while get_inferenced_time(inferences) < dur_predict:
        Y = lr.predict(X.reshape(1, -1)).squeeze()
        inform_output(Y, inferences)
        # Y is already squeezed above; no second squeeze needed.
        inference = Inference(
            TeacherGenerator.y_to_note_dur(Y, sampler=sampler))
        inferences.append(inference)
        # Slide the window: drop the oldest feature, append the new one.
        X = np.hstack((
            X[6:, ...],
            FeatureGenerator.construct_single_feature(
                inference.note, inference.duration)))
    return inferences
def make_inferences(lr, X, dur_predict, sampler):
    # Autoregressively generate (note, duration) inferences until the
    # requested total duration is reached. Once at least 4 inferences exist,
    # the raw note probabilities are re-weighted with a Gaussian centred
    # near the previous note (favouring stepwise motion), with hard-coded
    # special cases for a recognised three-note pattern.
    inferences = []
    while get_inferenced_time(inferences) < dur_predict:
        Y = lr.predict(X.reshape(1, -1))
        if len(inferences) < 4:
            # Not enough history yet: sample directly from the model output.
            inference = Inference(
                TeacherGenerator.y_to_note_dur(
                    Y.squeeze(), sampler=sampler)
            )
        else:
            # Last three generated notes, encoded relative to the voice's
            # lowest note, offset by +1 (presumably so 0 can denote a rest
            # — TODO confirm against TeacherGenerator's encoding).
            prev_notes = []
            min_note = TeacherGenerator._min_note
            for i in range(3):
                prev_notes.append(inferences[-4+i].note - min_note + 1)
            P = Y.squeeze()
            # Split the prediction: p = note probabilities, d = the trailing
            # 19 duration entries, replaced by ones (left unweighted).
            p = P[:-19]
            d = np.ones(len(P)-len(p))
            # voice 0 range = (54,76)
            # voice 1 range = (45,71)
            # voice 2 range = (40,62)
            # voice 3 range = (28,54)
            if prev_notes[2] > 0:
                # Octave offset chosen from the voice's lowest note
                # (see ranges above): 28 -> voice 3, 40 -> voice 2,
                # 45 -> voice 1, otherwise voice 0.
                if min_note == 28:
                    oct = 0
                elif min_note == 40:
                    oct = 12
                elif min_note == 45:
                    oct = 12
                else:
                    oct = 12*2
                # Hard-coded continuations for a specific melodic pattern
                # around absolute pitches 37/38/40 (+octave); otherwise a
                # Gaussian biases the next note toward the previous one.
                if prev_notes[2] == 38 + oct - min_note:
                    pdf = norm.pdf(np.arange(1,len(p)+1,1), loc = 37 + oct - min_note, scale = 1)
                elif prev_notes[2] == 37 + oct - min_note and prev_notes[1] == 38 + oct - min_note:
                    pdf = norm.pdf(np.arange(1,len(p)+1,1), loc = 40 + oct - min_note, scale = 1)
                elif prev_notes[2] == 40 + oct - min_note and prev_notes[1] == 37 + oct - min_note and prev_notes[0] == 38 + oct - min_note:
                    pdf = norm.pdf(np.arange(1,len(p)+1,1), loc = 39 + oct - min_note, scale = 1)
                else:
                    pdf = norm.pdf(np.arange(1,len(p)+1,1), loc = prev_notes[2]+1 , scale = 1)
                # Scale probabilities by the (amplified) Gaussian weights.
                p = p*(pdf*100)
                # Forbid immediate repetition of the previous note.
                p[prev_notes[2]] = 0
            # Recombine weighted note probabilities with neutral durations.
            P = np.concatenate((p,d))
            inference = Inference(
                TeacherGenerator.y_to_note_dur(
                    P, prev_notes, TeacherGenerator._min_note,
                    sampler=sampler)
            )
        inferences.append(inference)
        # Slide the input window: drop the oldest 6 rows, append the feature
        # of the note just generated.
        X = np.hstack((
            X[6:, ...],
            FeatureGenerator.construct_single_feature(
                inference.note, inference.duration
            )
        ))
    return inferences
import os from glob import glob import numpy as np from tqdm import tqdm from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn from tensorflow.contrib.training import HParams from tensorflow.contrib.learn import RunConfig from lib import create_estimator, model_dir, POSSIBLE_LABELS, params, id2name, FINGERPRINT_KEY, getMfcc, getTransformedAudioLocal from features import FeatureGenerator featureGenerator = FeatureGenerator(params) TEST_BATCH_SIZE=64 TEST_DATA_PATHS = glob('../../data/test/audio/*wav') def test_data_generator(): for path in TEST_DATA_PATHS: fname = os.path.basename(path) result = dict(fname=np.string_(fname)) audio_options = dict( fname=path, desired_samples=16000, fg_vol=1, bg_data=[], bg_vol=0, clip_min=-1.0, clip_max=1.0, time_shift_samples=0,
out = \ {sampler: [None for _ in range(no_top)] for sampler in samplers} all_voice_inferences = \ {sampler: [[] for _ in range(no_top)] for sampler in samplers} log = np.array( ['voice', 'experiment', 'alpha', 'window size', 'mean score']).reshape(1, -1) for voice in voices: print('\n-------- VOICE %s --------' % voice) # Transform data to input and teacher matrices notes, durations = transform.encode_duration(raw_input, voice) features = FeatureGenerator.construct_features(notes, durations) # X, indices = transform.windowed(features, window_size=windows[0]) # Y = TeacherGenerator.construct_teacher(notes, durations, indices) # Train a ridge regression model # lr = obtain_optimal_model(X[:-1, ...], Y, alphas) top, nlog = obtain_optimal_model(features, notes, durations, alphas, windows, log, voice) log = nlog for (idx, model) in enumerate(top): no = len(top) - idx lr = model[3] X = model[4]
def create_combined_review_data_set(review_file_name):
    """Load reviews from *review_file_name* and build a labelled DataSet.

    The label for each review is its star count; the feature vector
    combines raw vote counts with text- and business-derived features
    computed by ``FeatureGenerator``.

    Parameters
    ----------
    review_file_name : str
        Path to the JSON reviews file understood by ``load_json``.

    Returns
    -------
    DataSet
        Feature matrix X (lists of numeric features) and labels y
        (star counts as ints).
    """
    data = load_json(review_file_name)
    X = []
    y = []
    # Feature generation lives in a separate class because some features
    # need the context of the entire dataset (e.g. counts of words most
    # widespread across all data) and may look up yelp user accounts.
    fg = FeatureGenerator(data)
    for idx, datum in enumerate(data):
        # Labels are the star counts.
        y.append(int(datum['stars']))
        business_id = datum['business_id']
        # Features must be NUMERIC (ints or floats) and must NOT include
        # the 'stars' field itself.
        feature_vector = [
            int(datum['votes']['cool']),
            int(datum['votes']['funny']),
            int(datum['votes']['useful']),
        ]
        # NOTE(review): removed an unused TextBlob word extraction
        # (`fg.get_blob(idx).words.lower().singularize()`); re-add it when
        # the word-count features below are implemented.
        # TODO: add features of selected word counts — needs corpus-wide
        # processing to decide which words matter.
        feature_vector.append(fg.generate_subjectivity(idx))
        feature_vector.append(fg.generate_polarity(idx))
        feature_vector.append(fg.generate_length(idx))
        feature_vector.append(fg.generate_num_sentences(idx))
        feature_vector.append(fg.generate_avg_sentence_len(idx))
        feature_vector.append(fg.generate_count_exclamation(idx))
        feature_vector.append(fg.generate_punctuation_to_sentence_ratio(idx))
        feature_vector.append(fg.generate_number_of_all_cap_words(idx))
        # Pairwise word-similarity features at distances 1..5.
        for distance in range(1, 6):
            feature_vector.append(
                fg.generate_similarity_between_words(idx, distance))
        feature_vector.append(
            fg.generate_average_stars_cluster(idx, business_id))
        feature_vector.append(
            fg.generate_num_businesses_in_area(idx, business_id))
        feature_vector.append(
            fg.generate_number_of_tips(idx, datum['user_id'], business_id))
        feature_vector.append(
            fg.generate_business_latitude(idx, business_id))
        feature_vector.append(
            fg.generate_business_longitude(idx, business_id))
        X.append(feature_vector)
    return DataSet(X, y)
def getBgVol(background_frequency, background_volume_range):
    """With probability *background_frequency*, return a background volume
    drawn uniformly from [0, background_volume_range); otherwise 0."""
    if np.random.uniform(0, 1) < background_frequency:
        return np.random.uniform(0, background_volume_range)
    return 0


##=========================================================
## Actual computations start here
##=========================================================
featureGenerator = FeatureGenerator(params, getBgFileNames(DATADIR))

train_meta_list, val_meta_list = get_metadata_lists(DATADIR)
# Pad both splits with synthetic "silence" examples.
augmentWithSilence(train_meta_list, SILENCE_PCT)
augmentWithSilence(val_meta_list, SILENCE_PCT)
print('Augumented sizes: Train: {}. Val: {}'.format(len(train_meta_list),
                                                    len(val_meta_list)))

train_input_fn = generator_input_fn(
    x=data_generator_fn(train_meta_list, BG_PARAMS, 'train'),
    target_key=TARGET_KEY,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_epochs=None,
    queue_capacity=3 * BATCH_SIZE + 10,
    num_threads=1,
)