def extract(): logging.info('Begin extract') """ # Use for batch parsing candidate_file_agg = list() for root, subdirs, files in os.walk(lib.get_conf('resume_directory')): folder_files = map(lambda x: os.path.join(root, x), files) # [os.path.join(root, x) for x in files] candidate_file_agg.extend(folder_files) """ candidate_file = list() candidate_file.append(sys.argv[1]) # Convert list to a pandas DataFrame observations = pandas.DataFrame(data=candidate_file, columns=['file_path']) logging.info('Found {} candidate file(s)'.format(len(observations.index))) # Subset candidate files to supported extensions observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1]) observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)] logging.info('Took candidate file(s) with appropriate file format(s). {} file(s) remain'. format(len(observations.index))) with open(candidate_file[0], 'r') as cv: # needs to be utf-8 encoded text = cv.read() # Attempt to extract text from files observations['text'] = text # observations['file_path'].apply(text_extract_utf8) # Archive schema and return lib.archive_dataset_schemas('extract', locals(), globals()) logging.info('End extract') return observations
def load(observations, transformation_pipeline, trained_model):
    logging.info('Begin load')

    # Reference variables
    lib.get_temp_dir()

    observations_path = os.path.join(lib.get_temp_dir(), 'observations.csv')
    logging.info('Saving observations to path: {}'.format(observations_path))
    observations.to_csv(observations_path, index=False)

    if transformation_pipeline is not None:
        transformation_pipeline_path = os.path.join(lib.get_temp_dir(), 'transformation_pipeline.pkl')
        logging.info('Saving transformation_pipeline to path: {}'.format(transformation_pipeline_path))
        cPickle.dump(transformation_pipeline, open(transformation_pipeline_path, 'w+'))

    if trained_model is not None:
        trained_model_path = os.path.join(lib.get_temp_dir(), 'trained_model.pkl')
        logging.info('Saving trained_model to path: {}'.format(trained_model_path))
        cPickle.dump(trained_model, open(trained_model_path, 'w+'))

    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass

def transform(observations, nlp):
    logging.info('Begin transform')
    print("Extracting name, email, phone, GPA, and dates of work experience")

    observations = observations.fillna('')

    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['GPA'] = observations['text'].apply(
        lambda x: field_extraction.gpa_extractor(x))

    observations['years_experience'] = observations['Work'].apply(
        lambda x: field_extraction.years_of_experience(x))
    observations['mos_experience'] = field_extraction.months_of_experience(observations['years_experience'])

    # observations['work_dates'] = observations['Work'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x).replace('\n', '. '), nlp, 'DATE', 2))
    # observations['uni'] = observations['Edu'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x), nlp, 'ORG', 2))

    # Search for terms in whole resume
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations

def extract():
    logging.info('Begin extract')

    observations = pandas.DataFrame()

    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations

def transform(observations):
    logging.info('Begin transform')

    # Transform newsgroup20 data set

    # Newsgroup20: Extract article filename from document path
    observations['filename'] = observations['document_path'].apply(lambda x: ntpath.basename(x))

    # Newsgroup20: Extract article category from document path
    observations['category'] = observations['document_path'].apply(lambda x: ntpath.basename(os.path.dirname(x)))

    # Newsgroup20: Extract article text (and strip article headers), from document path
    observations['text'] = observations['document_path'].apply(lambda x: lib.strip_header(open(x).readlines()))

    # Remove non-ascii characters
    observations['text'] = observations['text'].apply(lambda x: x.decode('ascii', errors='ignore'))

    # Newsgroup20: Convert text to normalized tokens. Unknown tokens will map to 'UNK'.
    observations['tokens'] = observations['text'].apply(simple_preprocess)

    # Newsgroup20: Create bigrams
    observations['bigrams'] = observations['text'].apply(lambda x: lib.find_ngrams(x, n=2))

    # Newsgroup20: Create modeling text
    observations['modeling_text_list'] = observations['tokens'] + observations['bigrams']
    observations['modeling_text'] = observations['modeling_text_list'].apply(lambda x: ' '.join(x))

    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations

def extract():
    logging.info('Begin extract')

    candidate_file_agg = list()  # for creating list of resume file paths

    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):  # gets path to resumes from yaml file
        # os.walk(parentdir + '/data/input/example_resumes'): would do the same thing
        files = filter(lambda f: f.endswith(('.pdf', '.PDF')), files)  # only read pdfs
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    observations = pd.DataFrame(data=candidate_file_agg, columns=['file_path'])  # convert to df
    logging.info('Found {} candidate files'.format(len(observations.index)))

    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])  # e.g. pdf or doc
    observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
    logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'.format(
        len(observations.index)))

    observations['text'] = observations['file_path'].apply(lib.convert_pdf)  # get text from .pdf files

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())  # saving the schema
    logging.info('End extract')
    return observations

def model(observations):
    logging.info('Begin model')

    # Resources
    vocabulary = set(itertools.chain.from_iterable(observations['modeling_text_list']))
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    # Create train, test sets
    msk = numpy.random.rand(len(observations)) < 0.8
    train = observations[msk]
    test = observations[~msk]

    # Create X, y vectors
    X_train = vectorizer.fit_transform(train['modeling_text']).todense()
    y_train = train['category']
    X_test = vectorizer.transform(test['modeling_text']).todense()
    y_test = test['category']

    # Create, train model
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # Create predictions, using trained model
    test['preds'] = nb.predict(X_test)
    scores = nb.score(X_test, y_test)
    logging.info('Scores: {}'.format(scores))

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return observations, vectorizer, nb, test

def extract():
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg, columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
    logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'.format(
        len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(text_extract_utf8)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations

def transform(observations, nlp):
    # TODO Docstring
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract university
    observations['universities'] = observations['text'].apply(field_extraction.extract_universities)

    # Extract skills
    observations['skills'] = observations['text'].apply(field_extraction.extract_skills)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp

def transform(observations, nlp):
    # TODO Docstring
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: extract_entities.candidate_name_extractor(x, nlp))

    # Extract nationality
    observations['nationality'] = observations['text'].apply(
        lambda x: extract_entities.nationality_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.PHONE_REGEX))
    observations['birthdate'] = observations['text'].apply(
        lambda x: lib.birthdate_match(x, extract_entities.BIRTHDATE_REGEX))
    observations['unit_postcode'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.UNIT_POSTCODE_REGEX))
    observations['url'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.URL_REGEX))

    # Extract skills
    observations = extract_entities.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp

def load(mapper, large_model):
    """
    - Save mapper to pkl file
    - Save large model to h5py file

    :param mapper: Mapper, to translate pandas dataframe to usable numpy matrix
    :type mapper: DataFrameMapper
    :param large_model: A trained keras model
    :type large_model: keras.Model
    :return:
    """
    logging.info('Begin load')

    # Save mapper to file
    cPickle.dump(mapper, open('../data/output/mapper.pkl', 'w+'))

    # Save model to file
    large_model.save('../data/output/large_model.h5py')

    # Archive & return
    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass

def model(observations, mapper):
    logging.info('Begin model')

    cat_vars = ['air_store_id']
    cont_vars = ['reserve_visitors']
    date_vars = ['visit_datetime', 'reserve_datetime']
    response_var = 'visitors'

    Xs, y, x_inputs, input_nub, output_nub = df_prep.create_model_layers(
        observations, mapper, cat_vars, cont_vars, date_vars, response_var)

    # Create model
    x = input_nub
    preds = output_nub(x)

    regression_model = Model(x_inputs, preds)
    opt = optimizers.Adam()
    regression_model.compile(loss=lib.root_mean_squared_log_error, optimizer=opt)

    regression_model.fit(Xs, y, batch_size=2**12, validation_split=.2)

    regression_model.save('../data/models/regression.h5')

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    pass

def extract():
    # Extract appropriate model
    char_model = load_model(filepath=lib.get_conf('generate_model_path'))

    # Extract posts to be completed
    observations = pandas.read_csv(lib.get_conf('post_seed_path'))

    logging.info('End extract')
    lib.archive_dataset_schemas('generate_extract', locals(), globals())
    return char_model, observations

def model(observations):
    logging.info('Begin model')

    mapper = None
    transformation_pipeline = None
    trained_model = None

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return observations, transformation_pipeline, trained_model

def extract():
    # TODO Extract

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'), lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return observations

def extract():
    logging.info('Begin extract')

    reservations = pandas.read_csv('../data/input/air_reserve.csv')
    visits = pandas.read_csv('../data/input/air_visit_data.csv')

    observations = pandas.merge(reservations, visits)
    observations = observations.sample(frac=1.0, replace=False)
    observations = observations.head(100000)

    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations

def extract(): """ Extract necessary data / resources from upstream. This method will: - Validate that newsgroup data set is available, and read in - Validate that text embeddings are available, and read in - Validate that text to embedding index lookup is available, and read in :return: observations, embedding_matrix, word_to_index :rtype: (pandas.DataFrame, numpy.array, dict) """ logging.info('Begin extract') logging.info('Performing extract for batch: {}, from newgroup_path: {}' .format(lib.get_batch_name(), lib.get_conf('newsgroup_path'))) # Download resources # Confirm newsgroup data set is downloaded resources.download_newsgroup() # Confirm that embedding is downloaded resources.download_embedding() # Extract resources from file system # Newsgroup20: Get list of all candidate documents glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*') logging.info('Searching for glob_pattern: {}'.format(glob_pattern)) document_candidates = glob.glob(glob_pattern) # Newsgroup20: Create observations data set observations = pandas.DataFrame(document_candidates, columns=['document_path']) logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape)) # Newsgroup20: Re-order rows observations = observations.sample(frac=1) # Newsgroup20: Subset number of observations, if it's a test run if lib.get_conf('test_run'): logging.info('Reducing file size for test run') observations = observations.head(100) logging.info('Test run number of records: {}'.format(len(observations.index))) # Embedding: Load embedding embedding_matrix, word_to_index = resources.create_embedding_matrix() logging.info('word_to_index max index: {}'.format(max(word_to_index.values()))) # Archive schema and return lib.archive_dataset_schemas('extract', locals(), globals()) logging.info('End extract') return observations, embedding_matrix, word_to_index
def transform(observations, label_encoder):
    logging.info('Begin transform')

    # Feature engineering
    observations.columns = map(lambda x: '_'.join(x.lower().split()), observations.columns)
    observations['lat'] = observations['location_1'].apply(lambda x: eval(x)[0])
    observations['long'] = observations['location_1'].apply(lambda x: eval(x)[1])

    # TODO Feature engineering
    observations['is_manhattan'] = observations['borough'] == 'MANHATTAN'
    observations['is_ny_police'] = observations['jurisdiction'] == 'N.Y. POLICE DEPT'

    # observations['occurence_epoch'] = pandas.to_datetime(observations['occurrence_datetime'], format='%m/%d/%y %I:%M:%S %p')
    # print observations['occurence_epoch'][0], type(observations['occurence_epoch'][0])
    # observations['compstat_date'] = observations['compstat_year'].astype(str) + '-' + observations['compstat_month'].astype(str) + '-' + \
    #                                 observations['compstat_day'].astype(str)
    #

    # Dummy out response variable
    if label_encoder is None:
        label_encoder = lib.create_label_encoder(observations['offense'])

    observations['response'] = observations['offense'].apply(lambda x: label_encoder[x])
    observations['is_grand_larceny'] = observations['offense'].apply(lambda x: x == 'GRAND LARCENY')
    logging.info('is_grand_larceny value counts: {}'.format(observations['is_grand_larceny'].value_counts()))

    lib.archive_dataset_schemas('transform', locals(), globals())

    regressors = ['occurrence_day', 'occurrence_year', 'compstat_month', 'compstat_day', 'compstat_year',
                  'lat', 'long']
    response_var = 'response'

    # TODO Normalization should always be based on training set, not just set at hand
    for regressor in regressors:
        max_value = observations[regressor].max()
        min_value = observations[regressor].min()
        observations[regressor] = (observations[regressor] - min_value) / (max_value - min_value)

    regressors.extend(['is_manhattan', 'is_ny_police'])
    X = observations[regressors].as_matrix().astype(numpy.float32)
    y = numpy.array(observations[response_var].tolist()).astype(numpy.float32)

    logging.info('End transform')
    return observations, X, y, label_encoder

def load(char_model, observations, generated_posts):
    logging.info('Begin load')

    # Export observations
    observations.to_csv(path_or_buf=lib.get_conf('generated_observations_path'), index=False)

    # Export generated posts
    generated_posts.to_csv(path_or_buf=lib.get_conf('generated_posts_path'), index=False)

    logging.info('End load')
    lib.archive_dataset_schemas('generate_load', locals(), globals())
    pass

def extract(): """ - Extract data from CSV :return: """ logging.info('Begin extract') # Read files from CSV observations = pandas.read_csv('../data/input/titanic.csv') # Archive & return lib.archive_dataset_schemas('extract', locals(), globals()) logging.info('End extract') return observations
def model(x_train, x_test, y_train, y_test):
    """
    - Train multiple models, return a trained model

    :param x_train:
    :param x_test:
    :param y_train:
    :param y_test:
    :return:
    """
    logging.info('Begin model')

    # Baseline model
    baseline_model = models.baseline()
    baseline_model.fit(x_train, y_train, epochs=20, validation_split=.3,
                       callbacks=[TensorBoard(log_dir=os.path.expanduser('~/.logs/baseline'))])

    # Small model
    intermediate_model = models.small()
    intermediate_model.fit(x_train, y_train, epochs=20, validation_split=.3,
                           callbacks=[TensorBoard(log_dir=os.path.expanduser('~/.logs/small'))])

    # Large model
    large_model = models.large()
    large_model.fit(x_train, y_train, epochs=20, validation_split=.3,
                    callbacks=[TensorBoard(log_dir=os.path.expanduser('~/.logs/large'))])

    # Archive & return
    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return large_model

def transform(observations): """ - Convert Sex to boolean male indicator - Create train / test split - Create SKLearn-Pandas mapper - Train SKLearn - Transform train and test data :param observations: :type observations: pandas.DataFrame :return: """ logging.info('Begin transform') # Convert Sex field into boolean male indicator observations['male'] = observations['Sex'] == 'male' logging.info('Converted Sex to binary class. Value counts: {}'.format( observations['male'].value_counts())) # Split into train / test split mask = numpy.random.rand(len(observations)) < 0.8 observations_train = observations[mask] observations_test = observations[~mask] logging.info('Creating dataframe mapper') mapper = DataFrameMapper([(['Age'], [Imputer(), StandardScaler()]), (['SibSp'], [Imputer(), StandardScaler()]), (['Parch'], [Imputer(), StandardScaler()]), (['male'], [Imputer(strategy='most_frequent')])]) logging.info('Fitting and transforming training data set') x_train = mapper.fit_transform(observations_train) y_train = observations_train['Survived'].values logging.info('Transforming response data set') x_test = mapper.transform(observations_test) y_test = observations_test['Survived'].values # Archive & return lib.archive_dataset_schemas('transform', locals(), globals()) logging.info('End transform') return x_train, x_test, y_train, y_test, mapper
def model(observations, X, y, label_encoder):
    logging.info('Beginning model')

    # Data split, formatting
    dummy_X = observations[['lat', 'long']].as_matrix()
    dummy_y = observations['is_grand_larceny']

    # ZeroR Model
    dummy_clf = DummyClassifier(strategy='constant', constant=1)
    dummy_clf.fit(dummy_X, dummy_y)
    print('Dummy model accuracy: {}'.format(dummy_clf.score(dummy_X, dummy_y)))

    # Keras model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_test_mask = numpy.random.random(size=len(observations.index))
    num_train = sum(train_test_mask < .8)
    num_validate = sum(train_test_mask >= .8)
    logging.info('Proceeding w/ {} train observations, and {} test observations'.format(num_train, num_validate))

    ff_model = models.gen_stupid_ff_network(X.shape[1], y.shape[1])
    ff_model.fit(X_train, y_train, batch_size=1024, epochs=4, validation_data=(X_test, y_test))

    # Add predictions to data set
    preds = ff_model.predict(X)
    ff_model.metrics, ff_model.metrics_names
    observations['max_probability'] = map(max, preds)
    observations['prediction_index'] = map(lambda x: numpy.argmax(x), preds)
    observations['modeling_prediction'] = map(lambda x: lib.prop_to_label(x, label_encoder), preds)

    trained_model = ff_model

    logging.info('End model')
    lib.archive_dataset_schemas('model', locals(), globals())
    return observations, X, y, label_encoder, trained_model

def load(train, test, transformation_pipeline, trained_model):
    """
    Load all assets for downstream use

    :param train:
    :param test:
    :param transformation_pipeline:
    :param trained_model:
    :return:
    """
    logging.info('Begin load')

    # Serialize train
    train_path = os.path.join(lib.get_batch_output_folder(), 'train.csv')
    logging.info('Saving train to path: {}'.format(train_path))
    train.to_csv(train_path, index=False)

    # Serialize test
    test_path = os.path.join(lib.get_batch_output_folder(), 'test.csv')
    logging.info('Saving test to path: {}'.format(test_path))
    test.to_csv(test_path, index=False)

    # Serialize transformation_pipeline
    if transformation_pipeline is not None:
        transformation_pipeline_path = os.path.join(lib.get_batch_output_folder(), 'transformation_pipeline.pkl')
        logging.info('Saving transformation_pipeline to path: {}'.format(transformation_pipeline_path))
        pickle.dump(transformation_pipeline, open(transformation_pipeline_path, 'wb'))

    # Serialize trained_model
    if trained_model is not None:
        trained_model_path = os.path.join(lib.get_batch_output_folder(), 'trained_model.pkl')
        logging.info('Saving trained_model to path: {}'.format(trained_model_path))
        pickle.dump(trained_model, open(trained_model_path, 'wb'))

    # Capture model results
    print(trained_model.cv_results_)

    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass

def model(train, test):
    """
    Create a pipeline and train a grid searched model

    :param train:
    :param test:
    :return:
    """
    logging.info('Begin model')

    mapper = DataFrameMapper([
        ('honorific', [CountVectorizer(vocabulary=lib.HONORIFIC_VOCABULARY)]),
        (['pclass'], [Imputer(), StandardScaler()]),
        (['male'], [Imputer(), StandardScaler()]),
        (['siblings_spouses_aboard'], [Imputer(), StandardScaler()]),
        (['parents_children_aboard'], [Imputer(), StandardScaler()]),
        (['fare'], [Imputer(), StandardScaler()]),
    ])

    transformation_pipeline = Pipeline([('featureizer', mapper), ('svc', SVC())])

    param_grid = {
        'svc__gamma': numpy.logspace(-9, 3, 1),
        'svc__C': numpy.logspace(-2, 10, 1),
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__degree': range(2, 8)
    }

    trained_model = GridSearchCV(transformation_pipeline, param_grid=param_grid, scoring='accuracy',
                                 cv=2, n_jobs=-1)

    logging.info('Training model')
    trained_model.fit(train.copy(), y=train['survived'])

    # Set prediction
    for data_set in [train, test]:
        data_set['pred'] = trained_model.predict(data_set)

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return train, test, transformation_pipeline, trained_model

def transform(observations): """ Perform light feature transformation, ahead of feature transformation pipeline :param observations: :return: """ logging.info('Begin transform') # Convert the gender column to a male or not column observations['male'] = observations['sex'] == 'male' # Get the honorific (e.g. `Mr.` from `,Mr. Henry Jr Sutehall`) observations['honorific'] = observations['name'].apply(lambda x: str(x).split()[0]) train, test = train_test_split(observations, test_size=0.2) lib.archive_dataset_schemas('transform', locals(), globals()) logging.info('End transform') return train, test
def load(observations, vectorizer, nb, test):
    logging.info('Begin load')

    logging.info('Writing observations to CSV')
    observations.to_csv(os.path.join(lib.get_batch_output_folder(), 'observations.csv'))

    logging.info('Writing test observations to CSV')
    test.to_csv(os.path.join(lib.get_batch_output_folder(), 'test.csv'))

    logging.info('Writing vectorizer to file')
    cPickle.dump(vectorizer, open(os.path.join(lib.get_batch_output_folder(), 'vectorizer.pkl'), 'w+'))

    logging.info('Writing model to file')
    cPickle.dump(nb, open(os.path.join(lib.get_batch_output_folder(), 'model.pkl'), 'w+'))

    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass

def extract(): """ Extract the data set from upstream :return: """ logging.info('Begin extract') # Load the data set observations = lib.load_titanic() # Subset observation for speedier test iterations if lib.get_conf('test_run'): logging.warn('test_run is set to True. Subsetting to a much smaller data set for testing purposes.') observations = observations.sample(100) observations = observations.reset_index() lib.archive_dataset_schemas('extract', locals(), globals()) logging.info('End extract') return observations
def extract():
    # TODO Docstring
    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'), lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations

def transform(observations):
    logging.info('Begin transform')

    cat_vars = ['air_store_id']
    cont_vars = ['reserve_visitors', 'visitors']
    date_vars = ['visit_datetime', 'reserve_datetime']

    # Convert datetime vars
    for date_var in date_vars:
        logging.info('Converting date_var: {}'.format(date_var))
        observations[date_var] = pandas.to_datetime(observations[date_var], format='%Y-%m-%d %H:%M:%S')

    mapper = df_prep.create_mapper(observations, cat_vars=cat_vars, cont_vars=cont_vars, date_vars=date_vars)

    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, mapper

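# Likewise, a hypothetical driver for the restaurant-visitor pipeline (the air_reserve /
# air_visit extract, this transform, and the Keras regression model defined above). The
# chaining is an assumption sketched for illustration, not part of the original module.
def run_restaurant_pipeline():
    observations = extract()                         # merge reservations and visits, sample 100k rows
    observations, mapper = transform(observations)   # datetime conversion + DataFrameMapper creation
    model(observations, mapper)                      # build, fit, and save the Keras regression model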