def copy_plugin_to_dss_folder(plugin_id, folder_id, project_key, force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """
    root_path = dataiku.get_custom_variables(project_key=project_key)['dip.home']
    # TODO change this to plugins/installed/...
    plugin_lib_path = os.path.join(root_path, 'plugins', 'installed', plugin_id, 'python-lib')
    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')
    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)
    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)
        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')
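# Illustrative call (an assumption, not from the original source): the plugin id,
# managed-folder id and project key below are placeholders for whatever exists
# in your DSS instance.
# copy_plugin_to_dss_folder('my-plugin', 'a1B2c3D4', 'MY_PROJECT', force_copy=True)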
def write_outputs(result_dataset, df_values, metadata_dataset, df_metadata,
                  geometry_dataset, df_geometry_result, log_api_dataset, df_api_log,
                  P_ACTIVATE_BACKUP, backup_basename, P_OPTION_DATA_AS_TRANSACTIONS, date):
    # UGLY Temporary
    if P_ACTIVATE_BACKUP is True:
        backup_path = dataiku.get_custom_variables()["dip.home"] + '/tmp/'
        filename = 'dataiku_plugin_esri_' + backup_basename + '_data_backup_' + date + '.csv'
        f = backup_path + filename
        print('Exporting backup of your data with (key,value) format: %s' % P_OPTION_DATA_AS_TRANSACTIONS)
        df_values.to_csv(f, sep='|', index=False)
        print('Backup stored into: %s' % f)

    result_dataset.write_with_schema(df_values)

    if metadata_dataset is not None and df_metadata.shape[0] > 0:
        print("Writing metadata: %s" % df_metadata)
        df_metadata = df_metadata.reset_index(drop=True)
        df_metadata = df_metadata.drop_duplicates(keep='last')
        metadata_dataset.write_with_schema(df_metadata)

    if geometry_dataset is not None:
        geometry_dataset.write_with_schema(df_geometry_result)

    if log_api_dataset is not None:
        log_api_dataset.write_with_schema(df_api_log)
def save_model_diagnostics(table_name, model_diagnostics):
    """
    Save all model diagnostics in a dataframe and write it to a dataset

    Parameters
    ----------
    table_name : name of the output dataset
    model_diagnostics : dictionary into which diagnostics were inserted during training
    """
    diagnostics_df = pd.DataFrame(model_diagnostics).transpose()
    diagnostics_df['model_training_date'] = datetime.today().strftime('%Y-%m-%d %H:%M')
    diagnostics_df['calculation_date'] = dataiku.get_custom_variables()['calculation_date']
    diagnostics_df['model'] = diagnostics_df.index
    cols = [
        'model', 'calculation_date', 'model_training_date',
        'train_actuals_start_date', 'train_actuals_end_date',
        'training_time_range_start_date', 'train_time_range_end_date',
        'training_samples_available', 'total_train_percentage_target_class',
        'total_train_samples_target_class', 'total_train_samples_used',
        'oob_score', 'model_parameter', 'feature_importance'
    ]
    diagnostics_df = diagnostics_df[cols]
    train_model_diagnostics = dataiku.Dataset(table_name)
    train_model_diagnostics.write_with_schema(diagnostics_df)
def save_model_diagnostics(current_tm_list, target_month_list, model_diagnostics,
                           X_train_dict, rfc_models, output_table_name):
    for i, current_tm in enumerate(current_tm_list):
        for k, target_month in enumerate(target_month_list):
            if k >= i:
                feature_importance_dict = dict(
                    zip(
                        X_train_dict['tm_{}_target_{}'.format(current_tm, target_month)].columns,
                        rfc_models['tm_{}_target_{}'.format(current_tm, target_month)].feature_importances_))
                model_diagnostics['tm_{}_target_{}'.format(
                    current_tm, target_month)]['feature_importance'] = feature_importance_dict

    diagnostics_df = pd.DataFrame(model_diagnostics).transpose()
    diagnostics_df['model_training_date'] = dt.today().strftime('%Y-%m-%d %H:%M')
    diagnostics_df['model'] = diagnostics_df.index
    diagnostics_df['training_calculation_date'] = dataiku.get_custom_variables()['training_calculation_date']
    cols = [
        'model', 'model_training_date', 'training_calculation_date',
        'train_start_date', 'train_end_date', 'training_samples_available',
        'total_train_percentage_target_class', 'total_train_samples_target_class',
        'total_train_samples_used', 'oob_score', 'model_parameter',
        'feature_importance'
    ]
    diagnostics_df = diagnostics_df[cols]
    train_model_diagnostics = dataiku.Dataset(output_table_name)
    train_model_diagnostics.write_with_schema(diagnostics_df)
def __get_logs_path(s=None):
    custom_variables = dataiku.get_custom_variables()
    logs_dir = osp.abspath(
        osp.join(custom_variables['dip.home'], "analysis-data",
                 custom_variables['projectKey'], custom_variables['analysisId'],
                 custom_variables['taskId'], "sessions",
                 s if s else custom_variables['sessionId'],
                 'pp1', 'm1', 'tensorboard_logs'))
    return logs_dir
def create_target(row):
    revenue = row['revenue']
    v = int(dataiku.get_custom_variables()["revenue_value"])
    if revenue >= v:
        target = 1
    elif revenue < v:
        target = 0
    else:
        # neither comparison holds only when revenue is NaN; keep the value as-is
        target = revenue
    return target
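# Illustrative usage (an assumption, not from the original file): create_target is applied
# row-wise to a pandas DataFrame with a 'revenue' column, yielding 1 at or above the
# "revenue_value" project variable and 0 below it. The sample values are placeholders.
import pandas as pd
df_example = pd.DataFrame({'revenue': [50.0, 500.0]})
df_example['target'] = df_example.apply(create_target, axis=1)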
def get_regr_models(df_out, current_tm_list):
    # note: regr_max_date is computed but not used further in this function
    regr_max_date = (parser.parse(
        dataiku.get_custom_variables()['training_calculation_date']) +
        relativedelta(months=-11)).strftime('%Y-%m-%d')
    regr_df = df_out[df_out['access_start_date'] <=
                     dataiku.get_custom_variables()['training_calculation_date']]
    X_regr_dict = {}
    Y_regr_dict = {}
    for i in current_tm_list:
        current_tm_df = regr_df[regr_df['tenure_month'] == i - 1]
        Y_regr_dict['tm_{}'.format(i)] = current_tm_df['tenure_length_capped']
        X_regr_dict['tm_{}'.format(i)] = current_tm_df.copy()
        # class_labels is expected to be defined at module level
        X_regr_dict['tm_{}'.format(i)] = X_regr_dict['tm_{}'.format(i)].drop(class_labels, axis=1)
        X_regr_dict['tm_{}'.format(i)] = X_regr_dict['tm_{}'.format(i)].drop(
            ['access_start_date', 'tenure_length_capped', 'is_churn'], axis=1)

    xgb_model_regr_dict = {}
    xgb_model_regr_labels = X_regr_dict['tm_3'].columns.values
    y_labels_dict = Y_regr_dict.copy()
    for i, current_tm in enumerate(X_regr_dict.keys()):
        start = dt.now()
        print("training xgb regressor for users in current month {}".format(current_tm))
        clf = xgb.XGBRegressor(
            objective='reg:squarederror',
            learning_rate=0.1,
            max_depth=7,
            gamma=0,
            reg_lambda=1,
            n_estimators=25,
        )
        clf = clf.fit(X_regr_dict[current_tm], y_labels_dict[current_tm])
        xgb_model_regr_dict[current_tm] = clf
        print('training took: {}'.format(dt.now() - start))
    return xgb_model_regr_dict, xgb_model_regr_labels
def calculate_average_docomo_tenure_length(df_docomo, synchronization_time_days):
    one_year_ago = parser.parse(
        dataiku.get_custom_variables()['training_calculation_date']) + relativedelta(
            months=-12, days=-synchronization_time_days)
    two_years_ago = parser.parse(
        dataiku.get_custom_variables()['training_calculation_date']) + relativedelta(
            months=-24, days=-synchronization_time_days)
    df_docomo_time_filter_max = df_docomo['access_start_date'] < one_year_ago
    df_docomo_time_filter_min = df_docomo['access_start_date'] > two_years_ago
    df_docomo_filtered = df_docomo[df_docomo_time_filter_max & df_docomo_time_filter_min].copy()
    if df_docomo_filtered.size > 0:
        docomo_sleeping_babies_average_tenure_length = df_docomo_filtered['tenure_length_capped'].mean()
    else:
        docomo_sleeping_babies_average_tenure_length = 12.0
    return docomo_sleeping_babies_average_tenure_length
def add_prediction_identifiers(df_rfc_results_dict, test_dict):
    identifier_columns = ['cust_account_id', 'cust_territory', 'cust_country', 'access_start_date']
    prediction_columns = [
        'tenure_months_completed', 'current_tenure_month', 'prediction',
        'pred_proba_target_is_3M', 'pred_proba_target_is_4M', 'pred_proba_target_is_5M',
        'pred_proba_target_is_6M', 'pred_proba_target_is_7M', 'pred_proba_target_is_8M',
        'pred_proba_target_is_9M', 'pred_proba_target_is_10M', 'pred_proba_target_is_11M',
        'pred_proba_target_is_12M_plus'
    ]
    for current_month in df_rfc_results_dict:
        df_rfc_results_dict[current_month] = df_rfc_results_dict[current_month].join(
            test_dict[current_month])[identifier_columns + prediction_columns]
        prediction_date = dataiku.get_custom_variables()['prediction_date']
        df_rfc_results_dict[current_month]['prediction_date'] = dt.strptime(prediction_date, '%Y-%m-%d')
        df_rfc_results_dict[current_month]['inserted_at'] = dt.today()
    return df_rfc_results_dict
def get_soup(link, headerName=None, params=None, verify=None):
    """
    The headers have to be defined as a custom variable at the project level.
    """
    if headerName:
        headers = json.loads(dk.get_custom_variables()[headerName])
    else:
        headers = None
    if not verify:
        r = rq.get(link, headers=headers, params=params)
    else:
        r = rq.get(link, headers=headers, params=params, verify=verify)
    soup = Soup(r.text, 'html.parser')
    return soup
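# Illustrative usage (an assumption, not from the original file): 'scraper_headers'
# stands in for a project variable holding a JSON dict of HTTP headers,
# e.g. {"User-Agent": "..."}; the URL is a placeholder.
# page_soup = get_soup('https://example.com', headerName='scraper_headers')
# print(page_soup.title)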
def add_pred_date_and_inserted_at_to_pred_df(df_ect_12m):
    """
    Add the prediction date and the insertion timestamp to the prediction DataFrame (in place)

    Parameters
    ----------
    df_ect_12m : DataFrame with predictions joined with customer information
    """
    prediction_date = dataiku.get_custom_variables()['calculation_date']
    df_ect_12m['prediction_date'] = parse(prediction_date)
    df_ect_12m['inserted_at'] = dt.datetime.today()
def run(self, progress_callback):
    """
    Do stuff here. Can return a string or raise an exception.
    The progress_callback is a function expecting 1 value: current progress
    """
    params = get_params(self.config, self.client, self.project)
    root_path = dataiku.get_custom_variables(project_key=self.project_key)['dip.home']
    copy_plugin_to_dss_folder(self.plugin_id, root_path,
                              params.get("model_folder_id"), self.project_key,
                              force_copy=True)
    create_api_code_env(self.plugin_id, root_path, self.client,
                        params.get('code_env_name'), params.get('use_gpu'))
    api_service = get_api_service(params, self.project)
    endpoint_settings = get_model_endpoint_settings(params)
    create_python_endpoint(api_service, endpoint_settings)
    html_str = get_html_result(params)
    return html_str
def customize_tb_page(tb_page):
    custom_variables = dataiku.get_custom_variables()
    keep_alive_script = "window.parent.angular && setInterval(()=>{window.parent.angular.element(window.parent.document.body).injector(\"dataiku\").get(\"Notification\").publishToBackend(\"timeoutable-task-keepalive\",{\"projectKey\":\"" + \
        custom_variables["projectKey"] + "\", \"taskId\":\"" + custom_variables["webappId"] + "\"});},1000*30)"
    new_tb_page = tb_page \
        .replace('#f57c00', '#2aaf5d') \
        .replace('#2aaf5d', '#55707D') \
        .replace('#ff7043', '#2aaf5d') \
        .replace('#ff9800', '#2aaf5d').replace('#FFB74D', '#2aaf5d')
    new_tb_page = add_to_page(new_tb_page, '<script>' + str(keep_alive_script) + '</script>')
    new_tb_page = add_to_page(
        new_tb_page,
        "<style>.sidebar.tf-scalar-dashboard {min-width: 200px;}"
        ".tf-runs-selector-0{display: none;}"
        ".tf-tensorboard-0 #toolbar.tf-tensorboard {background-color: #5f7d8c}</style>")
    return new_tb_page
def get_docomo_predictions(current_tm_list, rfc_models, df_docomo_test):
    docomo_avg_per_current_tm_dict = {}
    for i in current_tm_list:
        if 'docomo_tm_{}'.format(i) in rfc_models.keys():
            docomo_avg_per_current_tm_dict[i - 1] = rfc_models['docomo_tm_{}'.format(i)]
        else:
            # fall back to a default average tenure length when no value was stored for this month
            docomo_avg_per_current_tm_dict[i - 1] = 12.0

    df_docomo_w_prediction = df_docomo_test.copy()
    df_docomo_w_prediction['prediction'] = df_docomo_w_prediction['tenure_month'].map(
        docomo_avg_per_current_tm_dict)

    identifier_columns = ['cust_account_id', 'cust_territory', 'cust_country', 'access_start_date']
    prediction_columns = ['tenure_months_completed', 'current_tenure_month', 'prediction']

    prediction_date = dataiku.get_custom_variables()['prediction_date']
    df_docomo_w_prediction['tenure_months_completed'] = df_docomo_w_prediction['tenure_month']
    df_docomo_w_prediction['current_tenure_month'] = df_docomo_w_prediction['tenure_month'] + 1
    df_docomo_w_prediction = df_docomo_w_prediction[identifier_columns + prediction_columns]
    df_docomo_w_prediction['prediction_date'] = prediction_date
    df_docomo_w_prediction['inserted_at'] = dt.today().strftime('%Y-%m-%d %H:%M')
    return df_docomo_w_prediction
# coding: utf-8

# In[1]:

import dataiku
import json

# print(json.dumps(job, indent=4))
client = dataiku.api_client()
current_project = client.get_project(dataiku.get_custom_variables()["projectKey"])
variables = current_project.get_variables()
general_settings = client.get_general_settings()

# In[2]:

all_datasets = current_project.list_datasets()

# In[6]:

for dataset_list_item in all_datasets:
    dataset = current_project.get_dataset(dataset_list_item["name"])
    dataset_settings = dataset.get_settings()
    if dataset_settings.get_raw()['type'] == 'Snowflake':
        print('===')
        print('original %s schema' % dataset_list_item["name"])
        print(json.dumps(dataset_settings.schema_columns, indent=2))
        for index in range(len(dataset_list_item['schema']['columns'])):
            if dataset_settings.schema_columns[index]['type'] == 'bigint':
                dataset_settings.schema_columns[index]['type'] = 'int'
        dataset_settings.save()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE_MAGIC_CELL
# Automatically replaced inline charts by "no-op" charts
# %pylab inline
import matplotlib
matplotlib.use("Agg")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import requests
import json

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
api_key = dataiku.get_custom_variables(project_key='PORTFOLIOANALYSIS').get('api_alpha')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
url = 'https://www.alphavantage.co/query'


def get_daily_ts(symbol):
    data = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",
        "symbol": symbol,
        "apikey": api_key,
        "outputsize": "compact"
    }
    r = requests.get(url, params=data)
import dataiku
import pandas as pd
import json
from dataiku.scenario import Scenario

SCENARIO_ID = 'RECURSIVEBUILDOFTIMESERIES'
PROJECT_KEY = dataiku.get_custom_variables()["projectKey"]

client = dataiku.api_client()
p = client.get_project(PROJECT_KEY)


@app.route('/update/<path:params>')
def update(params):
    params = json.loads(params)
    variables = p.get_variables()
    variables["standard"] = params
    p.set_variables(variables)
    sc = p.get_scenario(SCENARIO_ID)
    try:
        sc.run_and_wait()
        jobSucceed = True
    except:
        jobSucceed = False
    return json.dumps({"status": str(jobSucceed)})
# coding: utf-8

# In[1]:

import dataiku
import pandas as pd
# import xlsxwriter

client = dataiku.api_client()
project = client.get_project(dataiku.get_custom_variables()["projectKey"])
datasets = project.list_datasets()

# tag_name = 'sql_dataset'
result_dict = {'dataset': [], 'tags': []}
for index in range(len(datasets)):
    if datasets[index]['tags']:
        # if tag_name in datasets[index]['tags']:
        #     result_dict = {'dataset': [], 'tag': []}
        result_dict['dataset'].append(datasets[index]['name'])
        result_dict['tags'].append(datasets[index]['tags'])

df = pd.DataFrame(data=result_dict)
# df.to_excel('output1.xlsx', engine='xlsxwriter')
df.to_excel('output1.xlsx')

# print("dataset '{}' is tagged with '{}'".format(datasets[index]['name'], tag_name))
# if datasets[index]['tags'] == 'sql_dataset':
# if tag == 'sql_dataset':
#     print(datasets[index]['tags'])

# In[ ]:
def get_x_and_y_train(current_tm_list, target_month_list, df_features_train,
                      synchronization_time_days, model_diagnostics):
    # Create empty dict for X and Y train sets per model
    X_train_dict = {}
    Y_train_dict = {}

    # Bring in the training calculation dates from the DSS project variables
    train_date = dataiku.get_custom_variables()['training_calculation_date']
    training_start_date = dataiku.get_custom_variables()['training_start_date']

    # For each current tenure month
    for i, current_tm in enumerate(current_tm_list):
        # For each target month per current tenure month
        for k, target_month in enumerate(target_month_list):
            # As long as the target month is greater or equal to the current month
            if k >= i:
                target_month_num = 2 + k
                print('building training set for current tm {} and target month {}'
                      .format(current_tm, target_month))
                current_tm_df = df_features_train[
                    df_features_train['tenure_month'] == current_tm - 1]

                # create end and start period for training
                training_end_date = parser.parse(train_date) + relativedelta(
                    months=-target_month_num, days=-synchronization_time_days)

                # convert datetime to string
                training_start_date_str = training_start_date
                training_end_date_str = training_end_date.strftime('%Y-%m-%d')

                # Create mask filters based on the calculation date and apply using the access start date field
                start_mask = current_tm_df['access_start_date'] >= training_start_date_str
                end_mask = current_tm_df['access_start_date'] <= training_end_date_str
                current_tm_df = current_tm_df.loc[start_mask & end_mask]

                # Define the class labels to drop
                print("initial train sample size : " + str(current_tm_df.shape[0]))
                class_labels_to_drop = list(target_month_list)
                class_labels_to_drop.remove(target_month)
                print("class_labels_to_drop: {}".format(class_labels_to_drop))

                # Downsample to have a balanced split of the target
                class_weight = current_tm_df[target_month].sum() / 1.0 / current_tm_df.shape[0]
                print("class weight: " + str(class_weight))
                sample_size = current_tm_df[target_month].sum()
                df_features_train_sampled = current_tm_df.groupby(
                    target_month, group_keys=False).apply(
                        lambda group: group.sample(sample_size, replace=True))

                # Output to console
                total_train = df_features_train_sampled.shape[0]
                percentage_target_train = df_features_train_sampled[target_month].sum() / float(total_train)
                print("downsampled train sample size : " + str(total_train))
                print("% target class : " + str(percentage_target_train))
                print('tm_{}_target_{}'.format(current_tm, target_month) +
                      ': train_samples: ' + str(df_features_train_sampled.shape[0]) +
                      '; start_time:',
                      df_features_train_sampled['access_start_date'].min().strftime('%Y-%m-%d'),
                      '; end_time:',
                      df_features_train_sampled['access_start_date'].max().strftime('%Y-%m-%d'))

                # Output to the model diagnostics dict
                model_diagnostics['tm_{}_target_{}'.format(current_tm, target_month)][
                    'training_samples_available'] = df_features_train_sampled.shape[0]
                model_diagnostics['tm_{}_target_{}'.format(current_tm, target_month)][
                    'train_start_date'] = df_features_train_sampled['access_start_date'].min().strftime('%Y-%m-%d')
                model_diagnostics['tm_{}_target_{}'.format(current_tm, target_month)][
                    'train_end_date'] = df_features_train_sampled['access_start_date'].max().strftime('%Y-%m-%d')

                # Create train X and Y
                X_train = df_features_train_sampled.copy()
                X_train = X_train.drop(class_labels_to_drop, axis=1)
                X_train = X_train.drop(target_month, axis=1)
                X_train = X_train.drop(
                    ["access_start_date", "tenure_length_capped", "is_churn"], axis=1)
                Y_train = df_features_train_sampled.loc[:, [target_month]].values.ravel()

                # Add the X and Y train sets to the dict for all models
                X_train_dict['tm_{}_target_{}'.format(current_tm, target_month)] = X_train
                Y_train_dict['tm_{}_target_{}'.format(current_tm, target_month)] = Y_train

    return X_train_dict, Y_train_dict, model_diagnostics
# import enchant
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import dataiku
import ast

prod_pronouns = dataiku.get_custom_variables(typed=True)['prod_pronouns']


def apply_extraction(row, nlp, sid, text_column, review_id, product_id):
    review_body = row[text_column]
    review_id = row[review_id]
    # review_marketplace = row['marketplace']
    # customer_id = row['customer_id']
    product_id = row[product_id]
    # product_parent = row['product_parent']
    # product_title = row['product_title']
    # product_category = row['product_category']
    # date = str(row['review_date'])
    # star_rating = row['star_rating']
    # url = add_amazonlink(product_id)

    doc = nlp(review_body)
    ner_heads = {ent.root.idx: ent for ent in doc.ents}

    ## FIRST RULE OF DEPENDENCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## RULE = M is child of A with a relationship of amod
    rule1_pairs = []
    for token in doc:
from aspect_clustering.get_word_vectors import get_word_vectors
import dataiku
from sklearn import cluster
import ast

n_clusters = dataiku.get_custom_variables(typed=True)['NUM_CLUSTERS']
# print(n_clusters)
n_clusters = ast.literal_eval(n_clusters)


def get_word_clusters(unique_aspects, nlp):
    print("Found {} unique aspects for this product".format(len(unique_aspects)))
    asp_vectors = get_word_vectors(unique_aspects, nlp)  # gets the word vector for each noun
    print("len(unique_aspects)", len(unique_aspects))
    print("n_clusters", n_clusters)
    if len(unique_aspects) < n_clusters:
        print("Too few aspects ({}) found. No clustering required...".format(len(unique_aspects)))
        return list(range(len(unique_aspects)))
    print("Running k-means clustering...")
    kmeans = cluster.KMeans(n_clusters=n_clusters)
    kmeans.fit(asp_vectors)
    labels = kmeans.labels_
    return labels
### Output gen
P_FEATURE_SELECTION_NB_FIELD_PER_OUTPUT = int(
    get_recipe_config()['param_nb_field_per_output'])  # 200
P_GEN_UNIQUE_FILE = get_recipe_config()[
    'param_output_one_file_all_states']  # True
P_GENERATE_ALL_THE_CENSUS_LEVEL = get_recipe_config()[
    'param_output_only_matching_census_level']  # False

## Re-use DL content
P_USE_PREVIOUS_SOURCES = get_recipe_config()['param_re_use_collected_census_sources']
P_DELETE_US_CENSUS_SOURCES = get_recipe_config()['param_delete_census_sources']

# ------------------------------------------- END SETTINGS

path_datadir_tmp = dataiku.get_custom_variables()["dip.home"] + '/tmp/'
FOLDER_NAME = 'tmp_census_us_' + P_CENSUS_CONTENT

input_ = get_input_names_for_role('input')[0]
output_folder = dataiku.Folder(get_output_names_for_role('censusdata')[0])
path_ = output_folder.get_path() + '/'

print('Checking if previous files exist...')
if len(os.listdir(path_)) > 0:
    for fz in os.listdir(path_):
        cmd = "rm %s" % (path_ + fz)
        print('removing: %s' % fz)
        os.system(cmd)

dict_states = common.get_state_structure(P_STATES_TYPE_NAME)
def get_train_data(df_features, synchronization_time_days, model_diagnostics,
                   class_labels, last_model_number=12):
    """
    Get train data for each model (most recent available 12 months) from all training data

    Parameters
    ----------
    df_features : DataFrame consisting of all data required for training (dummies, numeric features)
    synchronization_time_days : number of days the data may lag behind; set conservatively to 30 days
    model_diagnostics : dictionary into which diagnostics in training process will be inserted
    class_labels : list of class labels, one per model
    last_model_number : index of the last model (defaults to 12)

    Returns
    -------
    df_features_train : dictionary of training DataFrames, one per class label
    model_diagnostics : dictionary with diagnostics added for each class label
    """
    train_date = dataiku.get_custom_variables()['calculation_date']
    df_features_train = {}
    for k, i in enumerate(class_labels):
        target_month = last_model_number - k
        # create end and start period for training
        training_end_date = parser.parse(train_date) + relativedelta(
            months=-target_month, days=-synchronization_time_days)
        training_start_date = parser.parse(train_date) + relativedelta(
            years=-1, months=-target_month, days=-synchronization_time_days)
        # convert datetime to string
        training_start_date_str = training_start_date.strftime('%Y-%m-%d')
        training_end_date_str = training_end_date.strftime('%Y-%m-%d')

        start_mask = df_features['access_start_date'] >= training_start_date_str
        end_mask = df_features['access_start_date'] <= training_end_date_str
        status_unknown = (df_features['tenure_length_capped'] == target_month) & (df_features['is_churn'] == 0)
        df_features_train_sampled = df_features.loc[start_mask & end_mask & ~status_unknown]
        df_features_train[i] = df_features_train_sampled

        model_diagnostics[i]['training_samples_available'] = df_features_train_sampled.shape[0]
        model_diagnostics[i]['train_actuals_start_date'] = df_features_train_sampled[
            'access_start_date'].min().strftime('%Y-%m-%d')
        model_diagnostics[i]['train_actuals_end_date'] = df_features_train_sampled[
            'access_start_date'].max().strftime('%Y-%m-%d')
        model_diagnostics[i]['training_time_range_start_date'] = training_start_date
        model_diagnostics[i]['train_time_range_end_date'] = training_end_date

        print(i + ': train_samples: ' + str(df_features_train_sampled.shape[0]) +
              '; start_time:',
              df_features_train_sampled['access_start_date'].min().strftime('%Y-%m-%d'),
              '; end_time:',
              df_features_train_sampled['access_start_date'].max().strftime('%Y-%m-%d'))
    return df_features_train, model_diagnostics
settings_TEST = dataiku.api_client().get_project('SOME_ETL_BASICS').get_settings()
# all_datasets = current_project.list_datasets()
settings_TEST.get_raw()['exposedObjects']['objects'][0]

# In[ ]:

settings = current_project_1.get_settings()
exposed_objects = settings.get_raw()['exposedObjects']['objects']
# print(json.dumps(exposed_objects, indent=2))
if len(exposed_objects):
    print('not 0')
print('=== {} exposed objects ==='.format(dataiku.get_custom_variables()["projectKey"]))

# for index in range(len(exposed_objects)):
#     ds_nm = exposed_objects[index]['localName']
#     trg_prj = exposed_objects[index]['rules']
#     # print(json.dumps(exposed_objects[index], indent=2))
#     trg_prj_names = []
#     for index in range(len(trg_prj)):
#         trg_prj_names.append(trg_prj[index]['targetProject'])
#     print("Dataset %s is exposed in %s project(s)" % (ds_nm, ', '.join(trg_prj_names)))
#     print('\n')

# In[ ]:

exposed_objects
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
images = dataiku.Folder("HNEvJqgm")
images_info = images.get_info()
images_folder_path = images.get_path()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
pdfs_processed = dataiku.Folder("w3DdXIhY")
pdfs_processed_info = pdfs_processed.get_info()
pdfs_folder_path = pdfs_processed.get_path()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# If the project variable is set to true, all outputs are cleared and PDF processing is re-run for all input PDFs.
if dataiku.get_custom_variables()["reprocess_PDFs"].lower() == "true":
    for output_pdf_folder in os.listdir(pdfs_folder_path):
        output_pdf_folder_path = os.path.join(pdfs_folder_path, output_pdf_folder)
        for page in os.listdir(output_pdf_folder_path):
            page_path = os.path.join(output_pdf_folder_path, page)
            os.remove(page_path)
        os.rmdir(output_pdf_folder_path)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
for pdf_folder in os.listdir(images_folder_path):
    pdf_folder_path = os.path.join(images_folder_path, pdf_folder)
    PDF_pages = os.listdir(os.path.join(images_folder_path, pdf_folder))
    PDF_pages_path = [
        os.path.join(pdf_folder_path, page) for page in os.listdir(pdf_folder_path)
    ]
folder_path = dataiku.Folder("managed_ds").get_path() managed_folder_name = "managed_ds" today = dt.datetime.now().date() # In[23]: for file in dataiku.Folder(managed_folder_name).list_paths_in_partition(): # print(folder_path+file) if re.match(r"/my_file*", file): # print (file) filetime = dt.datetime.fromtimestamp(os.path.getctime(folder_path+file)) if filetime.date() == today: # print('today') current_project = dataiku.api_client().get_project(dataiku.get_custom_variables()["projectKey"]) variables = current_project.get_variables() variables['standard']['file_uploaded_today'] = 'yes' current_project.set_variables(variables) # In[26]: # from dataiku.scenario import Trigger # t = Trigger() import dataiku import os import re import datetime as dt
import os
import dataiku
import json
import pandas as pd, numpy as np
import socket
from dataikuapi.dssclient import DSSClient
from dataiku.customrecipe import *
import subprocess

input_folder = dataiku.Folder(get_input_names_for_role('folder_to_parse')[0])
project_name = dataiku.get_custom_variables()["projectKey"]
dip_home = dataiku.get_custom_variables()["dip.home"]
port_file = dip_home + '/bin/env-default.sh'


# Let's create an api_key if it does not exist
def retrieve_api_key():
    api_keys_dir = '/config/public-apikeys.json'
    with open(dip_home + api_keys_dir) as data_file:
        data_ = json.load(data_file)
    existing = False
    api_key = None
    for key in data_:
        try:
            if (key['createdBy'] == 'CLI'
                    and key['globalAdmin'] == True
                    and key['label'] == 'Added by dku command-line'):
                existing = True  # a key created by the command line already exists
                api_key = key['key']
                break
        except KeyError:
            pass
    return [existing, api_key]
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
s_ave = df.groupby([u'場所'])[u'年初からの日数'].expanding().mean().values

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df[u'年初からの日数_暦年平均'] = s_ave

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df = df.sort_values([u'場所', u'年月日_parsed'], ascending=False).reset_index(drop=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df.head()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
length = dataiku.get_custom_variables(typed=True)["window_size"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
var_to_vectorize = [u'平均気温', u'最高気温', u'最低気温', u'降水量合計', u'日照時間']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
new_df = pd.DataFrame()
for city in df[u'場所'].unique():
    print(city)
    df_sub = df.loc[df[u'場所'] == city].reset_index(drop=True)
    for var in var_to_vectorize:
        print(var)
        colname = var + u"推移"
        df_sub[colname] = np.nan
        long_vec = df_sub[var].values.tolist()