def copy_plugin_to_dss_folder(plugin_id,
                              folder_id,
                              project_key,
                              force_copy=False):
    """
    Copy python-lib from a plugin to a managed folder
    """

    root_path = dataiku.get_custom_variables(
        project_key=project_key)['dip.home']
    # TODO change this to plugins/installed/...
    plugin_lib_path = os.path.join(root_path, 'plugins', 'installed',
                                   plugin_id, 'python-lib')

    folder_path = dataiku.Folder(folder_id, project_key=project_key).get_path()
    lib_folder_path = os.path.join(folder_path, 'python-lib')

    if os.path.isdir(lib_folder_path) and force_copy:
        shutil.rmtree(lib_folder_path)

    if not os.path.isdir(lib_folder_path):
        os.mkdir(lib_folder_path)
        sys.path.append(lib_folder_path)

        for item in os.listdir(plugin_lib_path):
            s = os.path.join(plugin_lib_path, item)
            d = os.path.join(lib_folder_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, symlinks=False, ignore=None)
            else:
                shutil.copy2(s, d)
    else:
        logger.info('python-lib already exists in folder')
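A minimal usage sketch for the helper above, assuming hypothetical plugin, folder and project identifiers and a running DSS context:

# Hypothetical identifiers for illustration only.
PLUGIN_ID = 'my-plugin'
MODEL_FOLDER_ID = 'model_folder'
PROJECT_KEY = 'MYPROJECT'

# Force a fresh copy of the plugin's python-lib into the managed folder.
copy_plugin_to_dss_folder(PLUGIN_ID, MODEL_FOLDER_ID, PROJECT_KEY, force_copy=True)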
Example #2
def write_outputs(
    result_dataset, df_values,
    metadata_dataset, df_metadata,
    geometry_dataset, df_geometry_result,
    log_api_dataset, df_api_log,
    P_ACTIVATE_BACKUP, backup_basename,
    P_OPTION_DATA_AS_TRANSACTIONS, date
    ):
    # UGLY Temporary
    if P_ACTIVATE_BACKUP is True:
        backup_path = dataiku.get_custom_variables()["dip.home"] + '/tmp/'
        filename = 'dataiku_plugin_esri_' + backup_basename + '_data_backup_' + date + '.csv' 
        f = backup_path + filename
        print('Exporting backup of your data with (key,value) format: %s' % P_OPTION_DATA_AS_TRANSACTIONS)
        df_values.to_csv(f, sep='|', index=False)
        print('Backup stored into: %s ' % f)

    result_dataset.write_with_schema(df_values)

    if metadata_dataset is not None and df_metadata.shape[0] > 0:
        print("Writing metadata: %s" % df_metadata)
        df_metadata = df_metadata.reset_index()
        df_metadata = df_metadata.drop('index', axis=1)
        df_metadata = df_metadata.drop_duplicates(keep='last')
        metadata_dataset.write_with_schema(df_metadata)

    if geometry_dataset is not None:
        geometry_dataset.write_with_schema(df_geometry_result)

    if log_api_dataset is not None:
        log_api_dataset.write_with_schema(df_api_log)
Example #3
def save_model_diagnostics(table_name, model_diagnostics):
    """
    Save all model diagnostics in a dataframe

    Parameters
    ----------
    table_name : name of the Dataiku dataset to write the diagnostics to
    model_diagnostics : dictionary of diagnostics collected during training

    Returns
    -------
    None : the diagnostics DataFrame is written to the dataset via write_with_schema
    """

    diagnostics_df = pd.DataFrame(model_diagnostics).transpose()
    diagnostics_df['model_training_date'] = datetime.today().strftime(
        '%Y-%m-%d %H:%M')
    diagnostics_df['calculation_date'] = dataiku.get_custom_variables(
    )['calculation_date']

    diagnostics_df['model'] = diagnostics_df.index

    cols = [
        'model', 'calculation_date', 'model_training_date',
        'train_actuals_start_date', 'train_actuals_end_date',
        'training_time_range_start_date', 'train_time_range_end_date',
        'training_samples_available', 'total_train_percentage_target_class',
        'total_train_samples_target_class', 'total_train_samples_used',
        'oob_score', 'model_parameter', 'feature_importance'
    ]
    diagnostics_df = diagnostics_df[cols]

    train_model_diagnostics = dataiku.Dataset(table_name)
    train_model_diagnostics.write_with_schema(diagnostics_df)
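A minimal sketch of calling this writer with a single hypothetical model entry (key names taken from the cols list above); it assumes a DSS context where the 'calculation_date' project variable and a hypothetical output dataset named 'train_model_diagnostics' exist:

# Hypothetical diagnostics for one model; every key in 'cols' that is not added
# inside the function must be present.
diagnostic_keys = [
    'train_actuals_start_date', 'train_actuals_end_date',
    'training_time_range_start_date', 'train_time_range_end_date',
    'training_samples_available', 'total_train_percentage_target_class',
    'total_train_samples_target_class', 'total_train_samples_used',
    'oob_score', 'model_parameter', 'feature_importance'
]
model_diagnostics = {'tm_3': {key: None for key in diagnostic_keys}}
save_model_diagnostics('train_model_diagnostics', model_diagnostics)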
Example #4
def save_model_diagnostics(current_tm_list, target_month_list,
                           model_diagnostics, X_train_dict, rfc_models,
                           output_table_name):

    for i, current_tm in enumerate(current_tm_list):
        for k, target_month in enumerate(target_month_list):
            if k >= i:
                feature_importance_dict = dict(
                    zip(
                        X_train_dict['tm_{}_target_{}'.format(
                            current_tm, target_month)].columns,
                        rfc_models['tm_{}_target_{}'.format(
                            current_tm, target_month)].feature_importances_))
                model_diagnostics['tm_{}_target_{}'.format(
                    current_tm, target_month
                )]['feature_importance'] = feature_importance_dict

    diagnostics_df = pd.DataFrame(model_diagnostics).transpose()
    diagnostics_df['model_training_date'] = dt.today().strftime(
        '%Y-%m-%d %H:%M')
    diagnostics_df['model'] = diagnostics_df.index
    diagnostics_df['training_calculation_date'] = dataiku.get_custom_variables(
    )['training_calculation_date']

    cols = [
        'model', 'model_training_date', 'training_calculation_date',
        'train_start_date', 'train_end_date', 'training_samples_available',
        'total_train_percentage_target_class',
        'total_train_samples_target_class', 'total_train_samples_used',
        'oob_score', 'model_parameter', 'feature_importance'
    ]
    diagnostics_df = diagnostics_df[cols]

    train_model_diagnostics = dataiku.Dataset(output_table_name)
    train_model_diagnostics.write_with_schema(diagnostics_df)
Example #5
def write_outputs(result_dataset, df_values, metadata_dataset, df_metadata,
                  geometry_dataset, df_geometry_result, log_api_dataset,
                  df_api_log, P_ACTIVATE_BACKUP, backup_basename,
                  P_OPTION_DATA_AS_TRANSACTIONS, date):
    # UGLY Temporary
    if P_ACTIVATE_BACKUP is True:
        backup_path = dataiku.get_custom_variables()["dip.home"] + '/tmp/'
        filename = 'dataiku_plugin_esri_' + backup_basename + '_data_backup_' + date + '.csv'
        f = backup_path + filename
        print('Exporting backup of your data with (key,value) format: %s' %
              P_OPTION_DATA_AS_TRANSACTIONS)
        df_values.to_csv(f, sep='|', index=False)
        print('Backup stored into: %s ' % f)

    result_dataset.write_with_schema(df_values)

    if metadata_dataset is not None and df_metadata.shape[0] > 0:
        print("Writing metadata: %s" % df_metadata)
        df_metadata = df_metadata.reset_index()
        df_metadata = df_metadata.drop('index', axis=1)
        df_metadata = df_metadata.drop_duplicates(keep='last')
        metadata_dataset.write_with_schema(df_metadata)

    if geometry_dataset is not None:
        geometry_dataset.write_with_schema(df_geometry_result)

    if log_api_dataset is not None:
        log_api_dataset.write_with_schema(df_api_log)
Example #6
def __get_logs_path(s=None):
    custom_variables = dataiku.get_custom_variables()
    logs_dir = osp.abspath(
        osp.join(custom_variables['dip.home'], "analysis-data",
                 custom_variables['projectKey'],
                 custom_variables['analysisId'], custom_variables['taskId'],
                 "sessions", s if s else custom_variables['sessionId'], 'pp1',
                 'm1', 'tensorboard_logs'))
    return logs_dir
Example #7
def create_target(row):
    revenue = row['revenue']
    v = int(dataiku.get_custom_variables()["revenue_value"])
    if revenue >= v:
        target = 1
    elif revenue < v:
        target = 0
    else:
        # Non-comparable values (e.g. NaN revenue) fall through unchanged
        target = revenue
    return target
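A small sketch of applying this row-wise target builder, assuming a hypothetical DataFrame and a DSS context where the 'revenue_value' project variable is defined:

import pandas as pd

# Hypothetical input; in DSS the rows would come from a dataset.
df = pd.DataFrame({'revenue': [10.0, 250.0, float('nan')]})
df['target'] = df.apply(create_target, axis=1)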
Example #8
def get_regr_models(df_out, current_tm_list):

    regr_max_date = (parser.parse(
        dataiku.get_custom_variables()['training_calculation_date']) +
                     relativedelta(months=-11)).strftime('%Y-%m-%d')
    regr_df = df_out[df_out['access_start_date'] <= (
        dataiku.get_custom_variables()['training_calculation_date'])]

    X_regr_dict = {}
    Y_regr_dict = {}

    for i in current_tm_list:
        current_tm_df = regr_df[regr_df['tenure_month'] == i - 1]
        Y_regr_dict['tm_{}'.format(i)] = current_tm_df['tenure_length_capped']
        X_regr_dict['tm_{}'.format(i)] = current_tm_df.copy()
        X_regr_dict['tm_{}'.format(i)] = X_regr_dict['tm_{}'.format(i)].drop(
            class_labels, axis=1)
        X_regr_dict['tm_{}'.format(i)] = X_regr_dict['tm_{}'.format(i)].drop(
            ['access_start_date', 'tenure_length_capped', 'is_churn'], axis=1)

    xgb_model_regr_dict = {}
    xgb_model_regr_labels = X_regr_dict['tm_3'].columns.values
    y_labels_dict = Y_regr_dict.copy()

    for i, current_tm in enumerate(X_regr_dict.keys()):
        start = dt.now()
        print("training xgb regressor for users in current month {}".format(
            current_tm))
        clf = xgb.XGBRegressor(
            objective='reg:squarederror',
            learning_rate=0.1,
            max_depth=7,
            gamma=0,
            reg_lambda=1,
            n_estimators=25,
        )

        clf = clf.fit(X_regr_dict[current_tm], y_labels_dict[current_tm])
        xgb_model_regr_dict[current_tm] = clf
        print('training took: {}'.format(dt.now() - start))

    return xgb_model_regr_dict, xgb_model_regr_labels
Example #9
def calculate_average_docomo_tenure_length(df_docomo,
                                           synchronization_time_days):

    one_year_ago = parser.parse(dataiku.get_custom_variables(
    )['training_calculation_date']) + relativedelta(
        months=-12, days=-synchronization_time_days)
    two_years_ago = parser.parse(dataiku.get_custom_variables(
    )['training_calculation_date']) + relativedelta(
        months=-24, days=-synchronization_time_days)
    df_docomo_time_filter_max = df_docomo['access_start_date'] < one_year_ago
    df_docomo_time_filter_min = df_docomo['access_start_date'] > two_years_ago

    df_docomo_filtered = df_docomo[df_docomo_time_filter_max
                                   & df_docomo_time_filter_min].copy()
    if df_docomo_filtered.size > 0:
        docomo_sleeping_babies_average_tenure_length = df_docomo_filtered[
            'tenure_length_capped'].mean()
    else:
        docomo_sleeping_babies_average_tenure_length = 12.0

    return docomo_sleeping_babies_average_tenure_length
Example #10
def add_prediction_identifiers(df_rfc_results_dict, test_dict):

    identifier_columns = [
        'cust_account_id', 'cust_territory', 'cust_country', 'access_start_date'
    ]
    prediction_columns = [
        'tenure_months_completed', 'current_tenure_month', 'prediction',
        'pred_proba_target_is_3M', 'pred_proba_target_is_4M',
        'pred_proba_target_is_5M', 'pred_proba_target_is_6M',
        'pred_proba_target_is_7M', 'pred_proba_target_is_8M',
        'pred_proba_target_is_9M', 'pred_proba_target_is_10M',
        'pred_proba_target_is_11M', 'pred_proba_target_is_12M_plus'
    ]

    for current_month in df_rfc_results_dict:
        df_rfc_results_dict[current_month] = df_rfc_results_dict[current_month].join(
            test_dict[current_month])[identifier_columns + prediction_columns]
        prediction_date = dataiku.get_custom_variables()['prediction_date']
        df_rfc_results_dict[current_month]['prediction_date'] = dt.strptime(
            prediction_date, '%Y-%m-%d')
        df_rfc_results_dict[current_month]['inserted_at'] = dt.today()

    return df_rfc_results_dict
Example #11
def get_soup(link, headerName=None, params=None, verify=None):
    """ The headers have to be defined as a custom variable at the project level. """
    if headerName:
        headers = json.loads(dk.get_custom_variables()[headerName])
    else:
        headers = None

    if not verify:
        r = rq.get(link, headers=headers, params=params)
    else:
        r = rq.get(link, headers=headers, params=params, verify=verify)
    soup = Soup(r.text, 'html.parser')
    return soup
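A usage sketch, assuming a hypothetical project variable (here called 'scraper_headers') that stores the request headers as a JSON string:

# 'scraper_headers' is a hypothetical project variable holding e.g.
# '{"User-Agent": "my-crawler/1.0"}'.
soup = get_soup('https://example.com/page', headerName='scraper_headers')
print(soup.title)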
Example #12
def add_pred_date_and_inserted_at_to_pred_df(df_ect_12m):
    """
    Add date of prediction and date inserted into the table into the prediction DataFrame

    Parameters
    ----------
    df_ect_12m : DataFrame with predictions

    Returns
    -------
    df_ect_12m : the same DataFrame with prediction_date and inserted_at columns added
    """

    prediction_date = dataiku.get_custom_variables()['calculation_date']
    df_ect_12m['prediction_date'] = parse(prediction_date)
    df_ect_12m['inserted_at'] = dt.datetime.today()
    return df_ect_12m
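A small usage sketch, assuming a DSS context where the 'calculation_date' project variable is set and a hypothetical predictions frame:

import pandas as pd

# Hypothetical predictions frame for illustration.
df_preds = pd.DataFrame({'customer_id': [1, 2], 'prediction': [0.2, 0.8]})
df_preds = add_pred_date_and_inserted_at_to_pred_df(df_preds)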
Example #13
    def run(self, progress_callback):
        """
        Copy the plugin's python-lib into the model folder, create the API code
        environment, and register the model endpoint on the API service.
        Returns an HTML summary string or raises an exception.
        The progress_callback is a function expecting 1 value: current progress
        """

        params = get_params(self.config, self.client, self.project)
        root_path = dataiku.get_custom_variables(project_key=self.project_key)['dip.home']

        copy_plugin_to_dss_folder(self.plugin_id, root_path, params.get(
            "model_folder_id"), self.project_key, force_copy=True)
        create_api_code_env(self.plugin_id, root_path, self.client, params.get(
            'code_env_name'), params.get('use_gpu'))
        api_service = get_api_service(params, self.project)
        endpoint_settings = get_model_endpoint_settings(params)
        create_python_endpoint(api_service, endpoint_settings)
        html_str = get_html_result(params)

        return html_str
Example #14
def customize_tb_page(tb_page):
    custom_variables = dataiku.get_custom_variables()

    keep_alive_script = "window.parent.angular && setInterval(()=>{window.parent.angular.element(window.parent.document.body).injector(\"dataiku\").get(\"Notification\").publishToBackend(\"timeoutable-task-keepalive\",{\"projectKey\":\"" + \
                        custom_variables["projectKey"] + "\", \"taskId\":\"" + custom_variables[
                            "webappId"] + "\"});},1000*30)"

    new_tb_page = tb_page \
        .replace('#f57c00', '#2aaf5d') \
        .replace('#2aaf5d', '#55707D') \
        .replace('#ff7043', '#2aaf5d') \
        .replace('#ff9800', '#2aaf5d').replace('#FFB74D', '#2aaf5d')
    new_tb_page = add_to_page(
        new_tb_page, '<script>' + str(keep_alive_script) + '</script>')
    new_tb_page = add_to_page(
        new_tb_page,
        "<style>.sidebar.tf-scalar-dashboard {min-width: 200px;}.tf-runs-selector-0{display: none;}.tf-tensorboard-0 #toolbar.tf-tensorboard {background-color: #5f7d8c}</style>"
    )

    return new_tb_page
Example #15
def get_docomo_predictions(current_tm_list, rfc_models, df_docomo_test):

    docomo_avg_per_current_tm_dict = {}
    for i in current_tm_list:
        if 'docomo_tm_{}'.format(i) in rfc_models.keys():
            docomo_avg_per_current_tm_dict[i-1] = rfc_models['docomo_tm_{}'.format(i)]
        else:
            rfc_models['docomo_tm_{}'.format(i)] = 12.0

    df_docomo_w_prediction = df_docomo_test.copy()
    df_docomo_w_prediction['prediction'] = df_docomo_w_prediction['tenure_month'].map(docomo_avg_per_current_tm_dict)

    identifier_columns = ['cust_account_id', 'cust_territory', 'cust_country', 'access_start_date']
    prediction_columns = ['tenure_months_completed', 'current_tenure_month', 'prediction']
    prediction_date = dataiku.get_custom_variables()['prediction_date']

    df_docomo_w_prediction['tenure_months_completed'] = df_docomo_w_prediction['tenure_month']
    df_docomo_w_prediction['current_tenure_month'] = df_docomo_w_prediction['tenure_month']+1
    df_docomo_w_prediction = df_docomo_w_prediction[identifier_columns + prediction_columns]
    df_docomo_w_prediction['prediction_date'] = prediction_date
    df_docomo_w_prediction['inserted_at'] = dt.today().strftime('%Y-%m-%d %H:%M')

    return df_docomo_w_prediction
Example #16
# coding: utf-8

# In[1]:

import dataiku
import json
#print(json.dumps(job,indent=4))

client = dataiku.api_client()
current_project = client.get_project(
    dataiku.get_custom_variables()["projectKey"])
variables = current_project.get_variables()
general_settings = client.get_general_settings()

# In[2]:

all_datasets = current_project.list_datasets()

# In[6]:

for dataset_list_item in all_datasets:
    dataset = current_project.get_dataset(dataset_list_item["name"])
    dataset_settings = dataset.get_settings()
    if dataset_settings.get_raw()['type'] == 'Snowflake':
        print('===')
        print('original %s schema' % dataset_list_item["name"])
        print(json.dumps(dataset_settings.schema_columns, indent=2))
        for index in range(len(dataset_list_item['schema']['columns'])):
            if dataset_settings.schema_columns[index]['type'] == 'bigint':
                dataset_settings.schema_columns[index]['type'] = 'int'
                dataset_settings.save()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE_MAGIC_CELL
# Automatically replaced inline charts by "no-op" charts
# %pylab inline
import matplotlib
matplotlib.use("Agg")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import requests
import json

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
api_key = dataiku.get_custom_variables(
    project_key='PORTFOLIOANALYSIS').get('api_alpha')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
url = 'https://www.alphavantage.co/query'


def get_daily_ts(symbol):

    data = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",
        "symbol": symbol,
        "apikey": api_key,
        "outputsize": "compact"
    }

    r = requests.get(url, params=data)
import dataiku
import pandas as pd
import json
import dataiku
from dataiku.scenario import Scenario

SCENARIO_ID = 'RECURSIVEBUILDOFTIMESERIES'
PROJECT_KEY = dataiku.get_custom_variables()["projectKey"]
client = dataiku.api_client()
p = client.get_project(PROJECT_KEY)


@app.route('/update/<path:params>')
def update(params):
    params = json.loads(params)
    variables = p.get_variables()
    variables["standard"] = params
    p.set_variables(variables)

    sc = p.get_scenario(SCENARIO_ID)
    try:
        sc.run_and_wait()
        jobSucceed = True
    except:
        jobSucceed = False

    return json.dumps({"status": str(jobSucceed)})
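For a quick test outside the webapp front-end, the /update/<path:params> route can be called with URL-encoded JSON; a sketch assuming a hypothetical backend URL:

import json
import urllib.parse
import requests

# 'backend_url' is a placeholder for the webapp backend's base URL.
backend_url = 'http://localhost:8080'
payload = urllib.parse.quote(json.dumps({"my_variable": "some_value"}), safe='')
response = requests.get(backend_url + '/update/' + payload)
print(response.text)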
Example #19
# coding: utf-8

# In[1]:

import dataiku
import pandas as pd
# import xlsxwriter

client = dataiku.api_client()
project = client.get_project(dataiku.get_custom_variables()["projectKey"])
datasets = project.list_datasets()

# tag_name = 'sql_dataset'

result_dict = {'dataset': [], 'tags': []}
for index in range(len(datasets)):
    if datasets[index]['tags']:
        #         if tag_name in datasets[index]['tags']:
        #         result_dict = {'dataset':[],'tag':[]}
        result_dict['dataset'].append(datasets[index]['name'])
        result_dict['tags'].append(datasets[index]['tags'])
df = pd.DataFrame(data=result_dict)
# df.to_excel('output1.xlsx', engine='xlsxwriter')
df.to_excel('output1.xlsx')
#             print "dataset '{}' is tagged with '{}'".format(datasets[index]['name'],tag_name)
#         if datasets[index]['tags']=='sql_dataset':

#     if tag == 'sql_dataset':
#         print (datasets[index]['tags'])

# In[ ]:
Example #20
def get_x_and_y_train(current_tm_list, target_month_list, df_features_train,
                      synchronization_time_days, model_diagnostics):

    # Create empty dict for X and Y train sets per model
    X_train_dict = {}
    Y_train_dict = {}

    # Bring in the training calculation dates from the DSS project variables
    train_date = dataiku.get_custom_variables()['training_calculation_date']
    training_start_date = dataiku.get_custom_variables()['training_start_date']

    # For each current tenure month
    for i, current_tm in enumerate(current_tm_list):

        # For each target month per current tenure month
        for k, target_month in enumerate(target_month_list):
            # As long as the target month is greater or equal to the current month
            if k >= i:

                target_month_num = 2 + k
                print(
                    'building training set for current tm {} and target month {}'
                    .format(current_tm, target_month))
                current_tm_df = df_features_train[
                    df_features_train['tenure_month'] == current_tm - 1]

                # create end and start period for training
                training_end_date = parser.parse(train_date) + relativedelta(
                    months=-target_month_num, days=-synchronization_time_days)

                # convert datetime to string
                training_start_date_str = training_start_date
                training_end_date_str = training_end_date.strftime('%Y-%m-%d')

                # Create mask filters based on the calculation date and apply them using the access start date field
                start_mask = current_tm_df[
                    'access_start_date'] >= training_start_date_str
                end_mask = current_tm_df[
                    'access_start_date'] <= training_end_date_str
                current_tm_df = current_tm_df.loc[start_mask & end_mask]

                # Define the class labels to drop
                print("initial train sample size     : " +
                      str(current_tm_df.shape[0]))
                class_labels_to_drop = list(target_month_list)
                class_labels_to_drop.remove(target_month)
                print("class_labels_to_drop: {}".format(class_labels_to_drop))

                # Downsample to have a balanced split of the target
                class_weight = current_tm_df[target_month].sum(
                ) / 1.0 / current_tm_df.shape[0]
                print("class weight: " + str(class_weight))
                sample_size = current_tm_df[target_month].sum()
                df_features_train_sampled = current_tm_df.groupby(
                    target_month, group_keys=False).apply(
                        lambda group: group.sample(sample_size, replace=True))

                # Output to console
                total_train = df_features_train_sampled.shape[0]
                percentage_target_train = df_features_train_sampled[
                    target_month].sum() / float(total_train)
                print("downsampled train sample size     : " +
                      str(total_train))
                print("% target class        : " +
                      str(percentage_target_train))
                print(
                    'tm_{}_target_{}'.format(current_tm, target_month) +
                    ': train_samples: ' +
                    str(df_features_train_sampled.shape[0]) + '; start_time:',
                    df_features_train_sampled['access_start_date'].min(
                    ).strftime('%Y-%m-%d'), '; end_time:',
                    df_features_train_sampled['access_start_date'].max(
                    ).strftime('%Y-%m-%d'))

                # Output to the model diagnostics dict
                model_diagnostics['tm_{}_target_{}'.format(
                    current_tm, target_month
                )]['training_samples_available'] = df_features_train_sampled.shape[
                    0]
                model_diagnostics['tm_{}_target_{}'.format(
                    current_tm, target_month
                )]['train_start_date'] = df_features_train_sampled[
                    'access_start_date'].min().strftime('%Y-%m-%d')
                model_diagnostics['tm_{}_target_{}'.format(
                    current_tm, target_month
                )]['train_end_date'] = df_features_train_sampled[
                    'access_start_date'].max().strftime('%Y-%m-%d')

                # Create train X and Y
                X_train = df_features_train_sampled.copy()
                X_train = X_train.drop(class_labels_to_drop, axis=1)
                X_train = X_train.drop(target_month, axis=1)
                X_train = X_train.drop(
                    ["access_start_date", "tenure_length_capped", "is_churn"],
                    axis=1)
                Y_train = df_features_train_sampled.loc[:, [target_month
                                                            ]].values.ravel()

                # Add the X and Y train sets to the dict for all models
                X_train_dict['tm_{}_target_{}'.format(current_tm,
                                                      target_month)] = X_train
                Y_train_dict['tm_{}_target_{}'.format(current_tm,
                                                      target_month)] = Y_train

    return X_train_dict, Y_train_dict, model_diagnostics
Example #21
#import enchant
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import dataiku
import ast

prod_pronouns = dataiku.get_custom_variables(typed=True)['prod_pronouns']


def apply_extraction(row, nlp, sid, text_column, review_id, product_id):
    review_body = row[text_column]
    review_id = row[review_id]
    # review_marketplace = row['marketplace']
    # customer_id = row['customer_id']
    product_id = row[product_id]
    # product_parent = row['product_parent']
    # product_title = row['product_title']
    # product_category = row['product_category']
    # date = str(row['review_date'])
    # star_rating = row['star_rating']
    # url = add_amazonlink(product_id)

    doc = nlp(review_body)
    ner_heads = {ent.root.idx: ent for ent in doc.ents}

    ## FIRST RULE OF DEPENDENCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## RULE = M is child of A with a relationship of amod
    rule1_pairs = []
    for token in doc:
Example #22
from aspect_clustering.get_word_vectors import get_word_vectors
import dataiku
from sklearn import cluster
import ast

n_clusters = dataiku.get_custom_variables(typed=True)['NUM_CLUSTERS']
# print(n_clusters)
n_clusters = ast.literal_eval(n_clusters)


def get_word_clusters(unique_aspects, nlp):

    print("Found {} unique aspects for this product".format(
        len(unique_aspects)))
    asp_vectors = get_word_vectors(unique_aspects,
                                   nlp)  # gets the word vector for each noun
    print("len(unique_aspects)", len(unique_aspects))
    print("n_clusters", n_clusters)
    if len(unique_aspects) < n_clusters:
        print("Too few aspects ({}) found. No clustering required...".format(
            len(unique_aspects)))
        return list(range(len(unique_aspects)))

    print("Running k-means clustering...")
    kmeans = cluster.KMeans(n_clusters=n_clusters)
    kmeans.fit(asp_vectors)
    labels = kmeans.labels_
    return labels
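A minimal sketch of clustering a few aspect terms, assuming a spaCy model with word vectors (the model name is an assumption) and a DSS context where the NUM_CLUSTERS project variable is defined:

import spacy

# Assumes a vectors-enabled spaCy model is installed.
nlp = spacy.load('en_core_web_md')
unique_aspects = ['battery', 'screen', 'price', 'camera', 'shipping']
labels = get_word_clusters(unique_aspects, nlp)
print(dict(zip(unique_aspects, labels)))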
Example #23
### Output gen
P_FEATURE_SELECTION_NB_FIELD_PER_OUTPUT = int(
    get_recipe_config()['param_nb_field_per_output'])  #200
P_GEN_UNIQUE_FILE = get_recipe_config()[
    'param_output_one_file_all_states']  #True
P_GENERATE_ALL_THE_CENSUS_LEVEL = get_recipe_config()[
    'param_output_only_matching_census_level']  #False

## Re-use downloaded census content
P_USE_PREVIOUS_SOURCES = get_recipe_config(
)['param_re_use_collected_census_sources']
P_DELETE_US_CENSUS_SOURCES = get_recipe_config()['param_delete_census_sources']

#------------------------------------------- END SETTINGS

path_datadir_tmp = dataiku.get_custom_variables()["dip.home"] + '/tmp/'
FOLDER_NAME = 'tmp_census_us_' + P_CENSUS_CONTENT

input_ = get_input_names_for_role('input')[0]

output_folder = dataiku.Folder(get_output_names_for_role('censusdata')[0])
path_ = output_folder.get_path() + '/'

print('Checking if previous files exist...')
if len(os.listdir(path_)) > 0:
    for fz in os.listdir(path_):
        cmd = "rm %s" % (path_ + fz)
        print('removing: %s' % fz)
        os.system(cmd)

dict_states = common.get_state_structure(P_STATES_TYPE_NAME)
Example #24
def get_train_data(df_features,
                   synchronization_time_days,
                   model_diagnostics,
                   class_labels,
                   last_model_number=12):
    """
    Get train data for each model (most recent available 12 months) from all training data

    Parameters
    ----------
    df_features : DataFrame consisting of all data required for training (dummies, numeric features)
    synchronization_time_days : We know that data is not directly coming through;
                                being very conservative, it is set at 30 days
    model_diagnostics : dictionary into which diagnostics in training process will be inserted
    class_labels : list of class labels, one per model
    last_model_number : number of the last (most recent) model, default 12

    Returns
    -------
    df_features_train : dict mapping each class label to its training DataFrame
    model_diagnostics : dictionary with the training diagnostics filled in
    """

    train_date = dataiku.get_custom_variables()['calculation_date']

    df_features_train = {}
    for k, i in enumerate(class_labels):
        target_month = last_model_number - k

        # create end and start period for training
        training_end_date = parser.parse(train_date) + relativedelta(
            months=-target_month, days=-synchronization_time_days)
        training_start_date = parser.parse(train_date) + relativedelta(
            years=-1, months=-target_month, days=-synchronization_time_days)

        # convert datetime to string
        training_start_date_str = training_start_date.strftime('%Y-%m-%d')
        training_end_date_str = training_end_date.strftime('%Y-%m-%d')

        start_mask = df_features['access_start_date'] >= training_start_date_str
        end_mask = df_features['access_start_date'] <= training_end_date_str
        status_unknown = (df_features['tenure_length_capped']
                          == target_month) & (df_features['is_churn'] == 0)

        df_features_train_sampled = df_features.loc[start_mask & end_mask
                                                    & ~status_unknown]

        df_features_train[i] = df_features_train_sampled

        model_diagnostics[i][
            'training_samples_available'] = df_features_train_sampled.shape[0]
        model_diagnostics[i][
            'train_actuals_start_date'] = df_features_train_sampled[
                'access_start_date'].min().strftime('%Y-%m-%d')
        model_diagnostics[i][
            'train_actuals_end_date'] = df_features_train_sampled[
                'access_start_date'].max().strftime('%Y-%m-%d')
        model_diagnostics[i][
            'training_time_range_start_date'] = training_start_date
        model_diagnostics[i]['train_time_range_end_date'] = training_end_date

        print(
            i + ': train_samples: ' + str(df_features_train_sampled.shape[0]) +
            '; start_time:',
            df_features_train_sampled['access_start_date'].min().strftime(
                '%Y-%m-%d'), '; end_time:',
            df_features_train_sampled['access_start_date'].max().strftime(
                '%Y-%m-%d'))

    return df_features_train, model_diagnostics
Example #25

settings_TEST = dataiku.api_client().get_project('SOME_ETL_BASICS').get_settings()
# all_datasets = current_project.list_datasets()
settings_TEST.get_raw()['exposedObjects']['objects'][0]


# In[ ]:


settings = current_project_1.get_settings()
exposed_objects = settings.get_raw()['exposedObjects']['objects']
#     print json.dumps(exposed_objects,indent=2)
if len(exposed_objects):
    print('not 0')
    print('=== {} exposed objects ==='.format(dataiku.get_custom_variables()["projectKey"]))
#     for index in range(len(exposed_objects)):
#         ds_nm = exposed_objects[index]['localName']
#         trg_prj = exposed_objects[index]['rules']
# #             print json.dumps(exposed_objects[index],indent=2)
#         trg_prj_names = []
#         for index in range(len(trg_prj)):
#             trg_prj_names.append(trg_prj[index]['targetProject'])
#         print("Dataset %s is exposed in %s project(s)" % (ds_nm,', '.join(trg_prj_names)))
#     print('\n')


# In[ ]:


exposed_objects
Example #26
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
images = dataiku.Folder("HNEvJqgm")
images_info = images.get_info()
images_folder_path = images.get_path()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
pdfs_processed = dataiku.Folder("w3DdXIhY")
pdfs_processed_info = pdfs_processed.get_info()
pdfs_folder_path = pdfs_processed.get_path()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# If the project variable is set to true, all outputs are cleared and PDF processing is re-run for all input PDFs.
if dataiku.get_custom_variables()["reprocess_PDFs"].lower() == "true":
    for output_pdf_folder in os.listdir(pdfs_folder_path):
        output_pdf_folder_path = os.path.join(pdfs_folder_path,
                                              output_pdf_folder)
        for page in os.listdir(output_pdf_folder_path):
            page_path = os.path.join(output_pdf_folder_path, page)
            os.remove(page_path)
        os.rmdir(output_pdf_folder_path)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
for pdf_folder in os.listdir(images_folder_path):
    pdf_folder_path = os.path.join(images_folder_path, pdf_folder)
    PDF_pages = os.listdir(os.path.join(images_folder_path, pdf_folder))
    PDF_pages_path = [
        os.path.join(pdf_folder_path, page)
        for page in os.listdir(pdf_folder_path)
folder_path = dataiku.Folder("managed_ds").get_path()
managed_folder_name = "managed_ds"
today = dt.datetime.now().date()


# In[23]:


for file in dataiku.Folder(managed_folder_name).list_paths_in_partition():
#     print(folder_path+file)
    if re.match(r"/my_file*", file):
#         print (file)
        filetime = dt.datetime.fromtimestamp(os.path.getctime(folder_path+file))
        if filetime.date() == today:
#             print('today')
            current_project = dataiku.api_client().get_project(dataiku.get_custom_variables()["projectKey"])
            variables = current_project.get_variables()
            variables['standard']['file_uploaded_today'] = 'yes'
            current_project.set_variables(variables)


# In[26]:


# from dataiku.scenario import Trigger
# t = Trigger()

import dataiku
import os
import re
import datetime as dt
Example #28
import os
import dataiku
import json
import pandas as pd, numpy as np
import socket
from dataikuapi.dssclient import DSSClient
from dataiku.customrecipe import *
import subprocess

input_folder = dataiku.Folder(get_input_names_for_role('folder_to_parse')[0])
project_name = dataiku.get_custom_variables()["projectKey"]
dip_home = dataiku.get_custom_variables()["dip.home"]
port_file = dip_home + '/bin/env-default.sh'

#Let's create an api_key if it does not exist

def retrieve_api_key():
    api_keys_dir = '/config/public-apikeys.json'
    with open(dip_home+api_keys_dir) as data_file:
        data_ = json.load(data_file)
    existing = False
    api_key = None
    for key in data_:
        try:
            if (key['createdBy'] == 'CLI' and key['globalAdmin'] == True
                    and key['label'] == 'Added by dku command-line'):
                # A matching admin key already exists; mark it and reuse it
                existing = True
                api_key = key['key']
                break
        except KeyError:
            pass
    return [existing, api_key]
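A sketch of how the returned key might be used to open an admin client, assuming the backend port has been read from env-default.sh (the parsing is not shown here, so a placeholder value is used):

existing, api_key = retrieve_api_key()
if api_key is not None:
    port = '11200'  # placeholder; the real value comes from parsing port_file
    admin_client = DSSClient('http://localhost:' + port, api_key)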
Example #29
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
s_ave = df.groupby([u'場所'])[u'年初からの日数'].expanding().mean().values

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df[u'年初からの日数_暦年平均']=s_ave

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df = df.sort_values([u'場所', u'年月日_parsed'], ascending=False).reset_index(drop=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df.head()
"""

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
length = dataiku.get_custom_variables(typed=True)["window_size"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
var_to_vectorize = [u'平均気温', u'最高気温', u'最低気温', u'降水量合計', u'日照時間']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
new_df = pd.DataFrame()
for city in df[u'場所'].unique():
    print(city)
    df_sub = df.loc[df[u'場所']==city].reset_index(drop=True)
    for var in var_to_vectorize:
        print(var)
        colname = var + u"推移"
        df_sub[colname] = np.nan
        long_vec = df_sub[var].values.tolist()