# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
blooming_days_joined = dataiku.Dataset("blooming_days_joined")
df = blooming_days_joined.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df.tail()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Map each year to its (unique) blooming day
d = {}
for year in df.year.unique():
    blooming_day = df[df.year == year].blooming_day.unique()[0]
    d[year] = blooming_day

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
d

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df_bloom = df[df.flower_status == 'bloom']
df_bloom

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df_new = pd.DataFrame()
for year in np.sort(df.year.unique()):
    print(year)
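# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Editor's hedged aside: since blooming_day is constant within each year, the
# year -> blooming_day mapping built in the loop above can also be obtained
# without an explicit loop (same column names as above):
d = df.groupby('year')['blooming_day'].first().to_dict()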
import logging

import dataiku
# Assumed imports: the plugin helpers used below live in dataiku.customrecipe
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config
from scipy.spatial.distance import cosine, euclidean

FORMAT = '[SENTENCE EMBEDDING] %(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

##################################
# Input data
##################################
input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()
embedding_folder = get_input_names_for_role('embedding_folder')[0]
folder_path = dataiku.Folder(embedding_folder).get_path()

##################################
# Parameters
##################################
recipe_config = get_recipe_config()
text_column_names = []
for i in [1, 2]:
    column_name = recipe_config.get('text_column_{}'.format(i), None)
    if column_name is None:
from dataiku.customrecipe import *
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import json
import requests
import math
import salesforce

# Inputs and outputs are defined by roles. In the recipe's I/O tab, the user can associate one
# or more datasets to each input and output role.
# Roles need to be defined in recipe.json, in the inputRoles and outputRoles fields.
input_dataset = get_input_names_for_role('main')[0]

# The dataset objects themselves can then be created like this:
df = dataiku.Dataset(input_dataset).get_dataframe()

# Note about typing:
# The configuration of the recipe is passed through a JSON object.
# As such, INT parameters of the recipe are received in the get_recipe_config() dict as Python floats.
# If you absolutely require a Python int, use int(get_recipe_config()["my_int_param"]).

#############################
# Your original recipe
#############################

#FILE_TOKEN=dataiku.get_custom_variables()['dip.home']+'/salesforce/sales_cloud_token.json'
FILE_TOKEN = get_recipe_config()['token']
COLUMNS = get_recipe_config()['COLUMNS']
OBJECT = get_recipe_config()['SF_OBJECT']
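# Concrete illustration of the typing note above (hedged: "my_int_param" is the
# placeholder name from that comment, not a real parameter of this recipe):
# JSON numbers arrive as floats, so cast explicitly when an int is required.
my_int_param = int(get_recipe_config()["my_int_param"])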
def get_club_histo_elo_flag(home_flag, club_id, dataNm):
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_elo_flag(home_flag, club_id, dataNm)
    d = executor.query_to_df(mess)
    return d.values
import dataiku
import pandas as pd, numpy as np
from sklearn import *
from dataiku import pandasutils as pdu

# # A) Analyze and generate the features for each dataset

# In this step, I first analyzed the shops, items and item categories datasets and observed one
# common characteristic: all of them are essentially lists of objects (store, product and
# category). I believed the model needed to learn from the description on each line, so I
# generated more features from the text column to extract some insight.

# For example, the item categories dataset initially has only 2 columns, the name and the ID,
# as seen in the table below.

# In[12]:

# loading the dataset
item_categories = dataiku.Dataset("item_categories")
item_categories_df = item_categories.get_dataframe()
item_categories_df.head()

# After analyzing this dataset, I added other features such as the length of the category name,
# the number of words in the category name, and some TF-IDF columns that quantify the importance
# of each word across all category names, giving the model something to learn from them.

# In[13]:

feature_cnt = 25  # maximum number of TF-IDF features to generate

# In[14]:

# Text features for the item_categories dataset
tfidf = feature_extraction.text.TfidfVectorizer(max_features=feature_cnt)
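# A hedged sketch of how the features described above could be computed and
# appended. The column name 'item_category_name' and the generated feature
# names are assumptions for illustration, not the notebook's confirmed code.

# In[15]:

texts = item_categories_df['item_category_name']  # assumed text column name

# simple text statistics
item_categories_df['name_len'] = texts.str.len()                      # length of the category name
item_categories_df['name_word_count'] = texts.str.split().str.len()  # number of words

# one column per retained TF-IDF term
tfidf_matrix = tfidf.fit_transform(texts).toarray()
for i in range(tfidf_matrix.shape[1]):
    item_categories_df['name_tfidf_{}'.format(i)] = tfidf_matrix[:, i]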
# coding: utf-8

# In[ ]:

import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
airline_tweets_unique = dataiku.Dataset("airline_tweets_unique")
airline_tweets_unique_df = airline_tweets_unique.get_dataframe()

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.
output_df = airline_tweets_unique_df  # For this sample code, simply copy input to output

# Write recipe outputs
output = dataiku.Dataset("output")
output.write_with_schema(output_df)
def get_club_histo_defence_form_test(club_id, match_dt, w, dataNm):
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_defence_form_test(club_id, match_dt, w, dataNm)
    d = executor.query_to_df(mess)
    return d.values
input_config = get_recipe_config()
workbook_name = input_config.get('output_workbook_name', None)
if workbook_name is None:
    logger.warning("Received recipe config: {}".format(input_config))
    raise ValueError('Could not read the workbook name.')
output_file_name = '{}.xlsx'.format(workbook_name)

try:
    validate_filename(output_file_name)
except ValidationError as e:
    raise ValueError(f"{e}\n")

tmp_file_helper = CustomTmpFile()
tmp_file_path = tmp_file_helper.get_temporary_cache_file(output_file_name)
logger.info("Intend to write the output xls file to the following location: {}".format(tmp_file_path))

dataframes_to_xlsx(input_datasets_names, tmp_file_path,
                   lambda name: dataiku.Dataset(name).get_dataframe())

# Open in binary mode (no encoding argument applies to binary streams)
with open(tmp_file_path, 'rb') as f:
    output_folder.upload_stream(output_file_name, f)

tmp_file_helper.destroy_cache()
logger.info("Ended recipe processing.")
import numpy as np
import re

import dataiku
# Assumed import: get_webapp_config for DSS webapp backends
from dataiku.customwebapp import get_webapp_config

if "objects" not in get_webapp_config():
    raise ValueError("Objects folder not specified. Go to settings tab.")
if "frames" not in get_webapp_config():
    raise ValueError("Frames folder not specified. Go to settings tab.")
if "dataset" not in get_webapp_config():
    raise ValueError("Output dataset not specified. Go to settings tab.")

dataset_name = get_webapp_config()["dataset"]
objects_id = get_webapp_config()["objects"]
frames_id = get_webapp_config()["frames"]

dataset = dataiku.Dataset(dataset_name)
objects = dataiku.Folder(objects_id)
frames = dataiku.Folder(frames_id)

try:
    current_schema = dataset.read_schema()
    current_schema_columns = [c['name'] for c in current_schema]
except Exception:
    # No schema yet: initialize it with the expected columns
    current_schema_columns = ["path", "class", "comment"]
    dataset.write_schema([{"name": "path", "type": "string"},
                          {"name": "class", "type": "string"},
                          {"name": "comment", "type": "string"}])

if 'path' not in current_schema_columns or 'class' not in current_schema_columns or 'comment' not in current_schema_columns:
    raise ValueError("The target dataset should have columns: 'path', 'class' and 'comment'. Please edit the schema in the dataset settings.")

try:
    current_df = dataset.get_dataframe()
def get_label_dataset(inputs):
    label_dataset_full_name = get_input_name_from_role(inputs, "label_dataset")
    label_dataset = dataiku.Dataset(label_dataset_full_name)
    return label_dataset
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
m_query_20200512 = dataiku.Dataset("MQuery_05152020_RK")
m_query_20200512_df = m_query_20200512.get_dataframe()

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.
m_query_clean_df = m_query_20200512_df  # For this sample code, simply copy input to output
m_query_clean_df.head(10)

# Keep only the rows where GoalType = 'Target'
m_query_clean_df = m_query_clean_df.loc[m_query_clean_df['GoalType'] == 'Target']
set(list(m_query_clean_df['GoalType']))

# Total number of unique clients and authors
print("Unique Clients: ", m_query_clean_df['ClientId'].nunique())
print("Unique Authors: ", m_query_clean_df['TrialAuthorId'].nunique())

# Clean GoalMetDate column: drop rows where the value is a float (i.e. NaN)
dataset_new = m_query_clean_df[m_query_clean_df['GoalMetDate'].map(type) != float]
dataset_new = dataset_new[
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 8 11:11:12 2019

@author: Hakim Razzak
"""
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import matplotlib.pyplot as plt

# Read recipe inputs
dataset_data_attr_activity_filtered2_6months = dataiku.Dataset("dataset_data_attr_activity_filtered2_6months")
dataset_df = dataset_data_attr_activity_filtered2_6months.get_dataframe()

# Keep US rows for the 'Alizoha' app and strip the '_sum' suffix from column names
df = dataset_df[dataset_df['dim_country'] == 'US']
df = df[df['dim_app'].str.contains('Alizoha')]
df = df.rename(columns=lambda x: x.replace('_sum', ''))
df = df.drop(columns='avg_dau')
df = df.sort_values(by='week', ascending=False)
df

# Keep the 8 most recent weeks as the training frame
df_train = df.values[0:8, ]
df_train = pd.DataFrame(df_train)
df_train
          'nb_features_per_output': P_FEATURE_SELECTION_NB_FIELD_PER_OUTPUT,
          'unique_file': P_GEN_UNIQUE_FILE,
          'only_matching_level': P_GENERATE_ALL_THE_CENSUS_LEVEL,
          'target': P_TARGET,
          'imputing': P_strategy,
          'imputing_threshold': P_threshold,
          'rescale': P_RESCALE
}
df_log_ = common.log__step('0', params, process_date, '', 0, '', 'init')

#----------------------------------------- INPUT DATASET
print('0/6 Processing input dataset...')
df = dataiku.Dataset(input_).get_dataframe(columns=columns)

if P_COLUMN_STATES_LOWER is True:
    df[P_COLUMN_STATES] = df[P_COLUMN_STATES].map(lambda x: x.lower())

print('Creating States list...')
state_list_ = list(np.unique(df[P_COLUMN_STATES]))
state_conversion = common.state_to_2letters_format(P_STATES_TYPE_NAME, state_list_)
state_list = state_conversion[0]
state_list_rejected = state_conversion[1]
dict_states = state_conversion[2]
s_found = len(state_list)
s_rejected = len(state_list_rejected)
# ==============================================================================
api_configuration_preset = get_recipe_config().get("api_configuration_preset")
if api_configuration_preset is None or api_configuration_preset == {}:
    raise ValueError("Please specify an API configuration preset")
service_account_key = api_configuration_preset.get("gcp_service_account_key")
api_quota_rate_limit = api_configuration_preset.get("api_quota_rate_limit")
api_quota_period = api_configuration_preset.get("api_quota_period")
parallel_workers = api_configuration_preset.get("parallel_workers")
text_column = get_recipe_config().get("text_column")
text_language = get_recipe_config().get("language", "").replace("auto", "")
sentiment_scale = get_recipe_config().get("sentiment_scale")
error_handling = ErrorHandlingEnum[get_recipe_config().get("error_handling")]

input_dataset_name = get_input_names_for_role("input_dataset")[0]
input_dataset = dataiku.Dataset(input_dataset_name)
input_schema = input_dataset.read_schema()
input_columns_names = [col["name"] for col in input_schema]

output_dataset_name = get_output_names_for_role("output_dataset")[0]
output_dataset = dataiku.Dataset(output_dataset_name)

validate_column_input(text_column, input_columns_names)
input_df = input_dataset.get_dataframe()
client = get_client(service_account_key)
column_prefix = "sentiment_api"

# ==============================================================================
# RUN
# ==============================================================================
def load_plugin_config_spellchecker() -> Dict:
    """Utility function to validate and load spell checker parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # custom_vocabulary (optional input dataset)
    params["custom_vocabulary_set"] = set()
    custom_vocabulary_input = get_input_names_for_role("custom_vocabulary")
    if len(custom_vocabulary_input) != 0:
        custom_vocabulary_dataset = dataiku.Dataset(custom_vocabulary_input[0])
        params["custom_vocabulary_set"] = custom_vocabulary_checker(custom_vocabulary_dataset)
    logging.info(f"Custom vocabulary set: {params['custom_vocabulary_set']}")

    # custom_corrections (optional input dataset)
    params["custom_corrections"] = {}
    custom_corrections_input = get_input_names_for_role("custom_corrections")
    if len(custom_corrections_input) != 0:
        custom_corrections_dataset = dataiku.Dataset(custom_corrections_input[0])
        params["custom_corrections"] = custom_corrections_checker(custom_corrections_dataset)
    logging.info(f"Custom corrections: {params['custom_corrections']}")

    # diagnosis dataset (optional output dataset)
    diagnosis_dataset_names = get_output_names_for_role("diagnosis_dataset")
    params["diagnosis_dataset"] = None
    params["compute_diagnosis"] = False
    if len(diagnosis_dataset_names) != 0:
        logging.info("Spellchecker diagnosis will be computed")
        params["compute_diagnosis"] = True
        params["diagnosis_dataset"] = dataiku.Dataset(diagnosis_dataset_names[0])
    else:
        logging.info("Spellchecker diagnosis will not be computed")

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")
    # path to the folder of dictionaries
    params["dictionary_folder_path"] = os.path.join(get_recipe_resource(), "dictionaries")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SYMSPELL:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")

    # edit distance
params["edit_distance"] = recipe_config.get("edit_distance") if params["edit_distance"] < 2 or params["edit_distance"] > 100: raise PluginParamValidationError( "Edit distance must be between 2 and 100") logging.info(f"Maximum edit distance: {params['edit_distance']}") # ignore token if len(recipe_config.get("ignore_word_regex")) == 0: logging.info("No regular expression for words not to be corrected") params["ignore_word_regex"] = None # symspellpy wants None else: params["ignore_word_regex"] = recipe_config.get("ignore_word_regex") # Check for valid regex try: ignore_token_compiled = re.compile(params["ignore_word_regex"]) except re.error as e: raise PluginParamValidationError( f"Ignore pattern parameter is not a valid regex: {e}") params["ignore_word_regex"] = ignore_token_compiled.pattern logging.info( f"Regular expression for words not to be corrected: {params['ignore_word_regex']}" ) return params
import pandas as pd
import logging

import dataiku
from dataiku.customrecipe import *  # provides get_input_names_for_role etc.

# Import smtplib for the actual sending function
import smtplib

# Import the email modules we'll need
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from io import StringIO  # Python 3 location of StringIO

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

# Get handles on datasets
output_A_names = get_output_names_for_role('output')
output = dataiku.Dataset(output_A_names[0]) if len(output_A_names) > 0 else None

people = dataiku.Dataset(get_input_names_for_role('contacts')[0])
attachments = [dataiku.Dataset(x) for x in get_input_names_for_role('attachments')]

# Read configuration
config = get_recipe_config()
recipient_column = config.get('recipient_column', None)
recipient_value = config.get('recipient_value', None)
sender_column = config.get('sender_column', None)
sender_value = config.get('sender_value', None)
subject_column = config.get('subject_column', None)
subject_value = config.get('subject_value', None)
def load_plugin_config_cleaning() -> Dict:
    """Utility function to validate and load text cleaning parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Cleaning parameters
    params["token_filters"] = set(recipe_config.get("token_filters", []))
    available_token_filters = set(MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES.keys())
    if not params["token_filters"] <= available_token_filters:
        raise PluginParamValidationError(f"Invalid token filters: {params['token_filters'] - available_token_filters}")
    logging.info(f"Token filters: {params['token_filters']}")
    params["lemmatization"] = bool(recipe_config.get("lemmatization"))
    logging.info(f"Lemmatization: {params['lemmatization']}")
    params["lowercase"] = bool(recipe_config.get("lowercase"))
    logging.info(f"Lowercase: {params['lowercase']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")
    params["unicode_normalization"] = UnicodeNormalization[recipe_config.get("unicode_normalization")]
    logging.info(f"Unicode normalization: {params['unicode_normalization']}")
    params["keep_filtered_tokens"] = bool(recipe_config.get("keep_filtered_tokens"))
    logging.info(f"Keep filtered tokens: {params['keep_filtered_tokens']}")

    return params
import dataiku
from dataiku.customrecipe import *
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import matplotlib
import wordcloud
matplotlib.use('Agg')  # non-interactive backend; must be set before importing pyplot
import matplotlib.pyplot as plt
import dataiku.insights

#############################
# Recipe parameter import
#############################
input_dataset_name = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset_name).get_dataframe()

output_folder_name = get_output_names_for_role('output_folder')[0]
folder = dataiku.Folder(output_folder_name)
wordcloud_path = folder.get_path()

static_insight_id = get_recipe_config()['static_insight_id']
text_column = get_recipe_config()['text_column']

#############################
# Code of the recipe
#############################

## wordcloud creation
wc = wordcloud.WordCloud(
    background_color='white',
#campStartdt = get_input_names_for_role('start_date')[0]
campStartdt = get_recipe_config().get('start_date', '2016:01:01')
mjt.campStartdt = campStartdt

# Get your environment Mailjet keys
auth, saname, said = mjt.readSa()

# create empty lists
d_co_dflst = []  # will contain the list of campaign overview dataframes
co_d_count = 0

print('##############get campaign data############################')
sa_dict = dict()
for sa in range(len(auth)):
    mj = Client(auth=auth[sa])
    sa_dict[saname[sa]] = auth[sa][0]
    co_d_count, df_co = mjt.getContactData(mj)
    if co_d_count > 0:
        df_co['SubAccount'] = saname[sa]
        d_co_dflst.append(df_co)

# union of all dataframes
if co_d_count > 0:
    df = pd.concat(d_co_dflst)
    df["SubAccount_PK"] = df["SubAccount"].map(lambda x: sa_dict[x])

# and we write the resulting dataset to dataiku
# Recipe outputs for contact list
main_output_name = get_output_names_for_role('customers')[0]
output_dataset = dataiku.Dataset(main_output_name)
output_dataset.write_with_schema(df)
        'type': 'float' if column in floats else 'string'
    })
ods.write_schema(schema)
ow = ods.get_writer()

# Then the full pass
dataset_iter = ids.iter_dataframes(chunksize=lines_per_request,
                                   infer_with_pandas=False,
                                   limit=limit)
process_queue = Queue(threads)
write_queue = Queue()
for i, chunk in enumerate(dataset_iter):
    process_queue.put(i)
    thread = Process(target=process_chunk,
                     args=[i, chunk, process_queue, write_queue, output_index])
    thread.start()
    while write_queue.qsize() > 0:
        ow.write_dataframe(write_queue.get())

print("Waiting for {} chunk processes".format(process_queue.qsize()))
while process_queue.qsize() > 0:
    time.sleep(1)

print("Flushing {} chunks".format(write_queue.qsize()))
while write_queue.qsize() > 0:
    ow.write_dataframe(write_queue.get())

ow.close()

ids = dataiku.Dataset(IDS_NAME)
ods = dataiku.Dataset(ODS_NAME)
geocode(ids, ods)
def get_club_histo_attack_global_test(club_id, dataNm):
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_goal_attack_test(club_id, dataNm)
    d = executor.query_to_df(mess)
    return d.values
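# The three get_club_histo_* helpers above differ only in the footbet_lstm_*
# query builder they call. A hedged consolidation sketch (assuming the `dk`
# alias stands for `import dataiku as dk`, as the helpers suggest):
import dataiku as dk  # assumed alias used by the helpers above

def run_footbet_query(query_builder, dataNm, *args):
    """Build a SQL query with the given footbet_lstm_* builder and run it
    against the connection of the dataNm dataset."""
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    return executor.query_to_df(query_builder(*args, dataNm)).values

# e.g. run_footbet_query(footbet_lstm_goal_attack_test, dataNm, club_id)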
# -*- coding: utf-8 -*-
import pandas as pd
import tmdbsimple as tmdb
import requests, time
import dataiku
from dataiku.customrecipe import *

tmdb.API_KEY = get_plugin_config()['tmdb_api_key']

input_dataset = dataiku.Dataset(get_input_names_for_role('input_dataset')[0])
title_col = get_recipe_config().get('title_col')
release_date_col = get_recipe_config().get('release_date_col', None)

results = []
results_notFound = []
for row in input_dataset.iter_rows(log_every=10):
    title = row[title_col]
    print("looking up", title)
    try:
        response = tmdb.Search().movie(query=title)["results"]
        time.sleep(0.05)
    except requests.exceptions.HTTPError as e:
        print('Error:', e)
        results_notFound.append({'title_queried': title, 'error': e})
        continue
    if len(response) == 0:
        print('Error: no matches')
        results_notFound.append({'title_queried': title, 'error': 'no matches'})
        continue
    movies = pd.DataFrame(response)
    # choose best match
currency_base = get_recipe_config()['currency_base']
currency_to = get_recipe_config()['currency_to']

# output using the core code
url = "https://api.exchangeratesapi.io/history"
querystring = {"start_at": start_date, "end_at": end_date, "base": currency_base, "symbols": currency_to}
print('QUERY ----', querystring)

response = requests.request("GET", url, params=querystring)

# 'rates' maps each day to a {currency: rate} dict; extract the single requested rate per day
rates_by_day = response.json()['rates']
dates = list(rates_by_day.keys())
currency_from = querystring['base']
currency_to = querystring['symbols']
rates = [list(day_rates.values())[0] for day_rates in rates_by_day.values()]

df = pd.DataFrame({'day': dates,
                   'exchange_rate_%s_to_%s' % (currency_from, currency_to): rates})
df = df.sort_values('day', ascending=False).reset_index(drop=True)
df['day'] = pd.to_datetime(df['day'])

# Write recipe outputs
add_exchange_rates = dataiku.Dataset(output)
add_exchange_rates.write_with_schema(df)
import urllib
import time
import sys

import dataiku
import requests
from dataiku.customrecipe import *

# disable InsecureRequestWarning.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

print('## Running Plugin v0.5.0 ##')

input_name = get_input_names_for_role('input')[0]

# Recipe output
output_ = get_output_names_for_role('output')[0]
output_dataset = dataiku.Dataset(output_)

schema = [
    {'name': 'matchedAddress', 'type': 'string'},
    {'name': 'latitude', 'type': 'double'},
    {'name': 'longitude', 'type': 'double'},
    {'name': 'tigerLineId', 'type': 'string'},
    {'name': 'side', 'type': 'string'},
    {'name': 'preDirection', 'type': 'string'},
    {'name': 'streetName', 'type': 'string'},
    {'name': 'suffixType', 'type': 'string'},
    {'name': 'suffixDirection', 'type': 'string'},
    {'name': 'city', 'type': 'string'},
    {'name': 'state', 'type': 'string'},
    {'name': 'zip', 'type': 'string'}
]
"%a %b %d %H:%M:%S %Y")) o["id_str"] = response.data["id_str"] o["favourites_count"] = response.data["favourites_count"] o["geo_enabled"] = response.data["geo_enabled"] o["statuses_count"] = response.data["statuses_count"] o["following"] = response.data["following"] o["follow_request_sent"] = response.data["follow_request_sent"] o["notifications"] = response.data["notifications"] o["entities"] = json.dumps(response.data.entities) results.append(o) # calculate interval interval = calc_interval(response.headers) else: interval = DEFAULT_INTERVAL nb_done = nb_done + 1 #if nb_done == 1: # break print "Sleep " + str(interval) + " s" time.sleep(interval) odf = pd.DataFrame(results) if odf.size > 0: # Recipe outputs followers_info = dataiku.Dataset(output_dataset_name) followers_info.write_with_schema(odf)
                                             'datacollection_search')
P_DERIVATIVE = get_recipe_config()['add_derivative']
P_RETURN_GEOMETRY = get_recipe_config()['return_geometry']
P_OPTION_DATA_AS_TRANSACTIONS = get_recipe_config()['store_enrichment_as_key_value']
P_ACTIVATE_BACKUP = get_recipe_config()['activate_backup']
P_EPSG_IN_SR = int(get_recipe_config()['in_sr'])

input_name = get_input_names_for_role('input')[0]

# If we have a data catalog, search data collections from it
input_catalog_names = get_input_names_for_role('datacollections')
df_datacollections = None
if len(input_catalog_names) > 0:
    df_datacollections = dataiku.Dataset(input_catalog_names[0]).get_dataframe()
    print("Input catalog enabled total_coll=%s specific=%s search=%s" % (
        df_datacollections.shape[0], P_SPECIFIC_DATACOLLECTION_ID, P_DATACOLLECTIONS_SEARCH))
    if P_SPECIFIC_DATACOLLECTION_ID is not None:
        df_datacollections1 = df_datacollections[
            df_datacollections['collection_id'] == P_SPECIFIC_DATACOLLECTION_ID]
        nb_records_df_datacollections1 = df_datacollections1.shape[0]
        print('info : Before country filtering nb records in data collections dataset (specific collection) : %s' % (
            nb_records_df_datacollections1))
    if P_DATACOLLECTIONS_SEARCH is not None:
        df_datacollections2 = df_datacollections[
            df_datacollections['collection_long_description'].str.contains(
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
new_customers = dataiku.Dataset("new_customers")
new_customers_df = new_customers.get_dataframe()

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.
out1_df = new_customers_df  # For this sample code, simply copy input to output

# Write recipe outputs
out1 = dataiku.Dataset("out1")
out1.write_with_schema(out1_df)
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config
import json
import requests
import datetime

# Output
output_name = get_output_names_for_role('main')[0]
output = dataiku.Dataset(output_name)
output.write_schema([
    {'name': 'datetime', 'type': 'string'},
    {'name': 'status_code', 'type': 'string'},
    {'name': 'result', 'type': 'string'},
    {'name': 'path_file', 'type': 'string'}
])

# Read configuration
config = get_recipe_config()
client_id = config.get('client_id', '')
client_secret = config.get('client_secret', '')
email = config.get('email', '')
password = config.get('password', '')
sandbox = config.get('sandbox', False)
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
fico_join = dataiku.Dataset("fico_join")
fico_join_df = fico_join.get_dataframe()

# Compute recipe outputs
# NB: DSS supports several kinds of APIs for reading and writing data. Please see doc.
# Split: rows with a known Default go to the historical set, the rest are to be predicted
fico_historic_df = fico_join_df[fico_join_df['Default'].notnull()]
fico_to_predict_df = fico_join_df[fico_join_df['Default'].isnull()]

# Write recipe outputs
fico_historic = dataiku.Dataset("fico_historic")
fico_historic.write_with_schema(fico_historic_df)
fico_to_predict = dataiku.Dataset("fico_to_predict")
fico_to_predict.write_with_schema(fico_to_predict_df)
                partnerDetails = getPartnerDetails(f['partner_function'], map)
                if partnerDetails is not None:
                    d['partnerFunction'] = partnerDetails
                # print('Working partner check')
                # print(d)
                choices.append(d)
            print('=====================================end of ' + fle.get("coprocessor") + '================================')
        except ValueError as e:
            logging.info("file is not valid json")
            # print('=====================================end of ' + fle.get("coprocessor") + '================================')

# Get input table metadata.
input_table_name = inputs[0]['fullName'].split('.')[1]
input_dataset = dataiku.Dataset(input_table_name)
schema = input_dataset.read_schema()
# print('was able to obtain input schemas')

inputschemas = {}
for inputdataset in inputs:
    inputtablename = inputdataset['fullName'].split('.')[1]
    inputdataset = dataiku.Dataset(inputtablename)
    inputschemas[inputtablename] = inputdataset.read_schema()
# print('was able to forloop input schemas')

# AAF schema from connection details
connection = getConnectionParamsFromDataset(input_dataset)
aafschema = ([property.get('value', '') for property in connection.\