Example #1
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
blooming_days_joined = dataiku.Dataset("blooming_days_joined")
df = blooming_days_joined.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df.tail()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
d = {}
for year in df.year.unique():
    blooming_day = df[df.year == year].blooming_day.unique()[0]
    d[year] = blooming_day

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
d

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df_bloom = df[df.flower_status == 'bloom']
df_bloom

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
df_new = pd.DataFrame()
for year in np.sort(df.year.unique()):
    print(year)
Example #2
import logging

import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config
from scipy.spatial.distance import cosine, euclidean

FORMAT = '[SENTENCE EMBEDDING] %(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger()
logger.setLevel(logging.INFO)


##################################
# Input data
##################################

input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()

embedding_folder = get_input_names_for_role('embedding_folder')[0]
folder_path = dataiku.Folder(embedding_folder).get_path()


##################################
# Parameters
##################################

recipe_config = get_recipe_config()

text_column_names = []
for i in [1, 2]:
    column_name = recipe_config.get('text_column_{}'.format(i), None)
    if column_name is None:
Example #3
from dataiku.customrecipe import *
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import json
import requests
import math
import salesforce

# Inputs and outputs are defined by roles. In the recipe's I/O tab, the user can associate one
# or more dataset to each input and output role.
# Roles need to be defined in recipe.json, in the inputRoles and outputRoles fields.

input_dataset = get_input_names_for_role('main')[0]
# The dataset objects themselves can then be created like this:
df = dataiku.Dataset(input_dataset).get_dataframe()
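# get_input_names_for_role() returns a list, since a role may be bound to several
# datasets. A minimal illustrative sketch (not part of the original) reading them all:
main_dataframes = {
    name: dataiku.Dataset(name).get_dataframe()
    for name in get_input_names_for_role('main')
}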

# Note about typing:
# The configuration of the recipe is passed through a JSON object
# As such, INT parameters of the recipe are received in the get_recipe_config() dict as a Python float.
# If you absolutely require a Python int, use int(get_recipe_config()["my_int_param"])
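# A small illustration of the note above (the parameter name is hypothetical):
n_top = get_recipe_config().get("n_top", 10)  # an INT param may arrive as 10.0 (float)
n_top = int(n_top)                            # cast explicitly when a real int is needed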

#############################
# Your original recipe
#############################

#FILE_TOKEN=dataiku.get_custom_variables()['dip.home']+'/salesforce/sales_cloud_token.json'
FILE_TOKEN = get_recipe_config()['token']
COLUMNS = get_recipe_config()['COLUMNS']
OBJECT = get_recipe_config()['SF_OBJECT']
Example #4
def get_club_histo_elo_flag(home_flag, club_id, dataNm):
    # Build the SQL query via the footbet_lstm_elo_flag helper (defined elsewhere),
    # run it against the dataset's SQL connection and return the result as a NumPy array.
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_elo_flag(home_flag, club_id, dataNm)
    d = executor.query_to_df(mess)
    return d.values
Example #5
import dataiku
import pandas as pd, numpy as np
from sklearn import *
from dataiku import pandasutils as pdu


# # A) Analyze and generate the features for each dataset

# In this step, I first analyzed the shops, items and item categories datasets and noticed one common characteristic: each of them is essentially a list of objects (store, product and category). Since the useful signal sits in the description on each line, I generated additional features from the text column to extract some insights.

# For example, the item categories dataset initially had only 2 columns, the category name and its ID, as seen in the table below.

# In[12]:

#loading the dataset
item_categories = dataiku.Dataset("item_categories")
item_categories_df = item_categories.get_dataframe()
item_categories_df.head()


# After analyzing this dataset, I added further features: the length of the category name, the number of words it contains, and TF-IDF columns that quantify how important each word is across all category names, which gives the model something to learn from.

# In[13]:

feature_cnt = 25 #define the maximum number of features I want to generate


# In[14]:

#Text Features for the item_categories dataset
tfidf = feature_extraction.text.TfidfVectorizer(max_features=feature_cnt)
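# A minimal sketch of how the extra features described above could be built with the
# vectorizer just defined (the column name 'item_category_name' is an assumption,
# as it is not shown in this excerpt):
name_col = item_categories_df['item_category_name']
item_categories_df['name_length'] = name_col.str.len()
item_categories_df['name_word_count'] = name_col.str.split().str.len()

tfidf_matrix = tfidf.fit_transform(name_col)  # sparse matrix with at most feature_cnt columns
for i in range(tfidf_matrix.shape[1]):
    item_categories_df['name_tfidf_{}'.format(i)] = tfidf_matrix[:, i].toarray().ravel()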
Example #6
# coding: utf-8

# In[ ]:



import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
airline_tweets_unique = dataiku.Dataset("airline_tweets_unique")
airline_tweets_unique_df = airline_tweets_unique.get_dataframe()

##test
##v2

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.

output_df = airline_tweets_unique_df # For this sample code, simply copy input to output


# Write recipe outputs
output = dataiku.Dataset("output")
output.write_with_schema(output_df)

Example #7
def get_club_histo_defence_form_test(club_id, match_dt, w, dataNm):
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_defence_form_test(club_id, match_dt, w, dataNm)
    d = executor.query_to_df(mess)
    return d.values
Example #8
input_config = get_recipe_config()
workbook_name = input_config.get('output_workbook_name', None)

if workbook_name is None:
    logger.warning(
        "Received recipe config: {}".format(input_config))
    raise ValueError('Could not read the workbook name.')

output_file_name = '{}.xlsx'.format(workbook_name)

try:
    validate_filename(output_file_name)
except ValidationError as e:
    raise ValueError(f"{e}\n")

tmp_file_helper = CustomTmpFile()
tmp_file_path = tmp_file_helper.get_temporary_cache_file(output_file_name)
logger.info(
    "Intend to write the output xls file to the following location: {}".format(
        tmp_file_path))

dataframes_to_xlsx(input_datasets_names, tmp_file_path,
                   lambda name: dataiku.Dataset(name).get_dataframe())

with open(tmp_file_path, 'rb') as f:
    output_folder.upload_stream(output_file_name, f)

tmp_file_helper.destroy_cache()

logger.info("Ended recipe processing.")
Example #9
import dataiku
import numpy as np
import re

from dataiku.customwebapp import get_webapp_config


if "objects" not in get_webapp_config():
    raise ValueError("Objects folder not specified. Go to settings tab.")
if "frames" not in get_webapp_config():
    raise ValueError("Frames folder not specified. Go to settings tab.")
if "dataset" not in get_webapp_config():
    raise ValueError("Output dataset not specified. Go to settings tab.")

dataset_name = get_webapp_config()["dataset"]
objects_id = get_webapp_config()["objects"]
frames_id = get_webapp_config()["frames"]

dataset = dataiku.Dataset(dataset_name)
objects = dataiku.Folder(objects_id)
frames = dataiku.Folder(frames_id)

try:
    current_schema = dataset.read_schema()
    current_schema_columns = [c['name'] for c in current_schema]
except Exception:
    current_schema_columns = ["path", "class", "comment"]
    dataset.write_schema([{"name": "path", "type": "string"}, {"name": "class", "type": "string"}, {"name": "comment", "type": "string"}])
    
if 'path' not in current_schema_columns or 'class' not in current_schema_columns or 'comment' not in current_schema_columns:
    raise ValueError("The target dataset should have columns: 'path', 'class' and 'comment'. Please edit the schema in the dataset settings.")

try:
    current_df = dataset.get_dataframe()
Example #10
def get_label_dataset(inputs):
    label_dataset_full_name = get_input_name_from_role(inputs, "label_dataset")
    label_dataset = dataiku.Dataset(label_dataset_full_name)
    return label_dataset
Example #11
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
m_query_20200512 = dataiku.Dataset("MQuery_05152020_RK")
m_query_20200512_df = m_query_20200512.get_dataframe()

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.

m_query_clean_df = m_query_20200512_df  # For this sample code, simply copy input to output

m_query_clean_df.head(10)

import pandas as pd

# Preserve only those rows having the GoalType = 'Target'
m_query_clean_df = m_query_clean_df.loc[m_query_clean_df['GoalType'] ==
                                        'Target']
set(list(m_query_clean_df['GoalType']))

# Total number of unique clients and authors
print("Unique Clients: ", m_query_clean_df['ClientId'].nunique())
print("Unique Authors: ", m_query_clean_df['TrialAuthorId'].nunique())

# Clean GoalMetDate column
dataset_new = m_query_clean_df[
    m_query_clean_df['GoalMetDate'].map(type) != float]
dataset_new = dataset_new[
Example #12
# -*- coding: utf-8 -*-
"""
Created on Fri Feb  8 11:11:12 2019

@author: Hakim Razzak
"""

# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import matplotlib.pyplot as plt

# Read recipe inputs
dataset_data_attr_activity_filtered2_6months = dataiku.Dataset("dataset_data_attr_activity_filtered2_6months")
dataset_df = dataset_data_attr_activity_filtered2_6months.get_dataframe()


df = dataset_df[dataset_df['dim_country'] == 'US']
df = df[df['dim_app'].str.contains('Alizoha')]
df = df.rename(columns=lambda x: x.replace('_sum', ''))
df = df.drop('avg_dau', axis=1)
df = df.sort_values(by='week', ascending=False)
df


df_train = df.values[0:8,]
df_train = pd.DataFrame(df_train)
df_train

Example #13
    'nb_features_per_output': P_FEATURE_SELECTION_NB_FIELD_PER_OUTPUT,
    'unique_file': P_GEN_UNIQUE_FILE,
    'only_matching_level': P_GENERATE_ALL_THE_CENSUS_LEVEL,
    'target': P_TARGET,
    'imputing': P_strategy,
    'imputing_threshold': P_threshold,
    'rescale': P_RESCALE
}

df_log_ = common.log__step('0', params, process_date, '', 0, '', 'init')

#----------------------------------------- INPUT DATASET

print('0/6 Processing input dataset...')

df = dataiku.Dataset(input_).get_dataframe(columns=columns)
if P_COLUMN_STATES_LOWER:
    df[P_COLUMN_STATES] = df[P_COLUMN_STATES].map(lambda x: x.lower())

print('Creating States list...')
state_list_ = list(np.unique(df[P_COLUMN_STATES]))

state_conversion = common.state_to_2letters_format(P_STATES_TYPE_NAME,
                                                   state_list_)

state_list = state_conversion[0]
state_list_rejected = state_conversion[1]
dict_states = state_conversion[2]

s_found = len(state_list)
s_rejected = len(state_list_rejected)
Example #14
# ==============================================================================

api_configuration_preset = get_recipe_config().get("api_configuration_preset")
if api_configuration_preset is None or api_configuration_preset == {}:
    raise ValueError("Please specify an API configuration preset")
service_account_key = api_configuration_preset.get("gcp_service_account_key")
api_quota_rate_limit = api_configuration_preset.get("api_quota_rate_limit")
api_quota_period = api_configuration_preset.get("api_quota_period")
parallel_workers = api_configuration_preset.get("parallel_workers")
text_column = get_recipe_config().get("text_column")
text_language = get_recipe_config().get("language", "").replace("auto", "")
sentiment_scale = get_recipe_config().get("sentiment_scale")
error_handling = ErrorHandlingEnum[get_recipe_config().get("error_handling")]

input_dataset_name = get_input_names_for_role("input_dataset")[0]
input_dataset = dataiku.Dataset(input_dataset_name)
input_schema = input_dataset.read_schema()
input_columns_names = [col["name"] for col in input_schema]

output_dataset_name = get_output_names_for_role("output_dataset")[0]
output_dataset = dataiku.Dataset(output_dataset_name)

validate_column_input(text_column, input_columns_names)
input_df = input_dataset.get_dataframe()
client = get_client(service_account_key)
column_prefix = "sentiment_api"


# ==============================================================================
# RUN
# ==============================================================================
Example #15
def load_plugin_config_spellchecker() -> Dict:
    """Utility function to validate and load spell checker parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # custom_vocabulary (optional input dataset)
    params["custom_vocabulary_set"] = set()
    custom_vocabulary_input = get_input_names_for_role("custom_vocabulary")
    if len(custom_vocabulary_input) != 0:
        custom_vocabulary_dataset = dataiku.Dataset(custom_vocabulary_input[0])
        params["custom_vocabulary_set"] = custom_vocabulary_checker(
            custom_vocabulary_dataset)
    logging.info(f"Custom vocabulary set: {params['custom_vocabulary_set']}")

    # custom_corrections (optional input dataset)
    params["custom_corrections"] = {}
    custom_corrections_input = get_input_names_for_role("custom_corrections")
    if len(custom_corrections_input) != 0:
        custom_corrections_dataset = dataiku.Dataset(
            custom_corrections_input[0])
        params["custom_corrections"] = custom_corrections_checker(
            custom_corrections_dataset)
    logging.info(f"Custom corrections: {params['custom_corrections']}")

    # diagnosis dataset (optional output dataset)
    diagnosis_dataset_names = get_output_names_for_role("diagnosis_dataset")
    params["diagnosis_dataset"] = None
    params["compute_diagnosis"] = False
    if len(diagnosis_dataset_names) != 0:
        logging.info("Spellchecker diagnosis will be computed")
        params["compute_diagnosis"] = True
        params["diagnosis_dataset"] = dataiku.Dataset(
            diagnosis_dataset_names[0])
    else:
        logging.info("Spellchecker diagnosis will not be computed")

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(),
                                                   "stopwords")

    # path to the folder of dictionaries
    params["dictionary_folder_path"] = os.path.join(get_recipe_resource(),
                                                    "dictionaries")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SYMSPELL:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")

    # edit distance
    params["edit_distance"] = recipe_config.get("edit_distance")
    if params["edit_distance"] < 2 or params["edit_distance"] > 100:
        raise PluginParamValidationError(
            "Edit distance must be between 2 and 100")
    logging.info(f"Maximum edit distance: {params['edit_distance']}")

    # ignore token
    if len(recipe_config.get("ignore_word_regex")) == 0:
        logging.info("No regular expression for words not to be corrected")
        params["ignore_word_regex"] = None  # symspellpy wants None
    else:
        params["ignore_word_regex"] = recipe_config.get("ignore_word_regex")
        # Check for valid regex
        try:
            ignore_token_compiled = re.compile(params["ignore_word_regex"])
        except re.error as e:
            raise PluginParamValidationError(
                f"Ignore pattern parameter is not a valid regex: {e}")
        params["ignore_word_regex"] = ignore_token_compiled.pattern
        logging.info(
            f"Regular expression for words not to be corrected: {params['ignore_word_regex']}"
        )

    return params
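# A minimal usage sketch for the loader above (illustrative only; the actual recipe
# body is not part of this fragment):
params = load_plugin_config_spellchecker()
input_df = params["input_dataset"].get_dataframe()
logging.info("Loaded %d rows; text column is '%s'", len(input_df), params["text_column"])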
Example #16
import pandas as pd
import logging
# Import smtplib for the actual sending function
import smtplib
# Import the email modules we'll need
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from io import StringIO

import dataiku
from dataiku.customrecipe import (get_input_names_for_role,
                                  get_output_names_for_role, get_recipe_config)

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Get handles on datasets
output_A_names = get_output_names_for_role('output')
output = dataiku.Dataset(
    output_A_names[0]) if len(output_A_names) > 0 else None

people = dataiku.Dataset(get_input_names_for_role('contacts')[0])
attachments = [
    dataiku.Dataset(x) for x in get_input_names_for_role('attachments')
]

# Read configuration
config = get_recipe_config()

recipient_column = config.get('recipient_column', None)
recipient_value = config.get('recipient_value', None)
sender_column = config.get('sender_column', None)
sender_value = config.get('sender_value', None)
subject_column = config.get('subject_column', None)
subject_value = config.get('subject_value', None)
Example #17
def load_plugin_config_cleaning() -> Dict:
    """Utility function to validate and load text cleaning parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(),
                                                   "stopwords")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Cleaning parameters
    params["token_filters"] = set(recipe_config.get("token_filters", []))
    available_token_filters = set(
        MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES.keys())
    if not params["token_filters"] <= available_token_filters:
        raise PluginParamValidationError(
            f"Invalid token filters: {params['token_filters']-available_token_filters}"
        )
    logging.info(f"Token filters: {params['token_filters']}")
    params["lemmatization"] = bool(recipe_config.get("lemmatization"))
    logging.info(f"Lemmatization: {params['lemmatization']}")
    params["lowercase"] = bool(recipe_config.get("lowercase"))
    logging.info(f"Lowercase: {params['lowercase']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")
    params["unicode_normalization"] = UnicodeNormalization[recipe_config.get(
        "unicode_normalization")]
    logging.info(f"Unicode normalization: {params['unicode_normalization']}")

    params["keep_filtered_tokens"] = bool(
        recipe_config.get("keep_filtered_tokens"))
    logging.info(f"Keep filtered tokens: {params['keep_filtered_tokens']}")

    return params
Example #18
import dataiku
from dataiku.customrecipe import *
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import matplotlib 
import wordcloud
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import dataiku.insights

#############################
# Recipe parameter import
#############################

input_dataset_name = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset_name).get_dataframe()

output_folder_name = get_output_names_for_role('output_folder')[0]
folder = dataiku.Folder(output_folder_name)
wordcloud_path = folder.get_path()

static_insight_id = get_recipe_config()['static_insight_id']
text_column = get_recipe_config()['text_column']

#############################
# Code of the recipe
#############################

## wordcloud creation
wc = wordcloud.WordCloud(
        background_color='white',
Example #19
#campStartdt = get_input_names_for_role('start_date')[0]
campStartdt = get_recipe_config().get('start_date', '2016:01:01')
mjt.campStartdt = campStartdt

# Get your environment Mailjet keys
auth, saname, said = mjt.readSa()
# create empty lists
d_co_dflst = []  # will contain the list of campaign overview dataframes
co_d_count = 0
print('############## get campaign data ############################')
sa_dict = dict()
for sa in range(len(auth)):
    mj = Client(auth=auth[sa])
    sa_dict[saname[sa]] = auth[sa][0]
    co_d_count, df_co = mjt.getContactData(mj)
    if co_d_count > 0:
        df_co['SubAccount'] = saname[sa]
        d_co_dflst.append(df_co)
# union of all dataframes
if co_d_count > 0:
    df = pd.concat(d_co_dflst)

    df["SubAccount_PK"] = df["SubAccount"].map(lambda x: sa_dict[x])
    #and we write the resulting dataset to dataiku

    # Recipe outputs
    # Recipe outputs for contact list
    main_output_name = get_output_names_for_role('customers')[0]
    output_dataset =  dataiku.Dataset(main_output_name)
    output_dataset.write_with_schema(df)
Example #20
            'type': 'float' if column in floats else 'string'
        })
    ods.write_schema(schema)
    ow = ods.get_writer()
    # Then the full pass
    dataset_iter = ids.iter_dataframes(chunksize=lines_per_request,
                                       infer_with_pandas=False,
                                       limit=limit)
    process_queue = Queue(threads)
    write_queue = Queue()
    for i, chunk in enumerate(dataset_iter):
        process_queue.put(i)
        thread = Process(
            target=process_chunk,
            args=[i, chunk, process_queue, write_queue, output_index])
        thread.start()
        while write_queue.qsize() > 0:
            ow.write_dataframe(write_queue.get())

    print("Waiting for {} chunk processes".format(process_queue.qsize()))
    while process_queue.qsize() > 0:
        time.sleep(1)
    print("Flushing {} chunks".format(write_queue.qsize()))
    while write_queue.qsize() > 0:
        ow.write_dataframe(write_queue.get())
    ow.close()


ids = dataiku.Dataset(IDS_NAME)
ods = dataiku.Dataset(ODS_NAME)
geocode(ids, ods)
Example #21
def get_club_histo_attack_global_test(club_id, dataNm):
    executor = dk.core.sql.SQLExecutor2(dataset=dk.Dataset(dataNm))
    mess = footbet_lstm_goal_attack_test(club_id, dataNm)
    d = executor.query_to_df(mess)
    return d.values
Example #22
# -*- coding: utf-8 -*-
import pandas as pd
import tmdbsimple as tmdb
import requests, time
import dataiku
from dataiku.customrecipe import *

tmdb.API_KEY = get_plugin_config()['tmdb_api_key']

input_dataset = dataiku.Dataset(get_input_names_for_role('input_dataset')[0])
title_col = get_recipe_config().get('title_col')
release_date_col = get_recipe_config().get('release_date_col',None)
results = []
results_notFound = []

for row in input_dataset.iter_rows(log_every=10):
    title = row[title_col]
    print("looking up", title)
    try:
        response = tmdb.Search().movie(query=title)["results"]
        time.sleep(0.05)
    except requests.exceptions.HTTPError as e:
        print('Error:', e)
        results_notFound.append({'title_queried': title, 'error': e})
        continue
    if len(response) == 0:
        print('Error: no matches')
        results_notFound.append({'title_queried': title, 'error': 'no matches'})
        continue
    movies = pd.DataFrame(response)
    # choose best match
Example #23
currency_base = get_recipe_config()['currency_base']
currency_to = get_recipe_config()['currency_to']

# output using the core code
url = "https://api.exchangeratesapi.io/history"

querystring = {"start_at":start_date,"end_at":end_date,"base":currency_base,"symbols":currency_to}

print('QUERY ----', querystring)

response = requests.get(url, params=querystring)

rates_by_day = response.json()['rates']
dates = list(rates_by_day.keys())

currency_from = querystring['base']
currency_to = querystring['symbols']

# each day maps to a dict with a single quoted currency, so take its only value
rates = [list(day_rates.values())[0] for day_rates in rates_by_day.values()]

df = pd.DataFrame({'day':dates,'exchange_rate_%s_to_%s'%(currency_from,currency_to):rates}).sort_values('day',ascending=False).reset_index(drop=True)

df['day'] = pd.to_datetime(df['day'])

# Write recipe outputs
add_exchange_rates = dataiku.Dataset(output)
add_exchange_rates.write_with_schema(df)
Example #24
import urllib
import time
import sys

import dataiku
import requests
from dataiku.customrecipe import *

# disable InsecureRequestWarning.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

print('## Running Plugin v0.5.0 ##')

input_name = get_input_names_for_role('input')[0]

# Recipe out

output_ = get_output_names_for_role('output')[0]
output_dataset = dataiku.Dataset(output_)

schema = [
    {'name': 'matchedAddress', 'type': 'string'}
    , {'name': 'latitude', 'type': 'double'}
    , {'name': 'longitude', 'type': 'double'}
    , {'name': 'tigerLineId', 'type': 'string'}
    , {'name': 'side', 'type': 'string'}
    , {'name': 'preDirection', 'type': 'string'}
    , {'name': 'streetName', 'type': 'string'}
    , {'name': 'suffixType', 'type': 'string'}
    , {'name': 'suffixDirection', 'type': 'string'}
    , {'name': 'city', 'type': 'string'}
    , {'name': 'state', 'type': 'string'}
    , {'name': 'zip', 'type': 'string'}
]
Example #25
                "%a %b %d %H:%M:%S %Y"))

        o["id_str"] = response.data["id_str"]
        o["favourites_count"] = response.data["favourites_count"]
        o["geo_enabled"] = response.data["geo_enabled"]
        o["statuses_count"] = response.data["statuses_count"]
        o["following"] = response.data["following"]
        o["follow_request_sent"] = response.data["follow_request_sent"]
        o["notifications"] = response.data["notifications"]
        o["entities"] = json.dumps(response.data.entities)
        results.append(o)

        # calculate interval
        interval = calc_interval(response.headers)
    else:
        interval = DEFAULT_INTERVAL

    nb_done = nb_done + 1
    #if nb_done == 1:
    #    break

    print("Sleep " + str(interval) + " s")
    time.sleep(interval)

odf = pd.DataFrame(results)

if odf.size > 0:
    # Recipe outputs
    followers_info = dataiku.Dataset(output_dataset_name)
    followers_info.write_with_schema(odf)
Example #26
    'datacollection_search')
P_DERIVATIVE = get_recipe_config()['add_derivative']
P_RETURN_GEOMETRY = get_recipe_config()['return_geometry']

P_OPTION_DATA_AS_TRANSACTIONS = get_recipe_config()['store_enrichment_as_key_value']
P_ACTIVATE_BACKUP = get_recipe_config()['activate_backup']
P_EPSG_IN_SR = int(get_recipe_config()['in_sr'])

input_name = get_input_names_for_role('input')[0]

# If we have a data catalog, search data collections from it
input_catalog_names = get_input_names_for_role('datacollections')
df_datacollections = None
if len(input_catalog_names) > 0:
    df_datacollections = dataiku.Dataset(
        input_catalog_names[0]).get_dataframe()
    print("Input catalog enabled total_coll=%s specific=%s search=%s" % (
        df_datacollections.shape[0], P_SPECIFIC_DATACOLLECTION_ID,
        P_DATACOLLECTIONS_SEARCH))

    if P_SPECIFIC_DATACOLLECTION_ID is not None:
        df_datacollections1 = df_datacollections[
            df_datacollections['collection_id'] ==
            P_SPECIFIC_DATACOLLECTION_ID]
        nb_records_df_datacollections1 = df_datacollections1.shape[0]
        print('info : Before country filtering nb records in data collections dataset (specific collection) : %s' % (
            nb_records_df_datacollections1))

    if P_DATACOLLECTIONS_SEARCH is not None:
        df_datacollections2 = df_datacollections[
            df_datacollections['collection_long_description'].str.contains(
Example #27
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
new_customers = dataiku.Dataset("new_customers")
new_customers_df = new_customers.get_dataframe()

# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.

out1_df = new_customers_df  # For this sample code, simply copy input to output

# Write recipe outputs
out1 = dataiku.Dataset("out1")
out1.write_with_schema(out1_df)
Example #28
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config
import json
import requests
import datetime

# Output
output_name = get_output_names_for_role('main')[0]
output = dataiku.Dataset(output_name)
output.write_schema([{
    'name': 'datetime',
    'type': 'string'
}, {
    'name': 'status_code',
    'type': 'string'
}, {
    'name': 'result',
    'type': 'string'
}, {
    'name': 'path_file',
    'type': 'string'
}])

# Read configuration
config = get_recipe_config()
client_id = config.get('client_id', '')
client_secret = config.get('client_secret', '')
email = config.get('email', '')
password = config.get('password', '')
sandbox = config.get('sandbox', False)
Example #29
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
fico_join = dataiku.Dataset("fico_join")
fico_join_df = fico_join.get_dataframe()

# Compute recipe outputs
# TODO: Write here your actual code that computes the outputs
# NB: DSS supports several kinds of APIs for reading and writing data. Please see doc.

fico_historic_df = fico_join_df[fico_join_df['Default'].notnull()]
fico_to_predict_df = fico_join_df[fico_join_df['Default'].isnull()]

# Write recipe outputs
fico_historic = dataiku.Dataset("fico_historic")
fico_historic.write_with_schema(fico_historic_df)
fico_to_predict = dataiku.Dataset("fico_to_predict")
fico_to_predict.write_with_schema(fico_to_predict_df)
Example #30
                partnerDetails = getPartnerDetails(f['partner_function'], map)
                if partnerDetails is not None:
                    d['partnerFunction'] = partnerDetails
            # print('Working partner check')
            # print(d)
            choices.append(d)
            print('=====================================end of ' +
                  fle.get("coprocessor") + '================================')
        except ValueError as e:
            logging.info("file is not valid json")

    # print('=====================================end of ' + fle.get("coprocessor") + '================================')

    # Get input table metadata.
    input_table_name = inputs[0]['fullName'].split('.')[1]
    input_dataset = dataiku.Dataset(input_table_name)
    schema = input_dataset.read_schema()

    # print('was able to obtain input schemas')

    inputschemas = {}
    for inputdataset in inputs:
        inputtablename = inputdataset['fullName'].split('.')[1]
        inputdataset = dataiku.Dataset(inputtablename)
        inputschemas[inputtablename] = inputdataset.read_schema()

    # print('was able to forloop input schemas')

    # AAF schema from connection details
    connection = getConnectionParamsFromDataset(input_dataset)
    aafschema = ([property.get('value', '') for property in connection.\