Example 1
 def test_append_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
     # second call
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
     client.create_log_table.assert_called_once_with('test_db', 'test_table')
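The to_td test excerpts in this listing assume a common setUp providing self.connection and self.frame. A minimal sketch of that fixture, assuming MagicMock-based stubs; the NotFoundError default mirrors the explicit mocks in Example 10, and the class name is illustrative:

import datetime
import unittest
from unittest.mock import MagicMock

import pandas as pd
import tdclient
from pandas_td import to_td

class ToTdTestCase(unittest.TestCase):
    def setUp(self):
        # Connection whose underlying td-client is fully mocked
        self.connection = MagicMock()
        self.connection.client = MagicMock()
        # By default, pretend the target table does not exist yet
        self.connection.client.table = MagicMock(
            side_effect=tdclient.api.NotFoundError('test_table'))
        self.frame = pd.DataFrame({'x': [1, 2, 3]})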
Example 2
    def run(self):
        import io
        import os

        import boto3
        import matplotlib as mlp
        mlp.use('agg')
        from matplotlib import pyplot as plt
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)
        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        # Upload figures to S3
        # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY" are set:
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')

        predicted_fig_file = "predicted.png"
        component_fig_file = "component.png"

        # Choose an ACL that matches your use case
        s3.Object(os.environ['S3_BUCKET'],
                  predicted_fig_file).put(ACL='public-read',
                                          Body=predict_fig_data,
                                          ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'],
                  component_fig_file).put(ACL='public-read',
                                          Body=component_fig_data,
                                          ContentType='image/png')

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
Example 3
    def run(self, with_aws=True):
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        if with_aws:
            self._upload_graph(model, forecast)

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
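`_upload_graph` is not shown in this excerpt. A minimal sketch, assuming it factors out the figure upload that Example 2 performs inline (the S3_BUCKET variable and file names are carried over from there):

    def _upload_graph(self, model, forecast):
        import io
        import os
        import boto3

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)
        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        # boto3 reads AWS credentials from the environment
        s3 = boto3.resource('s3')
        s3.Object(os.environ['S3_BUCKET'], 'predicted.png').put(
            ACL='public-read', Body=predict_fig_data, ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'], 'component.png').put(
            ACL='public-read', Body=component_fig_data, ContentType='image/png')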
Example 4
def write_td_table(database_name, table_name):
    import os
    import random

    import pandas as pd
    import pandas_td as td
    import tdclient

    # pandas-td connection for writing the DataFrame
    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])

    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Create the database and table if they do not already exist.
    # Uses https://github.com/treasure-data/td-client-python
    tdc = tdclient.Client(apikey=os.environ['TD_API_KEY'],
                          endpoint=os.environ['TD_API_SERVER'])

    try:
        tdc.create_database(database_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    try:
        tdc.create_log_table(database_name, table_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    table_path = f"{database_name}.{table_name}"
    td.to_td(df, table_path, con, if_exists='replace', index=False)
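A call such as the following then creates everything on first use (the names are illustrative):

write_td_table('sandbox_db', 'random_values')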
Example 5
 def test_append_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame, "test_db.test_table", self.connection, if_exists="append")
     # second call
     to_td(self.frame, "test_db.test_table", self.connection, if_exists="append")
     client.create_log_table.assert_called_once_with("test_db", "test_table")
Example 6
 def test_datetime_is_not_supported(self):
     # mock
     client = self.connection.client
     client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
     client.create_log_table = MagicMock()
     client.import_data = MagicMock()
     # test
     frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
     with self.assertRaises(TypeError):
         to_td(frame, 'test_db.test_table', self.connection)
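These tests pin down that to_td rejects datetime columns with a TypeError. The workaround used by Examples 2 and 3 above (forecast.ds = forecast.ds.apply(str)) is to stringify such columns before upload:

import datetime
import pandas as pd

frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
# Serialize datetimes to strings so the column can be uploaded
frame['timestamp'] = frame['timestamp'].apply(str)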
Example 7
 def test_replace_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame, "test_db.test_table", self.connection, if_exists="replace")
     client.create_log_table.assert_called_with("test_db", "test_table")
     # second call
     to_td(self.frame, "test_db.test_table", self.connection, if_exists="replace")
     client.delete_table.assert_called_with("test_db", "test_table")
     client.create_log_table.assert_called_with("test_db", "test_table")
Example 8
 def test_replace_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='replace')
     client.create_log_table.assert_called_with('test_db', 'test_table')
     # second call
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='replace')
     client.delete_table.assert_called_with('test_db', 'test_table')
     client.create_log_table.assert_called_with('test_db', 'test_table')
Example 9
 def test_ok_if_not_exists(self):
     # mock
     client = self.connection.client
     client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
     client.create_log_table = MagicMock()
     client.import_data = MagicMock()
     # test
     to_td(self.frame, 'test_db.test_table', self.connection)
     client.table.assert_called_with('test_db', 'test_table')
     client.create_log_table.assert_called_with('test_db', 'test_table')
Example 10
 def test_append_if_exists(self):
     # mock
     client = self.connection.client
     client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
     client.create_log_table = MagicMock()
     client.import_data = MagicMock()
     # first call
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
     # second call
     client.table = MagicMock()
     to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
     client.create_log_table.assert_called_once_with('test_db', 'test_table')
Example 11
 def test_append_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame,
           'test_db.test_table',
           self.connection,
           if_exists='append')
     # second call
     to_td(self.frame,
           'test_db.test_table',
           self.connection,
           if_exists='append')
     client.create_log_table.assert_called_once_with(
         'test_db', 'test_table')
Example 12
 def test_replace_if_exists(self):
     client = self.connection.client
     # first call
     to_td(self.frame,
           'test_db.test_table',
           self.connection,
           if_exists='replace')
     client.create_log_table.assert_called_with('test_db', 'test_table')
     # second call
     to_td(self.frame,
           'test_db.test_table',
           self.connection,
           if_exists='replace')
     client.delete_table.assert_called_with('test_db', 'test_table')
     client.create_log_table.assert_called_with('test_db', 'test_table')
Example 13
    def jspca(self):
        import os
        os.system('pip install pandas')
        os.system('pip install scipy')
        os.system('pip install scikit-learn')
        os.system('pip install pandas-td')
        os.system('pip install pyyaml')

        from sklearn.decomposition import PCA
        import pandas as pd
        import pandas_td
        import yaml
        from scipy.spatial.distance import pdist, squareform
        from scipy.stats import entropy

        def _js(_P, _Q):
            _M = 0.5 * (_P + _Q)
            return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

        with open('config/params.yml') as f:
            params = yaml.safe_load(f)

        apikey = os.environ.get("python_apikey")
        dbname = params['dbname']

        connection = pandas_td.connect(apikey=apikey)

        engine = pandas_td.create_engine('presto:{}'.format(dbname),
                                         con=connection)

        df = pandas_td.read_td(
            'select label, lambda from pca_input order by label asc', engine)

        pca = PCA(n_components=2, random_state=0)

        dist = []
        for index, row in df.iterrows():
            dist.append([0 if v is None else v for v in row['lambda'][2:]])

        dist_matrix = squareform(pdist(dist, metric=_js))

        result_df = pd.DataFrame(pca.fit_transform(dist_matrix),
                                 columns=['x', 'y'])

        pandas_td.to_td(result_df,
                        '{}.principal_component'.format(dbname),
                        connection,
                        if_exists='replace')
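For reference, the _js helper above is the Jensen-Shannon divergence, JSD(P, Q) = 0.5 * KL(P||M) + 0.5 * KL(Q||M) with M = 0.5 * (P + Q). A quick standalone check with illustrative distributions:

import numpy as np
from scipy.stats import entropy

P = np.array([0.5, 0.5])
Q = np.array([0.9, 0.1])
M = 0.5 * (P + Q)
jsd = 0.5 * (entropy(P, M) + entropy(Q, M))  # symmetric; bounded by ln(2)
print(jsd)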
Example 14
 def test_invalid_if_exists(self):
     with self.assertRaises(ValueError):
         to_td(
             self.frame, "test_db.test_table", self.connection, if_exists="invalid"
         )
Example 15
 def test_invalid_table_name(self):
     with self.assertRaises(ValueError):
         to_td(self.frame, "invalid", self.connection)
Example 16
 def test_datetime_is_not_supported(self):
     client = self.connection.client
     # test
     frame = pd.DataFrame({"timestamp": [datetime.datetime(2000, 1, 1)]})
     with self.assertRaises(TypeError):
         to_td(frame, "test_db.test_table", self.connection)
Example 17
 def import_frame(self, frame, table):
     td.to_td(frame,
              self.database + '.' + table,
              self.conn,
              if_exists='replace',
              index=False)
Example 18
    def run(self):
        import pandas as pd
        import pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        dbname = self.dbname
        source_table = self.source_table

        engine = td.create_engine('presto:{}'.format(dbname), con=connection)

        # Fetch 25% random sampled data
        df = td.read_td(
            """
            select *
            from {} tablesample bernoulli(25)
            """.format(source_table), engine)
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

        y = df.medv
        X = df.drop(columns=['medv'])

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        feature_importances = pd.DataFrame({
            'column': X.columns,
            'importance': reg.feature_importances_
        })
        td.to_td(feature_importances,
                 'boston.feature_importances',
                 con=connection,
                 if_exists='replace',
                 index=False)

        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        feature_name = df.drop(columns=['medv']).columns[feature_idx]
        selected_features = set(feature_name)

        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            'categorical_columns': categorical_columns,
            'quantitative_columns': quantitative_columns
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag
            digdag.env.store({'feature_query': feature_query})

        except ImportError:
            pass
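`_feature_column_query` is defined elsewhere in that project. Purely as a hypothetical sketch, it could assemble a SELECT column list from the selected features, handling categorical and quantitative columns differently; the cast rule below is an assumption, not the project's actual logic:

    def _feature_column_query(self, selected_features, feature_types):
        # Hypothetical: cast categorical columns to varchar, keep the rest raw
        columns = []
        for column in sorted(selected_features):
            if column in feature_types['categorical_columns']:
                columns.append('cast({0} as varchar) as {0}'.format(column))
            else:
                columns.append(column)
        return ', '.join(columns)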
Example 19
 def test_datetime_is_not_supported(self):
     client = self.connection.client
     # test
     frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
     with self.assertRaises(TypeError):
         to_td(frame, 'test_db.test_table', self.connection)
Example 20
 def test_fail_if_exists(self):
     client = self.connection.client
     client.table = MagicMock()
     with self.assertRaises(RuntimeError):
         to_td(self.frame, 'test_db.test_table', self.connection)
Example 21
# Set API key and start a session
Install pandas-td, set your API key in the environment variable TD_API_KEY, and run "jupyter notebook":

$ pip install pandas-td
$ export TD_API_SERVER="https://api.treasuredata.com/"
$ export TD_API_KEY="1234/abcd..."
$ jupyter notebook

import os
import pandas_td as td

# Create a query engine; the connection is built from TD_API_KEY / TD_API_SERVER
engine = td.create_engine('presto:sample_datasets')

# Alternatively, initialize a connection explicitly
con = td.connect(apikey=os.environ['TD_API_KEY'],
                 endpoint=os.environ['TD_API_SERVER'],
                 retry_post_requests=True)
engine = td.create_engine('presto:sample_datasets', con=con)

# Write a DataFrame, closing the connection when done
# (df is any pandas DataFrame you want to upload)
with td.connect() as con:
    td.to_td(df, 'my_db.test_table', con, if_exists='replace', index=False)

# Import it into 'tutorial.import1'
con = td.connect()
td.to_td(df, 'tutorial.import1', con, if_exists='replace', index=False)
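Reading goes through the same engine object. A minimal sketch (the nasdaq table in sample_datasets also appears in Example 28 below):

import pandas_td as td

con = td.connect()  # picks up TD_API_KEY / TD_API_SERVER
engine = td.create_engine('presto:sample_datasets', con=con)
df = td.read_td('select time, close from nasdaq limit 100', engine)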
Example 22
 def test_invalid_if_exists(self):
     with self.assertRaises(ValueError):
         to_td(self.frame, 'test_db.test_table', self.connection, if_exists='invalid')
Example 23
 def test_invalid_table_name(self):
     with self.assertRaises(ValueError):
         to_td(self.frame, 'invalid', self.connection)
Example 24
def run(with_aws=True):
    # Original code published in the official TensorFlow documentation, under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import os
    import sys
    os.system(f"{sys.executable} -m pip install pandas-td")
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
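`_upload_model` is outside this excerpt. A minimal sketch, assuming the same export-and-upload flow that Example 29 performs inline (EXPORT_DIR_BASE and S3_BUCKET are carried over from there):

def _upload_model(embedded_text_feature_column, estimator):
    import os
    import tarfile
    import boto3
    import tensorflow as tf

    # Build a serving input receiver from the text feature column
    feature_spec = tf.feature_column.make_parse_example_spec(
        [embedded_text_feature_column])
    serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

    # Archive and upload; boto3 reads AWS credentials from the environment
    with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
        tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))
    boto3.resource('s3').Bucket(os.environ['S3_BUCKET']).upload_file(
        'tfmodel.tar.gz', 'tfmodel.tar.gz')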
Example 25
 def test_fail_if_exists(self):
     client = self.connection.client
     client.table = MagicMock()
     with self.assertRaises(RuntimeError):
         to_td(self.frame, "test_db.test_table", self.connection)
Example 26
 def test_invalid_if_exists(self):
     with self.assertRaises(ValueError):
         to_td(self.frame,
               'test_db.test_table',
               self.connection,
               if_exists='invalid')
Example 27
 def test_ok_if_not_exists(self):
     client = self.connection.client
     to_td(self.frame, "test_db.test_table", self.connection)
     client.table.assert_called_with("test_db", "test_table")
     client.create_log_table.assert_called_with("test_db", "test_table")
Example 28
#!/usr/bin/python

import pandas as pd
import pandas_td as td

print("load.py started")

con = td.connect(apikey="TD_APIKEY", endpoint='https://api.treasuredata.com')

# Type: Presto, Database: sample_datasets
engine = td.create_engine('presto:sample_datasets', con=con)

# Read a Treasure Data query into a DataFrame.
df = td.read_td_query('''
SELECT time, close FROM nasdaq LIMIT 100
''',
                      engine,
                      index_col='time',
                      parse_dates={'time': 's'})

print(df.head())

# Write the DataFrame to Treasure Data via streaming import.
# (Not recommended for large datasets.)
td.to_td(df, 'workflow_temp.test_emr', con, if_exists='replace', index=False)

print("load.py finished")
Example 29
def run():
    # Original code published in the official TensorFlow documentation, under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    #os.system("pip install pandas-td tensorflow_hub boto3")

    import os
    import tarfile

    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled    
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export the TF model and upload it to S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        estimator.export_saved_model(EXPORT_DIR_BASE,
                                     serving_input_receiver_fn)

        with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assuming environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')
        # Choose an ACL that matches your use case
        s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz',
                                                       'tfmodel.tar.gz')

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
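`get_predictions` is also not shown here. In the TensorFlow tutorial this code credits, it collects the predicted class ids from estimator.predict; a minimal sketch under that assumption:

def get_predictions(estimator, input_fn):
    # estimator.predict yields one dict per example; for a DNNClassifier,
    # "class_ids" holds the predicted label
    return [x["class_ids"][0] for x in estimator.predict(input_fn=input_fn)]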
Example 30
 def test_ok_if_not_exists(self):
     client = self.connection.client
     to_td(self.frame, 'test_db.test_table', self.connection)
     client.table.assert_called_with('test_db', 'test_table')
     client.create_log_table.assert_called_with('test_db', 'test_table')