def bulk_load(data):
    columns = [
        'time', 'job_id', 'type', 'query', 'status', 'created_at', 'start_at',
        'org_name', 'database', 'user_name'
    ]
    # Collect all rows first and build the DataFrame once;
    # DataFrame.append was removed in pandas 2.0.
    rows = [[item[col] for col in columns] for item in data]
    dataframe = pandas.DataFrame(rows, columns=columns)

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY, endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY, endpoint=TD_API_SERVER,
                     database=TD_DATABASE, writer=writer) as client:
        client.load_table_from_dataframe(dataframe, TD_TABLE, if_exists='append')
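# --- Usage sketch (illustrative, not part of the original script) ---
# Assumes TD_API_KEY, TD_API_SERVER, TD_DATABASE, and TD_TABLE are defined at
# module level, as bulk_load() above expects. The record below is hypothetical;
# any iterable of dicts carrying the ten expected keys works.
def example_bulk_load():
    sample_data = [
        {
            "time": 1609459200,
            "job_id": "12345",
            "type": "presto",
            "query": "select 1",
            "status": "success",
            "created_at": "2021-01-01 00:00:00",
            "start_at": "2021-01-01 00:00:01",
            "org_name": "example_org",
            "database": "sample_datasets",
            "user_name": "example_user",
        }
    ]
    bulk_load(sample_data)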
def _prepare_td_spark() -> TDSparkContext:
    """
    Create a SparkSession in local mode with td-spark specific configurations.

    :return: TDSparkContext
    """
    apikey = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    elif "eu01" in endpoint:
        site = "eu01"

    builder = SparkSession.builder.appName("spark_als")
    td = (
        TDSparkContextBuilder(builder)
        .apikey(apikey)
        .site(site)
        .jars(TDSparkContextBuilder.default_jar_path())
        .build()
    )
    return td
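# --- Usage sketch (illustrative) ---
# Shows how the TDSparkContext returned by _prepare_td_spark() might be used to
# read a TD table into a Spark DataFrame via td-pyspark's table()/df() API.
# Requires TD_API_KEY and TD_API_SERVER in the environment; the table name
# "sample_datasets.www_access" is only an example.
def example_prepare_td_spark():
    td = _prepare_td_spark()
    spark_df = td.table("sample_datasets.www_access").df()
    spark_df.show(5)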
def rss_import(dest_db: str, dest_table: str, rss_url_list):
    # Collect all rows first and build the DataFrame once;
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            rows.append([entry.title, entry.description, entry.link])
    df = pd.DataFrame(rows, columns=['title', 'description', 'link'])

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_APIKEY, endpoint=TD_ENDPOINT,
                         td_spark_path=jar_path)
    client = pytd.Client(apikey=TD_APIKEY, endpoint=TD_ENDPOINT,
                         database=dest_db, writer=writer, engine='presto')
    client.load_table_from_dataframe(df, dest_table, if_exists='append')
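# --- Usage sketch (illustrative) ---
# Assumes TD_APIKEY and TD_ENDPOINT are defined at module level, as rss_import()
# above expects. The feed URL and destination names are placeholders.
def example_rss_import():
    rss_import(
        dest_db="rss_db",
        dest_table="rss_entries",
        rss_url_list=["https://example.com/feed.xml"],
    )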
def get_records(api, basic, org, app_id, database, table, fields, query,
                id_field_code):
    # Load the API list
    api_list = eval(api)

    # Create a connection to Treasure Data
    writer = SparkWriter(td_spark_path=TDSparkContextBuilder.default_jar_path())
    con = td.connect(writer=writer)

    # Loop over the registered apps
    for a in api_list:
        # Pick the app matching app_id
        if a["id"] == app_id:
            # Configure the kintone API request
            url = f"https://{org}.cybozu.com/k/v1/records.json"
            headers = {"X-Cybozu-API-Token": a["key"], "Authorization": basic}
            payload = {
                "app": app_id,
                "query": query,
                "fields": fields,
                "totalCount": "true",
            }
            r = requests.get(url, headers=headers, params=payload)
            count = int(json.loads(r.text)["totalCount"])
            print(count)

            # Fetch the records in chunks of 100 and cache them in a DataFrame
            for i in itertools.islice(range(0, count), 0, None, 100):
                split_query = (query + " order by " + id_field_code +
                               " asc limit 100 offset " + f"{i}")
                print(split_query)
                payload = {"app": app_id, "query": split_query, "fields": fields}
                r = requests.get(url, headers=headers, params=payload)
                if r.status_code != 200:
                    sys.exit(1)
                else:
                    data = json.loads(r.text)
                    df = pd.DataFrame.from_dict(data)
                    df = json_normalize(df["records"])
                    df = df.rename(columns=column_encode)
                    # Store the records fetched from kintone into the TD table
                    td.to_td(
                        df,
                        ".".join([database, table]),
                        con,
                        if_exists="append",
                        index=False,
                    )
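# --- Usage sketch (illustrative) ---
# All argument values below are placeholders. `api` is the string form of a
# list of {"id": ..., "key": ...} entries that get_records() parses; `fields`
# is forwarded as-is to the kintone records API, so it must already be in
# whatever format that endpoint expects. Assumes the module-level helpers used
# above (pytd.pandas_td imported as td, column_encode, etc.) are defined.
def example_get_records():
    get_records(
        api='[{"id": 1, "key": "KINTONE_API_TOKEN"}]',
        basic="Basic xxxxxxxx",
        org="example-org",
        app_id=1,
        database="kintone_db",
        table="app1_records",
        fields=["record_id", "title", "status"],
        query='status = "open"',
        id_field_code="record_id",
    )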
def bulk_load(data):
    # `mp` is assumed to be a module-level mapping whose keys are the
    # destination column names, in the same order as each item's values.
    columns = list(mp.keys())
    # Collect all rows first and build the DataFrame once;
    # DataFrame.append was removed in pandas 2.0.
    rows = [list(item.values()) for item in data]
    dataframe = pandas.DataFrame(rows, columns=columns)

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY, endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY, endpoint=TD_API_SERVER,
                     database=TD_DATABASE, writer=writer) as client:
        client.load_table_from_dataframe(dataframe, TD_TABLE, if_exists='append')
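# --- Usage sketch (illustrative) ---
# This bulk_load() variant relies on a module-level mapping `mp` whose keys are
# the destination column names; each item in `data` must carry its values in
# the same key order. A hypothetical definition and call:
#
#   mp = {"time": None, "event_id": None, "event_name": None}
#
def example_bulk_load_with_mapping():
    records = [{"time": 1609459200, "event_id": "e-1", "event_name": "login"}]
    bulk_load(records)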
def get_row_count(dest_db: str, dest_table: str):
    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_APIKEY, endpoint=TD_ENDPOINT,
                         td_spark_path=jar_path)
    client = pytd.Client(apikey=TD_APIKEY, endpoint=TD_ENDPOINT,
                         database=dest_db, writer=writer, engine='presto')

    # Collect one row per table across all accessible databases, then build
    # the DataFrame once; DataFrame.append was removed in pandas 2.0.
    rows = []
    for db in client.list_databases():
        for table in client.list_tables(db.name):
            rows.append([db.name, table.name, table.count])
    df = pd.DataFrame(rows, columns=['db_name', 'table_name', 'row_count'])

    client.load_table_from_dataframe(df, dest_table, if_exists='append')
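# --- Usage sketch (illustrative) ---
# Assumes TD_APIKEY and TD_ENDPOINT are defined at module level, as
# get_row_count() above expects. Destination names are placeholders.
def example_get_row_count():
    get_row_count(dest_db="monitoring", dest_table="table_row_counts")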
def fetch_td_spark_context(
    apikey=None,
    endpoint=None,
    td_spark_path=None,
    download_if_missing=True,
    spark_configs=None,
):
    """Build TDSparkContext via td-pyspark.

    Parameters
    ----------
    apikey : str, optional
        Treasure Data API key. If not given, a value of environment variable
        ``TD_API_KEY`` is used by default.
    endpoint : str, optional
        Treasure Data API server. If not given,
        ``https://api.treasuredata.com`` is used by default. The list of
        available endpoints is:
        https://tddocs.atlassian.net/wiki/spaces/PD/pages/1085143/Sites+and+Endpoints
    td_spark_path : str, optional
        Path to td-spark-assembly-{td-spark-version}_spark{spark-version}.jar.
        If not given, the path from ``TDSparkContextBuilder.default_jar_path()``
        is used by default.
    download_if_missing : bool, default: True
        Download td-spark if it does not exist at the time of initialization.
    spark_configs : dict, optional
        Additional Spark configurations to be set via ``SparkConf``'s ``set``
        method.

    Returns
    -------
    :class:`td_pyspark.TDSparkContext`
        Connection to td-spark
    """
    try:
        import td_pyspark
        from pyspark.conf import SparkConf
        from pyspark.sql import SparkSession
        from td_pyspark import TDSparkContextBuilder
    except ImportError:
        raise RuntimeError("td_pyspark is not installed")

    apikey = apikey or os.environ.get("TD_API_KEY")
    if apikey is None:
        raise ValueError(
            "either argument 'apikey' or environment variable "
            "'TD_API_KEY' should be set"
        )
    if endpoint is None:
        endpoint = os.getenv("TD_API_SERVER", "https://api.treasuredata.com")

    conf = (
        SparkConf()
        .setMaster("local[*]")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.sql.execution.arrow.pyspark.enabled", "true")
    )
    if isinstance(spark_configs, dict):
        for k, v in spark_configs.items():
            conf.set(k, v)
    builder = TDSparkContextBuilder(SparkSession.builder.config(conf=conf))

    builder.apikey(apikey)

    if td_spark_path is None:
        td_spark_path = TDSparkContextBuilder.default_jar_path()
    else:
        td_spark_path = os.path.expanduser(td_spark_path)

    available = os.path.exists(td_spark_path)
    if not available and download_if_missing:
        download_td_spark(version=td_pyspark.__version__, destination=td_spark_path)
    elif not available:
        raise IOError("td-spark is not found and `download_if_missing` is False")

    builder.jars(td_spark_path)

    plazma_api = os.getenv("TD_PLAZMA_API")
    presto_api = os.getenv("TD_PRESTO_API")

    if plazma_api and presto_api:
        api_regex = re.compile(r"(?:https?://)?(api(?:-.+?)?)\.")
        builder.api_endpoint(api_regex.sub("\\1.", endpoint).strip("/"))
        builder.plazma_endpoint(plazma_api)
        builder.presto_endpoint(presto_api)

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    if "eu01" in endpoint:
        site = "eu01"
    if "ap02" in endpoint:
        site = "ap02"
    builder.site(site)

    try:
        return builder.build()
    except Exception as e:
        raise RuntimeError("failed to connect to td-spark: " + str(e))
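# --- Usage sketch (illustrative) ---
# Shows how fetch_td_spark_context() might be combined with td-pyspark's
# table()/df() API to read a TD table as a Spark DataFrame. Requires the
# TD_API_KEY environment variable (or an explicit apikey argument); the extra
# Spark setting and table name are only examples.
def example_fetch_td_spark_context():
    td = fetch_td_spark_context(
        spark_configs={"spark.driver.memory": "2g"},
    )
    spark_df = td.table("sample_datasets.www_access").df()
    spark_df.printSchema()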
def run_batch(database, input_table, output_table, device, model, vocab, setup,
              batchsize=64):
    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
            # Note: Prediction labels are different from the original Chainer
            #       example: positive: 1, negative: 0
            answers = model.xp.argmax(probs, axis=1)
            scores = probs[model.xp.arange(answers.size), answers].tolist()
        return answers, scores

    td_api_key = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]
    jar_path = TDSparkContextBuilder.default_jar_path()

    logger.info("Connect to Treasure Data")
    con = td.connect()
    presto = td.create_engine(f"presto:{database}", con=con)

    logger.info("Fetch data from Treasure Data")
    test_df = td.read_td(
        f"""
        select
            rowid, sentence, sentiment, polarity
        from
            {input_table}
        """,
        presto,
    )
    sentences = test_df["sentence"].tolist()

    logger.info("Start prediction")
    batch = []
    predicted = []
    i = 1
    for sentence in sentences:
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup["char_based"])
        batch.append(words)
        if len(batch) >= batchsize:
            _predicted, _ = predict_batch(batch)
            predicted.append(_predicted)
            batch = []
            logger.info(f"Predicted: {i}th batch. batch size {batchsize}")
            i += 1
    if batch:
        _predicted, _ = predict_batch(batch)
        predicted.append(_predicted)
    logger.info("Finish prediction")

    test_df["predicted_polarity"] = numpy.concatenate(predicted, axis=None)

    # Note: The train/test split strategy differs between the pre-trained model
    #       and these tables, so the model's training data overlaps with this
    #       test data (the model is trained by the official Chainer example).
    #       This accuracy is just for a demo.
    #
    # accuracy = (test_df.polarity == test_df.predicted_polarity).value_counts()[
    #     1
    # ] / len(test_df)
    # print(f"Test set accuracy: {accuracy}")

    writer = SparkWriter(apikey=td_api_key, endpoint=endpoint, td_spark_path=jar_path)
    con2 = td.connect(apikey=td_api_key, endpoint=endpoint, writer=writer)

    td.to_td(
        test_df[["rowid", "predicted_polarity"]],
        f"{database}.{output_table}",
        con=con2,
        if_exists="replace",
        index=False,
    )
    logger.info("Upload completed")
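# --- Usage sketch (illustrative) ---
# run_batch() expects a trained Chainer model, its vocabulary, and the setup
# dict produced by the Chainer text-classification example. The call below is
# a hypothetical outline; all database/table names are placeholders, and
# model/vocab/setup are assumed to be loaded elsewhere from that example's
# saved artifacts.
def example_run_batch(model, vocab, setup):
    run_batch(
        database="sentiment",
        input_table="movie_review_test",
        output_table="test_predicted_polarities",
        device=-1,  # -1 selects CPU in Chainer's device convention
        model=model,
        vocab=vocab,
        setup=setup,
        batchsize=64,
    )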