from pymldb import Connection mldb = Connection("http://localhost/") mldb.put( '/v1/procedures/import_bench_train_1m', { "type": "import.text", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv", "outputDataset": "bench_train_1m", "runOnCreation": True } }) mldb.put( '/v1/procedures/import_bench_test', { "type": "import.text", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/test.csv", "outputDataset": "bench_test", "runOnCreation": True } }) mldb.put( '/v1/procedures/benchmark', { "type": "classifier.experiment", "params": { "experimentName": "benchm_ml", "inputData": """ select
import sys
from random import randint
from pymldb import Connection

mldb = Connection("http://localhost:8080")

# Read candidate image URLs, one per line.  Use a context manager so the
# file handle is closed deterministically (the original leaked it).
with open('./url_list.txt', 'r') as f:
    url_list = f.read().splitlines()

# Select the URL by command-line index; the commented alternative picks one
# at random instead.
# url = url_list[randint(0, len(url_list) - 1)]
url = url_list[int(sys.argv[1])]

# Run the image through the 'inception' MLDB function, transpose the softmax
# output into rows, and join against the imagenet_labels dataset so each
# score is named by its human-readable label.  Top 10 by score.
# NOTE(review): the URL is spliced into the SQL via %-formatting; a URL
# containing a single quote breaks (or alters) the query.  Escape or
# validate entries if url_list.txt is not trusted.
result = mldb.query("""
    SELECT scores.pred as score
    NAMED imagenet_labels.label
    FROM transpose(
        (
            SELECT flatten(inception({url: '%s'})[softmax]) as *
            NAMED 'pred'
        )
    ) AS scores
    LEFT JOIN imagenet_labels ON imagenet_labels.rowName() = scores.rowName()
    ORDER BY score DESC
    LIMIT 10
""" % url)

print(url)
# result is a pymldb query result (pandas-like): index holds the label
# names, 'score' the softmax values; [0] is the top-1 entry.
print(result.index[0])
print(result['score'][0])
from pymldb import Connection from pymldb.resource import ResourceError from rec.settings import HOST, PREFIX def title(title, linechar='-'): return '\n' + ' {} '.format(title).center(80, linechar) + '\n' if __name__ == '__main__': if len(sys.argv) > 1: whats = sys.argv[1:] else: whats = ['datasets', 'procedures', 'functions'] mldb = Connection(HOST) prefix = PREFIX for what in whats: print(title(what.upper(), '=')) for x in getattr(mldb, what).get(): if x.startswith(prefix): print(title(x)) print('CONFIG') pprint(getattr(mldb, what)(x).get_query()) if what == 'functions': print('INFO') pprint(getattr(mldb, what)(x).info.get_query()) if what == 'datasets': try: head = DataFrame( getattr(mldb,
from pprint import pprint
from pymldb import Connection
import rec.settings as _


def run_score_pipeline(mldb):
    """Query the actions dataset with the scoring function applied and pretty-print the rows."""
    # Earlier pipeline steps, kept for reference:
    # mldb.procedures(_.DATASET_MANAGER_TEST).runs.post_json({})
    # mldb.procedures(_.FEATURE_GENERATION_TEST).runs.post_json({})
    # r = mldb.datasets(_.TEST_FEATS_DATASET).query.get_query(
    #     select='APPLY FUNCTION {} WITH(object(select *) as features) EXTRACT(*)'.format(_.SCORE),
    #     limit=100,
    #     format='aos')
    # pprint(r)
    rows = mldb.datasets(_.ACTIONS_DATASET).query.get_query(
        format='aos',
        limit=10,
        select='user_id, apply function {} with(*) extract(*), apply function')
    pprint(rows)


if __name__ == '__main__':
    connection = Connection(_.HOST)
    run_score_pipeline(connection)
import sys
from pymldb import Connection
import rec.settings as _
from pprint import pprint

if __name__ == '__main__':
    # With a dataset name argument: show the first row of that dataset.
    # Without arguments: list every dataset on the server.
    mldb = Connection(_.HOST)
    if len(sys.argv) > 1:
        name = sys.argv[1]
        rows = mldb.datasets(name).query.get_query(limit=1, format='table')
        pprint(rows[0])
    else:
        pprint(mldb.datasets.get())
from pymldb import Connection mldb = Connection()#starts connection with mldb mldb.put('/v1/datasets/raw_data', { #loads in raw data "type":"text.csv.tabular", "params": { "dataFileUrl":"file:///mldb_data/sample.csv", 'delimiter':'', 'quotechar':'' } }) mldb.put('/v1/procedures/sparse_matrix',{ "type":"transform", "params":{ "inputData":"select tokenize(lineText,{offset:1, value:1}) as * from raw_data", "outputDataset":"sparse_matrix", "runOnCreation":True } }) mldb.put('/v1/procedures/svd_matrix', { "type" : "svd.train", "params" : { "trainingData" : """ SELECT COLUMN EXPR (AS columnName() ORDER BY rowCount() DESC, columnName() LIMIT 4000) FROM sparse_matrix """, "columnOutputDataset" : "location_svd_embedding", "modelFileUrl" : "file://svd/svd_matrix.svd", "functionName": "location_svd_embedder",
dataFileUrl='file://' + filename, encoding='us-ascii')) if __name__ == '__main__': parser = argparse.ArgumentParser('load data from files into mldb') tsv_gz_help = '.tsv.gz file, path relative to mldb data dir' parser.add_argument('--sdb-dump', required=True, help='the big .gz file') # '(path relative to mldb data dir)') parser.add_argument('--users', required=True, help=tsv_gz_help) parser.add_argument('--events', required=True, help=tsv_gz_help) parser.add_argument('--purchases', required=True, help=tsv_gz_help) args = parser.parse_args() # logging.basicConfig(level=logging.INFO) mldb = Connection(_.HOST) dataset = mldb.create_dataset(dataset_conf(_.ACTION_DATASET, 'beh.mutable')) from multiprocessing import Pool, Process import signal signal.signal(signal.SIGINT, signal.SIG_DFL) n = 8 def make_keep_if(i,n): def foo(row): return row % n == i return foo # load_sdb(mldb, args.sdb_dump, dataset, None, None)
from pymldb import Connection mldb = Connection("http://localhost:8080") inceptionUrl = 'http://public.mldb.ai/models/inception_dec_2015.zip' mldb.put('/v1/functions/fetch', {"type": 'fetcher', "params": {}}) print("done") mldb.put( '/v1/functions/inception', { "type": 'tensorflow.graph', "params": { "modelFileUrl": 'archive+' + inceptionUrl + '#tensorflow_inception_graph.pb', "inputs": 'fetch({url})[content] AS "DecodeJpeg/contents"', "outputs": "softmax" } }) print("done") mldb.put( "/v1/procedures/imagenet_labels_importer", { "type": "import.text", "params": { "dataFileUrl": 'archive+' + inceptionUrl + '#imagenet_comp_graph_label_strings.txt', "outputDataset": { "id": "imagenet_labels", "type": "sparse.mutable" },
if __name__ == '__main__':
    # Parse the train/test cutoff date and training-set size, then build the
    # user/item sparse matrix dataset via an MLDB 'transform' procedure.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--date', '-d', required=True,
        help='date YYYY-MM-DD that separates training and testing')
    parser.add_argument(
        '--nb-train', '-n', type=int, default=2000,
        help='nb training examples')
    args = parser.parse_args()

    mldb = Connection(_.HOST)

    # missing bit of datasets, the user/item sparse matrix
    where_commun = ["verb='rate'"]
    shared_params = {
        'select': 'scatter(item_id, compl) AS *',
        'groupBy': 'user_id',
        'rowName': 'user_id',
        'inputDataset': dataset_conf(_.ACTION_DATASET),
    }

    # the one for training the SVD, only on the train set
    train_where = ' AND '.join(where_commun + get_where_train(args.date))
    mldb.create_procedure(
        _.USER_ITEM_DATASET_MAKER, 'transform',
        where=train_where,
        outputDataset=dataset_conf(_.USER_ITEM_DATASET, 'beh.mutable'),
        **shared_params)
def __init__(self, port=8080, pool=all_candidates, depth=False):
    """Connect to a local MLDB instance and prepare the predictor.

    port  -- MLDB HTTP port on localhost
    pool  -- candidates to analyse (defaults to all_candidates)
    depth -- if truthy, caps how many rows are processed per candidate
    """
    self.mldb = Connection(host="http://localhost:{0}".format(port))
    # Load the SentiWordNet tables into MLDB before anything queries them.
    self.set_wordnet()
    self.depth = depth
    self.candidates = pool
class Candidate_Predictor:
    """Predicts per-state candidate support from tweet sentiment.

    Tweets are scored against a SentiWordNet dataset loaded into MLDB;
    per-state sentiment totals are then fed into a linear regression
    against observed results.
    """

    # NOTE(review): these are class-level attributes shared by every
    # instance; candidate_favor in particular is mutable and would leak
    # state across instances if more than one predictor were created.
    candidates = []
    candidate_favor = {}
    mldb = None
    theta = []
    depth = False

    def __init__(self, port=8080, pool=all_candidates, depth=False):
        """Connect to MLDB on localhost:<port> and load the sentiment tables.

        port  -- MLDB HTTP port on localhost
        pool  -- candidates to analyse (defaults to all_candidates)
        depth -- if truthy, caps how many tweet rows are processed per candidate
        """
        self.mldb = Connection(host="http://localhost:{0}".format(port))
        self.set_wordnet()
        self.candidates = pool
        self.depth = depth

    # Tickles SentiWordnet, removing POS data
    def set_wordnet(self):
        """Import SentiWordNet and build the cleaned, per-word sentiment datasets."""
        self.mldb.put('/v1/procedures/sentiwordneter', {
            "type": "import.sentiwordnet",
            "params": {
                "dataFileUrl": "file:///mldb_data/SentiWordNet_3.0.0_20130122.txt",
                "outputDataset": "sentiwordnet",
                "runOnCreation": True
            }
        })
        # Strip the part-of-speech suffix ("word#pos" -> "word").
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """
                    select *, jseval('
                        return x.split("#")[0];
                    ', 'x', rowName()) as baseWord
                    from sentiwordnet
                """,
                "outputDataset": "senti_clean",
                "runOnCreation": True
            }
        })
        # NOTE(review): this PUT reuses the procedure name 'baseWorder',
        # overwriting the one above.  It still works because both run on
        # creation, but a distinct name would be clearer.
        # Aggregate per base word: avg/min/max sentiment plus a count.
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """
                    select avg({* EXCLUDING(baseWord)}) as avg,
                           min({* EXCLUDING(baseWord)}) as min,
                           max({* EXCLUDING(baseWord)}) as max,
                           count(*) as cnt
                    NAMED baseWord
                    from senti_clean
                    group by baseWord
                    order by cnt desc
                """,
                "outputDataset": "senti_clean2",
                "runOnCreation": True
            }
        })

    # check sentiment of sentence
    def return_sent(self, sentence):
        """Return the summed (positive - negative) sentiment of the words in *sentence*."""
        # remove quotes because it messes with query
        no_quote = sentence.replace("'", '')
        split = list(set(no_quote.split(' ')))
        join = "','".join(split)
        sent_sent = self.mldb.query(
            "select avg* from senti_clean2 where rowName() in ('{0}')".format(join))
        overall_senti = 0
        # NOTE(review): only 'avg.NegSenti' is checked before also indexing
        # 'avg.PosSenti'; if a result ever has one column without the other
        # this raises KeyError — confirm against the senti_clean2 schema.
        if 'avg.NegSenti' in sent_sent.keys():
            for word in sent_sent['avg.NegSenti'].keys():
                overall_senti += sent_sent['avg.PosSenti'][word] - sent_sent['avg.NegSenti'][word]
        return overall_senti

    # run tweet csvs about candidates through sentiment analysis
    def run_candidates(self):
        """Score every candidate's tweet CSV and accumulate sentiment per state."""
        for candidate in self.candidates:
            states = {}
            counter = 0
            # NOTE(review): 'rb' with csv.reader is the Python 2 idiom;
            # under Python 3 this file must be opened in text mode
            # (newline='') instead — confirm target interpreter.
            with open('data/{0}.csv'.format(candidate), 'rb') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
                for row in spamreader:
                    if self.depth and counter > self.depth:
                        break  # depth acts as a per-candidate row cap
                    elif len(row) == 2 and row[1]:
                        state = normalize_state_name(row[1])
                        if state is None:
                            pass  # unrecognized state: skip the row
                        else:
                            overall_senti = self.return_sent(row[0])
                            if state not in states:
                                states[state] = overall_senti
                            else:
                                states[state] = states[state] + overall_senti
                            counter += 1
                    else:
                        pass  # malformed row: skip
            self.candidate_favor[candidate] = states

    # Returns the sentiment value stored in the database
    def get_sentiment_value(self, candidate, state):
        """Return the accumulated sentiment for (candidate, state)."""
        # FIX: the original computed this value but never returned it,
        # so every caller received None.
        return self.candidate_favor[all_candidates[candidate]][state]

    # Gets all the results stored in the database
    def get_results(self):
        """Return observed results as [[CANDIDATE_ID, STATE_ID, VOTE_PERCENTAGE], ...] (stub)."""
        # Format: [[CANDIDATE_ID, STATE_ID, VOTE_PERCENTAGE], ...]
        pass

    # Calculates the parameteres using normal equations
    def calculate_params(self):
        """Fit theta by linear regression of sentiment against vote percentage."""
        inp = []
        out = []
        results = self.get_results()
        for r in results:
            candidate = r[0]
            state = r[1]
            # TODO: More inputs
            inp.append(self.get_sentiment_value(candidate, state))
            # FIX: rows are documented as 3-element [candidate, state, pct],
            # so the vote percentage is r[2]; r[3] would raise IndexError.
            out.append(r[2])
        # Linear regression
        # NOTE(review): np.multiply is element-wise; the normal equations
        # (X^T X)^-1 X^T y require matrix products (np.dot) — this as
        # written does not compute a least-squares fit.  Left as-is
        # pending get_results() being implemented.
        x = np.array(inp)
        self.theta = np.multiply(
            np.multiply(np.linalg.inv(np.multiply(np.transpose(x), x)),
                        np.transpose(x)),
            out)

    # Trains all the sentiment values based on the expected results
    def train(self):
        """Run the full pipeline: score tweets, then fit the regression."""
        self.run_candidates()
        self.calculate_params()

    # Predicts the situation for a given list of candidates for a specific state
    # Returns a map of the percentage each candidate is predicted to have
    def predict(self, state):
        """Return {candidate: predicted support} for *state* using the fitted theta."""
        results = {}
        for candidate in self.candidates:
            inp = [self.get_sentiment_value(candidate, state)]
            # TODO: WTF
            results[candidate] = np.multiply(self.theta, inp)
        return results

    def save(self, file='sentiment.csv'):
        """Write per-candidate, per-state sentiment totals to a CSV file."""
        # NOTE(review): list + dict_values concatenation fails on Python 3;
        # would need list(state_code.values()) there — confirm interpreter.
        with open(file, 'w') as csvfile:
            fieldnames = ['candidate'] + state_code.values()
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for candidate in self.candidates:
                # Mutates the stored dict to inject the candidate name column.
                self.candidate_favor[candidate]['candidate'] = candidate
                writer.writerow(self.candidate_favor[candidate])
## https://github.com/szilard/benchm-ml/issues/25 ## from @nicolaskruchten, thanks :) ## This code gives an AUC of 0.7431 in 19.1s for the 1M training set on an r3.8xlarge EC2 instance ## with the latest release of Datacratic's Machine Learning Database (MLDB), available at http://mldb.ai/ from pymldb import Connection mldb = Connection("http://localhost/") mldb.v1.datasets("bench-train-1m").put({ "type": "text.csv.tabular", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv" } }) mldb.v1.datasets("bench-test").put({ "type": "text.csv.tabular", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/test.csv" } }) mldb.v1.procedures("benchmark").put({ "type": "classifier.experiment", "params": { "experimentName": "benchm_ml", "training_dataset": { "id": "bench-train-1m" },