# NOTE(review): standalone pymldb script (fragment 1 of several concatenated in this
# file). Imports the benchm-ml train-1m and test CSVs from S3 into the MLDB datasets
# bench_train_1m / bench_test (both with runOnCreation), then begins defining a
# classifier.experiment procedure. The line is cut off inside the unterminated
# triple-quoted "inputData" SQL string — the remainder of that put() call is not
# visible here, so the fragment is left byte-identical.
from pymldb import Connection mldb = Connection("http://localhost/") mldb.put( '/v1/procedures/import_bench_train_1m', { "type": "import.text", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv", "outputDataset": "bench_train_1m", "runOnCreation": True } }) mldb.put( '/v1/procedures/import_bench_test', { "type": "import.text", "params": { "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/test.csv", "outputDataset": "bench_test", "runOnCreation": True } }) mldb.put( '/v1/procedures/benchmark', { "type": "classifier.experiment", "params": { "experimentName": "benchm_ml", "inputData": """ select
# NOTE(review): standalone pymldb script (fragment 2). Sets up image classification
# with the Inception (Dec 2015) TensorFlow model on MLDB: creates a 'fetcher'
# function, a tensorflow.graph function reading the graph .pb out of the zip via
# the archive+ URL scheme (feeding fetched bytes into "DecodeJpeg/contents" and
# exposing the "softmax" output), then starts importing the ImageNet label strings.
# The line is cut off mid-way through the imagenet_labels_importer put() call
# (unterminated dict), so the fragment is left byte-identical.
from pymldb import Connection mldb = Connection("http://localhost:8080") inceptionUrl = 'http://public.mldb.ai/models/inception_dec_2015.zip' mldb.put('/v1/functions/fetch', {"type": 'fetcher', "params": {}}) print("done") mldb.put( '/v1/functions/inception', { "type": 'tensorflow.graph', "params": { "modelFileUrl": 'archive+' + inceptionUrl + '#tensorflow_inception_graph.pb', "inputs": 'fetch({url})[content] AS "DecodeJpeg/contents"', "outputs": "softmax" } }) print("done") mldb.put( "/v1/procedures/imagenet_labels_importer", { "type": "import.text", "params": { "dataFileUrl": 'archive+' + inceptionUrl + '#imagenet_comp_graph_label_strings.txt', "outputDataset": { "id": "imagenet_labels", "type": "sparse.mutable" },
# NOTE(review): standalone pymldb script (fragment 3). Loads a raw CSV into MLDB,
# tokenizes each line into a sparse term matrix ("sparse_matrix"), then begins an
# svd.train over the 4000 most frequent columns, embedding columns into
# "location_svd_embedding" and persisting the model to file://svd/svd_matrix.svd.
# The line is cut off inside the svd.train parameter dict (ends at
# "functionName": ...,), so the fragment is left byte-identical. Also note the
# empty 'delimiter' / 'quotechar' — presumably deliberate, to ingest each whole
# line as one cell ("lineText"); confirm against the MLDB import docs.
from pymldb import Connection mldb = Connection()#starts connection with mldb mldb.put('/v1/datasets/raw_data', { #loads in raw data "type":"text.csv.tabular", "params": { "dataFileUrl":"file:///mldb_data/sample.csv", 'delimiter':'', 'quotechar':'' } }) mldb.put('/v1/procedures/sparse_matrix',{ "type":"transform", "params":{ "inputData":"select tokenize(lineText,{offset:1, value:1}) as * from raw_data", "outputDataset":"sparse_matrix", "runOnCreation":True } }) mldb.put('/v1/procedures/svd_matrix', { "type" : "svd.train", "params" : { "trainingData" : """ SELECT COLUMN EXPR (AS columnName() ORDER BY rowCount() DESC, columnName() LIMIT 4000) FROM sparse_matrix """, "columnOutputDataset" : "location_svd_embedding", "modelFileUrl" : "file://svd/svd_matrix.svd", "functionName": "location_svd_embedder",
class Candidate_Predictor:
    """Predict per-state favour for election candidates from tweet sentiment.

    Tweet text is scored against SentiWordNet through a local MLDB
    instance; per-state sentiment totals are then mapped to vote share
    with a linear regression (normal equations).

    NOTE(review): parts of this class are Python-2 era (csv on a file
    opened 'rb' in run_candidates, list + dict.values() concatenation in
    save) and parts are unfinished stubs (get_results). Hedged TODOs mark
    each such spot.
    """

    # Class-level defaults kept for backward compatibility with any code
    # that reads them off the class. __init__ now also creates fresh
    # per-instance copies of the mutable ones — fix: originally every
    # instance shared (and mutated) the same candidate_favor dict and
    # theta list.
    candidates = []
    candidate_favor = {}
    mldb = None
    theta = []
    depth = False

    def __init__(self, port=8080, pool=all_candidates, depth=False):
        """Connect to MLDB on localhost:`port` and import SentiWordNet.

        pool  -- candidates to analyse (defaults to the module-level
                 all_candidates).
        depth -- max number of scored tweets per candidate, or False
                 for no limit.
        """
        self.mldb = Connection(host="http://localhost:{0}".format(port))
        self.set_wordnet()
        self.candidates = pool
        self.depth = depth
        # Fix: give each instance its own mutable state instead of
        # sharing the class attributes above.
        self.candidate_favor = {}
        self.theta = []

    # Tickles SentiWordnet, removing POS data
    def set_wordnet(self):
        """Import SentiWordNet into MLDB and build the lookup datasets.

        Pipeline: sentiwordnet (raw import) -> senti_clean (adds a
        baseWord column with the "#<pos>" suffix stripped) -> senti_clean2
        (avg/min/max sentiment and count, grouped by baseWord).
        """
        self.mldb.put('/v1/procedures/sentiwordneter', {
            "type": "import.sentiwordnet",
            "params": {
                "dataFileUrl": "file:///mldb_data/SentiWordNet_3.0.0_20130122.txt",
                "outputDataset": "sentiwordnet",
                "runOnCreation": True
            }
        })
        # Strip the part-of-speech suffix ("word#pos" -> "word").
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """ select *, jseval(' return x.split("#")[0]; ', 'x', rowName()) as baseWord from sentiwordnet """,
                "outputDataset": "senti_clean",
                "runOnCreation": True
            }
        })
        # NOTE(review): this reuses the procedure id "baseWorder" and so
        # overwrites the procedure above. Both transforms still execute
        # (runOnCreation), but the ids should probably be distinct.
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """ select avg({* EXCLUDING(baseWord)}) as avg, min({* EXCLUDING(baseWord)}) as min, max({* EXCLUDING(baseWord)}) as max, count(*) as cnt NAMED baseWord from senti_clean group by baseWord order by cnt desc """,
                "outputDataset": "senti_clean2",
                "runOnCreation": True
            }
        })

    # check sentiment of sentence
    def return_sent(self, sentence):
        """Return the summed (positive - negative) sentiment of the
        distinct words in `sentence`, per senti_clean2 averages."""
        # remove quotes because it messes with query
        no_quote = sentence.replace("'", '')
        split = list(set(no_quote.split(' ')))
        join = "','".join(split)
        sent_sent = self.mldb.query("select avg* from senti_clean2 where rowName() in ('{0}')".format(join))
        overall_senti = 0
        # Only score when the query matched something (NegSenti column
        # present); PosSenti is assumed to exist alongside it.
        if 'avg.NegSenti' in sent_sent.keys():
            for word in sent_sent['avg.NegSenti'].keys():
                overall_senti += sent_sent['avg.PosSenti'][word] - sent_sent['avg.NegSenti'][word]
        return overall_senti

    # run tweet csvs about candidates through sentiment analysis
    def run_candidates(self):
        """Score each candidate's tweet CSV, filling self.candidate_favor.

        Expects data/<candidate>.csv rows of (tweet_text, state); sums
        sentiment per normalized state name, honouring self.depth as a
        per-candidate tweet cap.
        """
        for candidate in self.candidates:
            states = {}
            counter = 0
            # NOTE(review): 'rb' + csv.reader is the Python 2 convention;
            # under Python 3 this must be open(..., newline='').
            with open('data/{0}.csv'.format(candidate), 'rb') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
                for row in spamreader:
                    if self.depth and counter > self.depth:
                        break
                    elif len(row) == 2 and row[1]:
                        state = normalize_state_name(row[1])
                        if state is None:
                            # Unrecognised state: skip the tweet.
                            pass
                        else:
                            overall_senti = self.return_sent(row[0])
                            if state not in states:
                                states[state] = overall_senti
                            else:
                                states[state] = states[state] + overall_senti
                            counter += 1
                    else:
                        # Malformed row: skip.
                        pass
            self.candidate_favor[candidate] = states

    # Returns the sentiment value stored in the database
    def get_sentiment_value(self, candidate, state):
        """Return the stored sentiment for (candidate, state)."""
        # Fix: the original computed this expression but never returned
        # it, so callers always received None.
        # NOTE(review): run_candidates keys candidate_favor by the pool
        # entries themselves, while this lookup goes through
        # all_candidates[candidate] — so `candidate` here is presumably
        # an index/key into all_candidates; verify against callers.
        return self.candidate_favor[all_candidates[candidate]][state]

    # Gets all the results stored in the database
    def get_results(self):
        """Stub: return known election results.

        Format: [[CANDIDATE_ID, STATE_ID, VOTE_PERCENTAGE], ...]
        TODO(review): unimplemented — returns None, which makes
        calculate_params() fail; implement before calling train().
        """
        pass

    # Calculates the parameteres using normal equations
    def calculate_params(self):
        """Fit self.theta via normal equations: theta = (X'X)^-1 X' y.

        TODO(review): currently dead code as written — get_results()
        returns None, r[3] indexes past the documented 3-element rows
        (should be r[2]), and np.multiply is element-wise where the
        normal equations need matrix products (np.dot / np.linalg.pinv).
        Left untouched pending a real get_results() implementation.
        """
        inp = []
        out = []
        results = self.get_results()
        for r in results:
            candidate = r[0]
            state = r[1]
            # TODO: More inputs
            inp.append(self.get_sentiment_value(candidate, state))
            out.append(r[3])
        # Linear regression
        x = np.array(inp)
        self.theta = np.multiply(np.multiply(np.linalg.inv(np.multiply(np.transpose(x), x)), np.transpose(x)), out)

    # Trains all the sentiment values based on the expected results
    def train(self):
        """Collect sentiment, then fit the regression parameters."""
        self.run_candidates()
        self.calculate_params()

    # Predicts the situation for a given list of candidates for a specific state
    # Returns a map of the percentage each candidate is predicted to have
    def predict(self, state):
        """Return {candidate: predicted value} for `state`."""
        results = {}
        for candidate in self.candidates:
            inp = [self.get_sentiment_value(candidate, state)]
            # TODO: WTF — np.multiply is element-wise here, mirroring the
            # questionable math in calculate_params.
            results[candidate] = np.multiply(self.theta, inp)
        return results

    def save(self, file='sentiment.csv'):
        """Write candidate_favor to `file`, one CSV row per candidate."""
        with open(file, 'w') as csvfile:
            # NOTE(review): list + dict.values() concatenation only works
            # on Python 2; use list(state_code.values()) under Python 3.
            fieldnames = ['candidate'] + state_code.values()
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for candidate in self.candidates:
                self.candidate_favor[candidate]['candidate'] = candidate
                writer.writerow(self.candidate_favor[candidate])