import graphlab as gl


def set_libodbc_path(path):
    """
    Set the first path that GraphLab Create will search for libodbc.so.

    Since ODBC requires a driver manager to be installed system-wide, we
    provide this to help you if it is installed in a non-standard location.
    GraphLab Create will also search the system's default library paths, so
    if you installed your driver manager in a standard way, you shouldn't
    need to worry about this function.
    """
    gl.set_runtime_config('GRAPHLAB_LIBODBC_PREFIX', path)
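# Usage example (the prefix below is hypothetical -- point it wherever
# your driver manager actually installed libodbc.so):
#
#   set_libodbc_path('/opt/unixodbc/lib')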
import os
import sys
import time


class VersionError(Exception):
    # Assumed minimal definition: the original excerpt raised VersionError
    # without defining or importing it.
    pass


def load_graphlab():
    if sys.version_info >= (3, 0):
        raise VersionError("Graphlab is only available in Python 2")
    start = time.clock()
    import graphlab  # noqa
    gl_product_key = os.getenv('GLCREATE_PRODUCT_KEY', False)
    if not gl_product_key:
        print("Please set GLCREATE_PRODUCT_KEY")
        return
    graphlab.product_key.set_product_key(gl_product_key)
    # Display graphlab canvas in notebook
    graphlab.canvas.set_target('ipynb')
    # Number of workers
    graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 16)
    since = time.clock() - start
    print("Graphlab loaded in {:.3f} seconds".format(since))
    return graphlab
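# Usage sketch (assumes a valid key is exported in the shell first,
# e.g. export GLCREATE_PRODUCT_KEY=<your key>):
#
#   graphlab = load_graphlab()
#   if graphlab is not None:
#       sf = graphlab.SFrame({'x': [1, 2, 3]})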
import graphlab as gl
import numpy as np
from scipy.spatial.distance import cosine, euclidean
import time
import datetime
from operator import itemgetter
import itertools
import math
import multiprocessing as mp

gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 32)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 32)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 100000000000)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 100000000000)

tfidf = False
n_iter = 50
random_sample_size = 5000000
k_range = np.arange(285, 301, 5)
filename = 'tf'
np.random.seed(99)
n_cores = 16

docs = gl.SArray("doc_array")
if tfidf:
    docs = gl.text_analytics.tf_idf(docs)
    # apply() returns a new SArray; the original code discarded the result,
    # so the rounding never took effect. Assign it back.
    docs = docs.apply(lambda row: {k: round(v) + 1 for k, v in row.iteritems()})

train, test = gl.text_analytics.random_split(docs, 0.1)
train.save("train_data_" + filename)
test.save("test_data_" + filename)
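# Hedged sketch of the presumable next step: sweep k_range with GraphLab's
# topic model (gl.topic_model.create is the real API; the loop itself is
# an assumption based on k_range and n_iter above):
#
#   for k in k_range:
#       m = gl.topic_model.create(train, num_topics=int(k),
#                                 num_iterations=n_iter)
#       print k, m.evaluate(train, test)['perplexity']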
import graphlab as gl
import time

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 30)

dpath = '/mnt/data/'


def load_data(dpath, maxn=None):
    cites = gl.SFrame.read_csv(dpath + 'cites.csv', column_type_hints=[int, int])
    paper = gl.SFrame.read_csv(dpath + 'papers.csv', column_type_hints=[int, int])
    if maxn is not None:
        paper = paper[paper['id'] < maxn]
        cites = cites[cites.apply(lambda x: x['p1'] < maxn and x['p2'] < maxn)]
    sg = gl.SGraph(vertices=paper, edges=cites, vid_field='id',
                   src_field='p1', dst_field='p2')
    return sg


def findp_update_fn(src, edge, dst):
    pdst = dst['parent']
    psrc = src['parent']
    for pid, d in pdst.iteritems():
        if pid not in psrc:
            psrc[pid] = d + 1
            src['changed'] = True
    # The excerpt cut off here; the write-back and return below are assumed,
    # mirroring childs_update_fn elsewhere in this collection (triple_apply
    # requires the triple to be returned).
    src['parent'] = psrc
    return (src, edge, dst)
    return merged_sf


def eval_model(model, test, col):
    '''Evaluate a trained model using Kaggle scoring.'''
    return log_loss_raw(test[col],
                        model.predict(test, output_type='probability'))


def log_loss_raw(target, predicted):
    '''Calculate log_loss between target and predicted and return.'''
    p = predicted.apply(lambda x: min(0.99999, max(1e-5, x)))
    logp = p.apply(lambda x: math.log(x))
    logmp = p.apply(lambda x: math.log(1 - x))
    return -(target * logp + (1 - target) * logmp).mean()


if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)
    parser = argparse.ArgumentParser(description='Spectral Features Preprocessing')
    parser.add_argument('-cf', '--clusteral_features',
                        help='Input File containing clusteral features', required=True)
    parser.add_argument('-lf', '--labels_file', help='Ground Truth labels file',
                        required=True)
    parser.add_argument('-of', '--output_file', help='Output file', required=True)
    parser.add_argument('-cfk', '--clusteral_key_column', required=True)
    parser.add_argument('-lfk', '--labels_key_column', required=True)
    parser.add_argument('-lfv', '--labels_value_column', required=True)
    parser.add_argument('-i', '--interaction', required=True)
    parser.add_argument('-j', '--join_type', required=False)
    parser.add_argument('-e', '--encode', required=False)
    args = parser.parse_args()
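# Quick worked check of log_loss_raw (hypothetical values; assumes
# `target` holds 0/1 labels and `predicted` holds probabilities):
#
#   import graphlab as gl
#   target = gl.SArray([1, 0])
#   predicted = gl.SArray([0.9, 0.2])
#   log_loss_raw(target, predicted)
#   # -> -(log(0.9) + log(0.8)) / 2 ~= 0.164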
# processed/ directories -- where you ran the prep_image.sh. It will
# put an image-sframes/ directory with train and test SFrames in the
# save_path location below.
# os.chdir('/home/pablo/Kaggle/kaggle-train')
# preprocessed_image_path = "processed/"

preprocessed_image_path = "processed/"
save_train = False
save_test = True

print "current working directory = %s" % os.getcwd()

save_path = "./"

gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS",
                      os.path.expanduser("~/data/tmp/"))
# gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS",
#                       "/media/pablo/OS/Users/Pablo/Downloads/Kaggle/graphlab-cache/")

print "loading images"

# The following statements take about 400 seconds on Pablo's notebook.
# shuffle the training images
X = gl.image_analysis.load_images(preprocessed_image_path)
X["is_train"] = X["path"].apply(lambda p: "train" in p)

# Add in all the relevant information in places
source_f = lambda p: re.search("run-(?P<source>[^/]+)", p).group("source")
X["source"] = X["path"].apply(source_f)

extract_name = lambda p: re.search("[0-9]+_(right|left)", p).group(0)
X["name"] = X["path"].apply(extract_name)

X_train = X[X["is_train"] == True]
import random
from copy import copy
import os
import array
import sys

import graphlab as gl
import graphlab.aggregate as agg

model_name = "pooling-2"
which_model = 0

print "Running model %d, %s" % (which_model, model_name)

alt_path = os.path.expanduser("~/data/tmp/")
if os.path.exists(alt_path):
    gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", alt_path)

model_path = "nn_256x256/models/model-%d-%s/" % (which_model, model_name)
model_filename = model_path + "nn_model"

X_train = gl.SFrame("image-sframes/train-%d/" % which_model)
X_valid = gl.SFrame("image-sframes/validation-%d/" % which_model)
X_test = gl.SFrame("image-sframes/test/")

################################################################################
# init_random vs random_type in ConvolutionLayer.

dll = gl.deeplearning.layers
nn = gl.deeplearning.NeuralNet()
parser.add_argument('-i', '--train', help='Input training matrix', required=True)
parser.add_argument('-t', '--test', help='test data matrix', required=True)
parser.add_argument('-d', '--modeldir', help='Directory to save the model',
                    default='svmmodel')
parser.add_argument('-r', '--report', help='report file', default='report.txt')
args = parser.parse_args()

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/scratch')

test = gl.SFrame.read_csv(args.test, delimiter='\t', header=False)
train = gl.SFrame.read_csv(args.train, delimiter='\t', header=False)
file_report = open(args.report, 'w')
test.save('test_sframe')
train.save('train_sframe')

model = gl.svm_classifier.create(train, target='X1')
predictions = model.predict(test)
print predictions
# file_report.write(predictions)
results = model.evaluate(test)
print results
# file_report.write(results)
model.save(args.modeldir)
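# Note on the commented-out writes above: model.predict returns an SArray
# and model.evaluate returns a dict, neither of which file.write accepts
# directly. A minimal fix (sketch):
#
#   file_report.write(str(results) + '\n')
#   file_report.close()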
def eval_model(model, test, col):
    '''Evaluate a trained model using Kaggle scoring.'''
    return log_loss_raw(test[col],
                        model.predict(test, output_type='probability'))


def log_loss_raw(target, predicted):
    '''Calculate log_loss between target and predicted and return.'''
    p = predicted.apply(lambda x: min(0.99999, max(1e-5, x)))
    logp = p.apply(lambda x: math.log(x))
    logmp = p.apply(lambda x: math.log(1 - x))
    return -(target * logp + (1 - target) * logmp).mean()


if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)
    parser = argparse.ArgumentParser(
        description='Spectral Features Preprocessing')
    parser.add_argument('-cf', '--clusteral_features',
                        help='Input File containing clusteral features',
                        required=True)
    parser.add_argument('-lf', '--labels_file',
                        help='Ground Truth labels file', required=True)
import graphlab as gl
import re
import random
from copy import copy
import os
import graphlab.aggregate as agg
import array
import sys

# Change cache file directory to avoid overloading /var
my_graphlab_cache_file_locations = '/home/zak/tmp_graphlab'
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS',
                      my_graphlab_cache_file_locations)
# gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS",
#                       os.path.expanduser("~/data/tmp/"))

base_path = os.getcwd()
model_path = base_path + "/nn_96x96/models/"

train_sf = []
test_sf = []
feature_names = []

for n in [0, 1, 2, 3, 4]:
    try:
        Xf_train = gl.SFrame(model_path + "/scores_train_%d" % n)
        Xf_test = gl.SFrame(model_path + "/scores_test_%d" % n)
        train_sf.append(Xf_train)
import graphlab as gl
import loadgraph as load
import time

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')

dpath = '/mnt/data/'


def childs_update_fn(src, edge, dst):
    pdst = dst['childs']
    psrc = src['childs']
    for pid, d in psrc.iteritems():
        if pid not in pdst:
            pdst[pid] = d + 1
            dst['changed'] = True
        else:
            if pdst[pid] > d + 1:
                pdst[pid] = d + 1
                dst['changed'] = True
    dst['childs'] = pdst
    return (src, edge, dst)


def find_childs(g, maxn):
    start = time.time()
    num_changed = len(g.vertices)
    it = 0
    g.vertices['childs'] = g.vertices['__id'].apply(
        lambda x: {x: 0} if x < maxn else {})
    while (num_changed > 0):
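# Hedged sketch of the loop body that presumably follows (reconstructed
# from the triple_apply pattern used in find_stats elsewhere in this
# collection; field names match childs_update_fn above):
#
#   g.vertices['changed'] = 0
#   g = g.triple_apply(childs_update_fn, ['childs', 'changed'])
#   num_changed = g.vertices['changed'].sum()
#   it += 1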
import graphlab as gl
import loadgraph as load
import time

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')

dpath = '/mnt/data/'


def node_update_fn(src, edge, dst):
    src['out_edges'] += 1
    dst['in_edges'] += 1
    return (src, edge, dst)


def find_stats(g):
    start = time.time()
    g.vertices['in_edges'] = 0
    g.vertices['out_edges'] = 0
    g = g.triple_apply(node_update_fn, ['in_edges', 'out_edges'])
    print 'Triple apply all finished in: %f secs' % (time.time() - start)
    return g


def cnt_update_fn(src, edge, dst):
    if dst['out_edges'] == dst['counter']:
        src['counter'] += 1
        src['parent-cnt'] += dst['parent-cnt'] + 1
    return (src, edge, dst)


def find_cnt(g):
    start = time.time()
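# Hypothetical driver (hedged: assumes loadgraph exposes the load_data()
# helper seen elsewhere in this collection):
#
#   g = load.load_data(dpath)
#   g = find_stats(g)
#   print g.vertices[['__id', 'in_edges', 'out_edges']].head()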
import random
from copy import copy
import os
import array
import sys

import numpy as np

import graphlab as gl
import graphlab.aggregate as agg

# Run this script in the same directory as the
train_path = "image-sframes/train-%d/"
valid_path = "image-sframes/validation-%d/"

# Change cache file directory to avoid overloading /var
my_graphlab_cache_file_locations = '/home/zak/tmp_graphlab'
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS',
                      my_graphlab_cache_file_locations)

X_data = gl.SFrame("image-sframes/train/")


def save_as_train_and_test(X, train_loc, valid_loc):
    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())
    # 2% of the names go to validation (Python 2 integer division)
    n_valid = (2 * len(all_names)) / 100
    random.shuffle(all_names)

    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])
import numpy as np
import itertools
from collections import defaultdict
from datetime import datetime
import math
import sys
import os

from scipy.stats import percentileofscore

import graphlab as gl
import graphlab.aggregate as agg

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 48)

# X1,X2,X3,X4,X5,X6,X7
# 2015-10-01 12:08:41,1046885725705,1046910448494,GSM,ITN006,,1.5
# 2015-10-01 16:55:32,1046885725705,1046910448494,GSM,ITN010,,1.5


def distance(l1_lat, l1_lng, l2_lat, l2_lng):
    R = 6371  # Radius of the earth in km
    d = 0.0
    try:
        l1_lat, l1_lng, l2_lat, l2_lng = (float(l1_lat), float(l1_lng),
                                          float(l2_lat), float(l2_lng))
    except (TypeError, ValueError):
        l1_lat, l1_lng, l2_lat, l2_lng = 0.0, 0.0, 0.0, 0.0
    dLat = (l1_lat - l2_lat) * math.pi / 180
    dLon = (l1_lng - l2_lng) * math.pi / 180
    a = (math.sin(dLat / 2) * math.sin(dLat / 2) +
         math.cos(l1_lat * math.pi / 180) * math.cos(l2_lat * math.pi / 180) *
         math.sin(dLon / 2) * math.sin(dLon / 2))
    # The excerpt was cut off mid-formula; the closing steps below are the
    # standard haversine completion, assumed rather than taken verbatim.
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d
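# Sanity check with hypothetical coordinates: Paris (48.8566, 2.3522) to
# London (51.5074, -0.1278) should come out near 344 km:
#
#   print distance(48.8566, 2.3522, 51.5074, -0.1278)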
    tic = time.time()
    if is_reachable:
        print("Vertex {} is reachable from vertex {} - Distance: {}".format(
            target_vertex, source_vertex, int(distance)))
    else:
        print("Vertex {} cannot be reached from vertex {} - Distance: {}".format(
            target_vertex, source_vertex, int(distance)))
    # `toc` is assumed to have been set by time.time() earlier in the
    # elided part of this function.
    return "Total runtime: {} seconds".format(tic - toc)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print("Please add number of workers, dataset path, source vertex, "
              "target vertex, and max recursion depth as arguments when "
              "loading script")
        sys.exit()
    else:
        workers = int(sys.argv[1])
        path = sys.argv[2]
        source_vertex = long(sys.argv[3])
        target_vertex = long(sys.argv[4])
        # max_depth = int(sys.argv[5])
        # assert max_depth >= 1

    # Configure GraphLab to utilize a specific number of workers (cores)
    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', workers)

    r_job = gl.deploy.job.create(run_reachability_job,
                                 path_to_file=path,
                                 source_vertex=source_vertex,
                                 target_vertex=target_vertex)

    # Collect job status, result, and metrics
    print("Job status\n{}".format(r_job.get_status()))
    print("Job results\n{}".format(r_job.get_results()))
    print("Job metrics\n{}".format(r_job.get_metrics()))
# coding: utf-8

# # Predicting sentiment from product reviews
#
# # Fire up GraphLab Create
# (See [Getting Started with SFrames](/notebooks/Week%201/Getting%20Started%20with%20SFrames.ipynb) for setup instructions)

# In[ ]:

import graphlab


# In[ ]:

# Limit number of worker processes. This preserves system memory, which
# prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)


# # Read some product review data
#
# Loading reviews for a set of baby products.

# In[ ]:

products = graphlab.SFrame('amazon_baby.gl/')


# # Let's explore this data together
#
# Data includes the product name, the review text and the rating of the review.
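# In[ ]:

# A quick peek at the table (hedged sketch: the column names are assumed
# from the description above).
products.head()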
import graphlab as gl
from graphlab.toolkits._main import ToolkitError
import numpy as np
import time
import datetime

gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 8)
# gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 100000000000)
# gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 100000000000)

raw_docs = gl.SArray("LDA_vectors/")

vocab_idx = {}
for line in open('vocab_idx'):
    line = line.strip().split('\t')
    vocab_idx[int(line[1])] = line[0]


def formatter(row):
    row = eval(row)
    row = dict(zip([vocab_idx[term] for term in row[1][1]], row[1][2]))
    return row


docs = raw_docs.filter(lambda x: x != "").apply(formatter)
docs.save("doc_array", format='binary')

docs = gl.SArray("doc_array")
train, test = gl.SFrame(docs).random_split(0.9, seed=99)
train = gl.text_analytics.tf_idf(train['X1'])
test = gl.text_analytics.tf_idf(test['X1'])
# data = {'tfidf':(train_tfidf,test_tfidf),'tf':(train,test)}
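# Illustration of what formatter() expects and returns (hypothetical row;
# the real layout comes from the LDA_vectors/ dump):
#
#   raw = "(0, (12, [3, 7], [2.0, 1.0]))"
#   # with vocab_idx = {3: 'cat', 7: 'dog'}, formatter(raw) returns
#   # {'cat': 2.0, 'dog': 1.0}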
import argparse

import numpy
import graphlab as gl
from graphlab import feature_engineering as fe
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from util import Util
from evals import Eval
from spectral_training import SpectralTraining

DEBUG = 1

if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 20)

    parser = argparse.ArgumentParser(description='Spectral Features Preprocessing')
    parser.add_argument('-cf', '--clusteral_features',
                        help='Input File containing clusteral features', required=True)
    parser.add_argument('-lf', '--labels_file', help='Ground Truth labels file',
                        required=True)
    parser.add_argument('-of', '--output_file', help='Output file', required=True)
    parser.add_argument('-cfk', '--clusteral_key_column', required=True)
    parser.add_argument('-lfk', '--labels_key_column', required=True)
    parser.add_argument('-lfv', '--labels_value_column', required=True)
    parser.add_argument('-i', '--interaction', required=True)
    parser.add_argument('-j', '--join_type', required=False)
    parser.add_argument('-e', '--encode', required=False)
    parser.add_argument('-ex', '--exclude', required=False)
# coding: utf-8

# # Fire up GraphLab Create
#
# We always start with this line before using any part of GraphLab Create.
# It can take up to 30 seconds to load the GraphLab library - be patient!
#
# The first time you use GraphLab Create, you must enter a product key to
# license the software for non-commercial academic use. To register for a
# free one-year academic license and obtain your key, go to
# [dato.com](https://dato.com/download/academic.html).

# In[2]:

import graphlab

# Set product key on this computer. After running this cell, you will not
# need to re-enter your product key.
graphlab.product_key.set_product_key('C7E4-BB1D-0150-A1E6-645C-66D9-D454-CC8D')

# Limit number of worker processes. This preserves system memory, which
# prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

# Output active product key.
graphlab.product_key.get_product_key()


# # Load a tabular data set

# In[3]:

sf = graphlab.SFrame('people-example.csv')


# # SFrame basics

# In[6]:

sf.head()  # we can view first few lines of table
import csv
import sys
import math
import argparse

import numpy
import graphlab as gl
from graphlab import feature_engineering as fe
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from util import Util
from evals import Eval
from spectral_training import SpectralTraining

DEBUG = 1

if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 20)

    parser = argparse.ArgumentParser(
        description='Spectral Features Preprocessing')
    parser.add_argument('-cf', '--clusteral_features',
                        help='Input File containing clusteral features',
                        required=True)
    parser.add_argument('-lf', '--labels_file',
                        help='Ground Truth labels file',
#!/usr/bin/env python
import graphlab as gl
from sklearn.metrics import precision_recall_curve
import numpy as np
# import matplotlib.pyplot as plt
import argparse
import glob

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/scratch')
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 40000000000)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                      40000000000)
gl.set_runtime_config('GRAPHLAB_SFRAME_SORT_BUFFER_SIZE', 40000000000)

parser = argparse.ArgumentParser(
    description='A script to get P-R curve for a model')
parser.add_argument('-d', '--directory', help='directory to test',
                    required=True)
parser.add_argument('-t', '--test', help='test data matrix', required=True)
# parser.add_argument('-r,', '--report', help='data for PR curve', required=True)
# parser.add_argument('-f,', '--figure', help='figure name for plotting', required=True)
args = parser.parse_args()

# test = gl.SFrame.read_csv('/global/projectb/scratch/arrivers/geneleanrntest/20150818/test.twoclass.txt', delimiter='\t', header=False)
# train = gl.SFrame.read_csv('/global/projectb/scratch/arrivers/geneleanrntest/20150818/train.twoclass.txt', delimiter='\t', header=False)
# test.save('test_twoclass_sframe')
# train.save('train_twoclass_sframe')

test = gl.SFrame.read_csv(args.test, delimiter='\t', header=False)
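# Hedged sketch of how the P-R curve is presumably produced further down
# (the excerpt stops early; `model` and the 'X1' label column are
# assumptions):
#
#   model = gl.load_model(args.directory)
#   probs = list(model.predict(test, output_type='probability'))
#   labels = list(test['X1'])
#   precision, recall, thresholds = precision_recall_curve(labels, probs)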
# rmses_cf = run_cf(min_lambduh, min_k, min_lambduh_w)
# rmses_cf2 = run_cf2(min_lambduh, min_k, min_lambduh_w)

# In[64]:

def main(argv):
    # pylint: disable=W0612
    try:
        argv = FLAGS(argv)  # parse flags
    except gflags.FlagsError, e:
        print '%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS)
        sys.exit(1)

    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 16)

    for flag_name in sorted(FLAGS.RegisteredFlags()):
        if flag_name not in ["?", "help", "helpshort", "helpxml"]:
            fl = FLAGS.FlagDict()[flag_name]
            with open('output/main.out', 'a') as f:
                f.write("# " + fl.help + " (" + flag_name + "): " +
                        str(fl.value) + '\n')

    X_train, X_test = load(FLAGS.dataset)
    g = get_graph(X_train, FLAGS.rank)
    rmse_train, rmse_test, L, R, wu, wm, bu, bm = \
        sgd_gl_edge(g, X_train, X_test, FLAGS.lamb, FLAGS.rank, FLAGS.eta,
                    Niter=FLAGS.maxit, unified=FLAGS.unified,
                    lambduh_w=FLAGS.lamb_w, output="main")