def test_parallel_evaluation(self):
    """
    Time bulk versus parallel lambda evaluation and check the answers.

    The same input (``xin`` repeated ``repeat`` times) is evaluated once
    through a single ``eval_lambda`` call and once through
    ``parallel_eval_lambda``. A slow parallel run only logs a warning; every
    parallel answer must equal a locally computed ``fib(xin)``.
    """
    xin = 33
    repeat = 8

    # Baseline: the whole batch is handled by one eval_lambda call.
    baseline_begin = time.time()
    glconnect.get_unity().eval_lambda(
        lambda x: [fib(i) for i in x], [xin] * repeat)
    single_thread_time = time.time() - baseline_begin
    logging.info("Single thread lambda eval takes %s secs", single_thread_time)

    # Same work, one element per parallel task.
    parallel_begin = time.time()
    ans_list = glconnect.get_unity().parallel_eval_lambda(
        lambda x: fib(x), [xin] * repeat)
    multi_thread_time = time.time() - parallel_begin
    logging.info("Multi thread lambda eval takes %s secs", multi_thread_time)

    # Speed-up is advisory only: report it, never fail on it.
    nproc = multiprocessing.cpu_count()
    too_slow = multi_thread_time > (single_thread_time / 1.5)
    if nproc > 1 and too_slow:
        logging.warning(
            "Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs",
            single_thread_time, nproc, multi_thread_time)

    # Accuracy check against a local evaluation.
    expected = fib(xin)
    for actual in ans_list:
        self.assertEqual(actual, expected)
def test_simple_evaluation(self):
    """
    Evaluate a handful of simple callables through ``eval_lambda``.

    Covers lambdas closing over a local variable, lambdas calling string
    methods, and the plain function ``fib``.
    """
    x = 3
    # (callable, argument, expected result), evaluated in this order.
    cases = [
        (lambda y: y + x, 0, 3),
        (lambda y: y + x, 1, 4),
        (lambda x: x.upper(), 'abc', 'ABC'),
        (lambda x: x.lower(), 'ABC', 'abc'),
        (fib, 1, 1),
    ]
    for func, argument, expected in cases:
        self.assertEqual(glconnect.get_unity().eval_lambda(func, argument), expected)
def _test_read_write_helper(self, url, content):
    """
    Write ``content`` to ``url`` through unity, read it back and compare.

    Parameters
    ----------
    url : str
        Location to write to. It is converted with
        ``graphlab.util._make_internal_url`` before use.
    content
        Data handed to ``__write__``; ``__read__`` must return an equal value.

    Raises
    ------
    AssertionError
        If the content read back differs from ``content``.
    """
    url = graphlab.util._make_internal_url(url)
    try:
        glconnect.get_unity().__write__(url, content)
        content_read = glconnect.get_unity().__read__(url)
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(content_read, content)
    finally:
        # Clean up even when the write/read/compare step fails, so a failing
        # test does not leave the file behind.
        if os.path.exists(url):
            os.remove(url)
def _test_read_write_helper(self, url, content_expected):
    """
    Write ``content_expected`` to ``url`` through unity and read it back.

    After the round trip the object is removed with the ``aws s3 rm`` command
    line; a failed removal is only logged as a warning.

    Parameters
    ----------
    url : str
        Location to write to. The internal form produced by
        ``graphlab.util._make_internal_url`` is used for unity calls, the
        original ``url`` for the ``aws`` command.
    content_expected
        Data handed to ``__write__``; ``__read__`` must return an equal value.

    Raises
    ------
    AssertionError
        If the content read back differs from ``content_expected``.
    """
    s3url = graphlab.util._make_internal_url(url)
    try:
        glconnect.get_unity().__write__(s3url, content_expected)
        content_read = glconnect.get_unity().__read__(s3url)
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(content_read, content_expected)
    finally:
        # Always attempt the removal so a failed comparison does not leave
        # the object behind.
        (status, output) = commands.getstatusoutput('aws s3 rm --region us-west-2 ' + url)
        # Compare by value: "is not 0" tests object identity, which only
        # happens to work because CPython caches small integers.
        if status != 0:
            logging.getLogger(__name__).warning("Cannot remove file: " + url)
def test_exception(self):
    """
    Check that a failing lambda surfaces as ``RuntimeError``.

    The lambda divides by its argument, and every argument supplied is zero,
    for both the single and the parallel evaluation entry points.
    """
    x = 3

    eval_single = glconnect.get_unity().eval_lambda
    with self.assertRaises(RuntimeError):
        eval_single(lambda y: x / y, 0)

    eval_parallel = glconnect.get_unity().parallel_eval_lambda
    with self.assertRaises(RuntimeError):
        eval_parallel(lambda y: x / y, [0] * 10)
def _test_read_write_helper(self, url, content_expected):
    """
    Write ``content_expected`` to ``url`` through unity and read it back.

    Afterwards the file is removed with ``hadoop fs -rm`` if
    ``hadoop fs -test -e`` reports that it exists.

    Parameters
    ----------
    url : str
        Location to write to. It is converted with
        ``graphlab.util._make_internal_url`` before use.
    content_expected
        Data handed to ``__write__``; ``__read__`` must return an equal value.

    Raises
    ------
    AssertionError
        If the content read back differs from ``content_expected``.
    """
    url = graphlab.util._make_internal_url(url)
    try:
        glconnect.get_unity().__write__(url, content_expected)
        content_read = glconnect.get_unity().__read__(url)
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(content_read, content_expected)
    finally:
        # Clean up the file we wrote, even when the comparison failed.
        status, output = commands.getstatusoutput('hadoop fs -test -e ' + url)
        # Compare by value: "is 0" tests object identity, which only happens
        # to work because CPython caches small integers.
        if status == 0:
            commands.getstatusoutput('hadoop fs -rm ' + url)
def test_exception(self):
    """
    Check that reads, writes, saves and loads under ``/root`` raise ``IOError``.

    Each callable is run in the listed order; the read/write pair appears
    twice on purpose, mirroring the original sequence of assertions.
    """
    failing_calls = [
        lambda: glconnect.get_unity().__read__("/root/tmp"),
        lambda: glconnect.get_unity().__write__("/root/tmp", '.....'),
        lambda: glconnect.get_unity().__read__("/root/tmp"),
        lambda: glconnect.get_unity().__write__("/root/tmp", '.....'),
        lambda: self.graph.save("/root/tmp.graph"),
        lambda: self.sframe.save("/root/tmp.frame_idx"),
        lambda: self.model.save("/root/tmp.model"),
        lambda: graphlab.load_graph("/root/tmp.graph"),
        lambda: graphlab.load_sframe("/root/tmp.frame_idx"),
        lambda: graphlab.load_model("/root/tmp.model"),
    ]
    for call in failing_calls:
        self.assertRaises(IOError, call)
def test_exception(self):
    """
    Check the errors raised for local and ``remote://`` locations.

    The read/write helper must raise ``ValueError`` for the two local
    targets; every ``remote://`` read, write, save and load must raise
    ``IOError``. Pairs are checked in the listed order.
    """
    expectations = [
        (ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world')),
        (ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof')),
        (IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp")),
        (IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp")),
        (IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....')),
        (IOError, lambda: self.graph.save("remote:///root/tmp.graph")),
        (IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx")),
        (IOError, lambda: self.model.save("remote:///root/tmp.model")),
        (IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph")),
        (IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx")),
        (IOError, lambda: graphlab.load_model("remote:///root/tmp.model")),
    ]
    for expected_error, action in expectations:
        self.assertRaises(expected_error, action)
def test_crash_recovery(self):
    """
    Check parallel evaluation when a worker process exits.

    ``bad_fun`` forces the worker to exit for multiples of 251, which must
    surface as ``RuntimeError``. A harmless function is then run over the
    same input and must complete without raising.
    """
    def good_fun(x):
        return x

    def bad_fun(x):
        # Multiples of 251 force the worker process to exit.
        if x % 251 == 0:
            cy_test_utils.force_exit_fun()
        return x

    inputs = range(1000)

    with self.assertRaises(RuntimeError):
        glconnect.get_unity().parallel_eval_lambda(lambda x: bad_fun(x), inputs)

    glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), inputs)
def test_simple_evaluation(self):
    """
    Evaluate a handful of simple callables through ``eval_lambda``.

    Covers lambdas closing over a local variable, lambdas calling string
    methods, and the plain function ``fib``.
    """
    x = 3
    # (callable, argument, expected result), evaluated in this order.
    cases = [
        (lambda y: y + x, 0, 3),
        (lambda y: y + x, 1, 4),
        (lambda x: x.upper(), 'abc', 'ABC'),
        (lambda x: x.lower(), 'ABC', 'abc'),
        (fib, 1, 1),
    ]
    for func, argument, expected in cases:
        outcome = glconnect.get_unity().eval_lambda(func, argument)
        self.assertEqual(outcome, expected)
def test_exception(self):
    """
    Check that invalid HDFS reads, writes, saves and loads raise ``IOError``.

    When ``self.has_hdfs`` is false the test only logs a message and passes.
    """
    if not self.has_hdfs:
        logging.getLogger(__name__).info("No hdfs avaiable. Test pass.")
        return

    bad_url = "hdfs:///root/"
    failing_calls = [
        lambda: glconnect.get_unity().__read__("hdfs:///"),
        lambda: glconnect.get_unity().__read__("hdfs:///tmp"),
        lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile),
        lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"),
        lambda: self.graph.save(bad_url + "x.graph"),
        lambda: self.sframe.save(bad_url + "x.frame_idx"),
        lambda: self.model.save(bad_url + "x.model"),
        lambda: graphlab.load_graph(bad_url + "mygraph"),
        lambda: graphlab.load_sframe(bad_url + "x.frame_idx"),
        lambda: graphlab.load_model(bad_url + "x.model"),
    ]
    for call in failing_calls:
        self.assertRaises(IOError, call)
def save(self, location):
    """
    Save the model.

    The model is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    # Serialize the model into a temporary pickle file first.
    # NOTE(review): tempfile.mktemp only reserves a name and is documented as
    # deprecated; kept as-is because how _save_to_pickle uses the path is not
    # visible here.
    pickle_path = tempfile.mktemp()
    self._save_to_pickle(pickle_path)

    # Create the proxy on first use; it carries the pickle file's path.
    if not self.__proxy__:
        self.__proxy__ = _gl.extensions._PythonModel()
    self.__proxy__.temp_file = pickle_path

    wrapper = self._get_wrapper()
    unity = glconnect.get_unity()
    return unity.save_model(self.__proxy__, _make_internal_url(location), wrapper)
def test_crash_recovery(self):
    """
    Check parallel evaluation when a worker process exits.

    ``bad_fun`` forces the worker to exit for multiples of 251, which must
    surface as ``RuntimeError``. A harmless function is then run over the
    same input and must complete without raising.
    """
    def good_fun(x):
        return x

    def bad_fun(x):
        if x % 251 == 0:
            # this will force the worker process to exit
            cy_test_utils.force_exit_fun()
        return x

    work_items = range(1000)

    with self.assertRaises(RuntimeError):
        glconnect.get_unity().parallel_eval_lambda(lambda x: bad_fun(x), work_items)

    glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), work_items)
def __test_model_save_load_helper__(self, model):
    """
    Save ``model`` to a temporary directory, load it back and compare.

    The field lists must match. Each field is then compared according to
    its exact type: SGraph by summary and field names, SFrame by length,
    column names and dataframe content (indexed by the first column),
    pandas DataFrame by ``assert_frame_equal``, anything else by equality.
    """
    with util.TempDirectory() as tmp_dir:
        model.save(tmp_dir)
        reloaded = get_unity().load_model(tmp_dir)
        self.assertItemsEqual(model.list_fields(), reloaded.list_fields())

        for key in model.list_fields():
            if type(model.get(key)) is SGraph:
                self.assertItemsEqual(model.get(key).summary(),
                                      reloaded.get(key).summary())
                self.assertItemsEqual(model.get(key).get_fields(),
                                      reloaded.get(key).get_fields())
            elif type(model.get(key)) is SFrame:
                original_sf = model.get(key)
                reloaded_sf = reloaded.get(key)
                self.assertEqual(len(original_sf), len(reloaded_sf))
                self.assertItemsEqual(original_sf.column_names(),
                                      reloaded_sf.column_names())
                original_df = original_sf.to_dataframe()
                print(original_df)
                reloaded_df = reloaded_sf.to_dataframe()
                print(reloaded_df)
                # Index both frames by their first column before comparing.
                original_df = original_df.set_index(original_df.columns[0])
                reloaded_df = reloaded_df.set_index(reloaded_df.columns[0])
                assert_frame_equal(original_df, reloaded_df)
            elif type(model.get(key)) is pd.DataFrame:
                assert_frame_equal(model.get(key), reloaded.get(key))
            else:
                self.assertEqual(model.get(key), reloaded.get(key))
def save(self, location):
    """
    Save the transformer into a GraphLab archive.

    The object is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    .. sourcecode:: python

        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')
    """
    # Record the call under "<module of this class>.save".
    metric_name = self.__class__.__module__ + '.save'
    _mt._get_metric_tracker().track(metric_name)

    unity = glconnect.get_unity()
    internal_url = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return unity.save_model(self.__proxy__, internal_url, wrapper)
def test_pagerank(self):
    """
    Exercise the pagerank toolkit with default and custom reset probability.

    Skipped silently when "pagerank" is not among the toolkit functions
    listed by unity.
    """
    if "pagerank" not in get_unity().list_toolkit_functions():
        return

    # Default options.
    default_model = gl.pagerank.create(self.graph)
    print(default_model)
    default_model.summary()
    default_shape = (default_model.get('pagerank').num_rows(),
                     default_model.get('pagerank').num_cols())
    self.assertEqual(default_shape, (self.graph.summary()['num_vertices'], 3))
    self.assertAlmostEqual(default_model['pagerank']['pagerank'].sum(), 2727.5348, delta=1e-3)
    self.__test_model_save_load_helper__(default_model)

    # Custom reset probability.
    custom_model = gl.pagerank.create(self.graph, reset_probability=0.5)
    print(custom_model)
    custom_shape = (custom_model.get('pagerank').num_rows(),
                    custom_model.get('pagerank').num_cols())
    self.assertEqual(custom_shape, (self.graph.summary()['num_vertices'], 3))
    self.assertAlmostEqual(custom_model['pagerank']['pagerank'].sum(), 7087.0791, delta=1e-3)

    # The frame comparison is expected to raise for the two models.
    with self.assertRaises(Exception):
        assert_frame_equal(default_model.get('pagerank').topk('pagerank'),
                           custom_model.get('pagerank').topk('pagerank'))
    self.__test_model_save_load_helper__(custom_model)

    default_options = gl.pagerank.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 3)
    self.assertTrue(default_options['reset_probability'] == 0.15)
    self.assertTrue(default_options['threshold'] == 1e-2)
    self.assertTrue(default_options['max_iterations'] == 20)

    current_options = custom_model.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 3)
    self.assertTrue(current_options['reset_probability'] == 0.5)
    self.assertTrue(current_options['threshold'] == 1e-2)
    self.assertTrue(current_options['max_iterations'] == 20)
def __test_model_save_load_helper__(self, model):
    """
    Round-trip ``model`` through save/load and compare every field.

    The model is saved into a temporary directory and loaded back through
    unity. Field lists must match; each field is compared according to its
    exact type (SGraph, SFrame, pandas DataFrame, or anything else).
    """
    with util.TempDirectory() as scratch_dir:
        model.save(scratch_dir)
        loaded = get_unity().load_model(scratch_dir)
        self.assertItemsEqual(model.list_fields(), loaded.list_fields())

        for key in model.list_fields():
            if type(model.get(key)) is SGraph:
                # Graphs: compare summaries and field names.
                self.assertItemsEqual(model.get(key).summary(), loaded.get(key).summary())
                self.assertItemsEqual(model.get(key).get_fields(), loaded.get(key).get_fields())
            elif type(model.get(key)) is SFrame:
                # Frames: compare size, columns, then content as dataframes.
                before = model.get(key)
                after = loaded.get(key)
                self.assertEqual(len(before), len(after))
                self.assertItemsEqual(before.column_names(), after.column_names())
                before_df = before.to_dataframe()
                print(before_df)
                after_df = after.to_dataframe()
                print(after_df)
                before_df = before_df.set_index(before_df.columns[0])
                after_df = after_df.set_index(after_df.columns[0])
                assert_frame_equal(before_df, after_df)
            elif type(model.get(key)) is pd.DataFrame:
                assert_frame_equal(model.get(key), loaded.get(key))
            else:
                self.assertEqual(model.get(key), loaded.get(key))
def test_exception(self):
    """
    Check that invalid S3 reads, writes, saves and loads raise ``IOError``.

    When ``self.has_s3`` is false the test only logs a message and passes.
    """
    if not self.has_s3:
        logging.getLogger(__name__).info("No s3 bucket avaiable. Test pass.")
        return

    bad_bucket = "i_am_a_bad_bucket"
    prefix = "s3://" + bad_bucket
    failing_calls = [
        lambda: glconnect.get_unity().__read__("s3:///"),
        lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"),
        lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"),
        lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"),
        lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"),
        lambda: self.graph.save(prefix + "/x.graph"),
        lambda: self.sframe.save(prefix + "/x.frame_idx"),
        lambda: self.model.save(prefix + "/x.model"),
        lambda: graphlab.load_graph(prefix + "/x.graph"),
        lambda: graphlab.load_sframe(prefix + "/x.frame_idx"),
        lambda: graphlab.load_model(prefix + "/x.model"),
    ]
    for call in failing_calls:
        self.assertRaises(IOError, call)
def get_graphlab_object_type(url):
    '''
    Given url where a GraphLab Create object is persisted, return the
    GraphLab Create object type: 'model', 'graph', 'sframe', or 'sarray'

    The value reported by unity is returned unchanged, except that 'graph'
    is reported as 'sgraph'.
    '''
    internal_url = _make_internal_url(url)
    object_type = _glconnect.get_unity().get_graphlab_object_type(internal_url)

    # to be consistent, we use sgraph instead of graph here
    return 'sgraph' if object_type == 'graph' else object_type
def save(self, url):
    """
    Save the neuralnet to url.

    The text returned by ``self.__config_str__()`` is written to the
    location through unity.

    Parameters
    ----------
    url : str
        The URL to save the network.

    Examples
    --------
    >>> import graphlab as gl
    >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
    >>> net.save('mnist.conf')

    See Also
    --------
    graphlab.deeplearning.load
    """
    unity = _gl_connect.get_unity()
    destination = _make_internal_url(url)
    unity.__write__(destination, self.__config_str__())
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        The loaded model: either the object returned by unity's
        ``load_model`` (dir_archive case) or the object rebuilt by the
        class's ``_load_version`` (pickle case).

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # A location is loaded as a dir_archive when it is an http(s) URL or when
    # it contains 'dir_archive.ini'; otherwise it is loaded with GLUnpickler
    # as a pure python model.
    #
    # The http(s) special case skips the existence check because
    #   1) exists() does not work with the http protocol, and
    #   2) GLUnpickler does not support http.
    is_http = file_util.get_protocol(location) in ['http', 'https']
    if is_http or file_util.exists(location + '/dir_archive.ini'):
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)

    # NOTE(review): GLUnpickler is presumably pickle-based, so this path
    # should only be used with locations from a trusted source -- confirm.
    unpickler = gl_pickle.GLUnpickler(location)
    try:
        # The archive stores the version first, then the class name.
        version = unpickler.load()
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        return cls._load_version(unpickler, version)
    finally:
        # Close the unpickler even when loading fails part-way.
        unpickler.close()
def test_graph_coloring(self):
    """
    Exercise the graph coloring toolkit: create, summarize, save/load.

    Skipped silently when "graph_coloring" is not among the toolkit
    functions listed by unity.
    """
    if "graph_coloring" not in get_unity().list_toolkit_functions():
        return

    coloring = gl.graph_coloring.create(self.graph)
    print(coloring)
    coloring.summary()

    # coloring is non-deterministic, so we cannot verify the result here
    self.__test_model_save_load_helper__(coloring)

    # Both option dictionaries are expected to be empty.
    defaults = gl.graph_coloring.get_default_options()
    self.assertTrue(len(defaults.keys()) == 0)

    in_effect = coloring.get_current_options()
    self.assertTrue(len(in_effect.keys()) == 0)
def test_triangle_counting(self):
    """
    Exercise the triangle counting toolkit and check the triangle count.

    Skipped silently when "triangle_counting" is not among the toolkit
    functions listed by unity.
    """
    if "triangle_counting" not in get_unity().list_toolkit_functions():
        return

    counter = gl.triangle_counting.create(self.graph)
    print(counter)
    counter.summary()
    self.__test_model_save_load_helper__(counter)
    self.assertEqual(counter.get('num_triangles'), 934)

    # Both option dictionaries are expected to be empty.
    defaults = gl.triangle_counting.get_default_options()
    self.assertTrue(len(defaults.keys()) == 0)

    in_effect = counter.get_current_options()
    self.assertTrue(len(in_effect.keys()) == 0)
def test_parallel_evaluation(self):
    """
    Time bulk versus parallel lambda evaluation and check the answers.

    The same input (``xin`` repeated ``repeat`` times) is evaluated once
    through a single ``eval_lambda`` call and once through
    ``parallel_eval_lambda``. A slow parallel run only logs a warning; every
    parallel answer must equal a locally computed ``fib(xin)``.
    """
    xin = 33
    repeat = 8

    # execute the task bulk using one process to get a baseline
    bulk_started = time.time()
    glconnect.get_unity().eval_lambda(
        lambda x: [fib(i) for i in x], [xin] * repeat)
    single_thread_time = time.time() - bulk_started
    logging.info("Single thread lambda eval takes %s secs", single_thread_time)

    # execute the task in parallel
    parallel_started = time.time()
    ans_list = glconnect.get_unity().parallel_eval_lambda(
        lambda x: fib(x), [xin] * repeat)
    multi_thread_time = time.time() - parallel_started
    logging.info("Multi thread lambda eval takes %s secs", multi_thread_time)

    # test the speed up by running in parallel (warning only, never a failure)
    nproc = multiprocessing.cpu_count()
    speedup_missing = multi_thread_time > (single_thread_time / 1.5)
    if nproc > 1 and speedup_missing:
        logging.warning(
            "Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs",
            single_thread_time, nproc, multi_thread_time)

    # test accuracy
    reference = fib(xin)
    for answer in ans_list:
        self.assertEqual(answer, reference)
def test_connected_component(self):
    """
    Exercise the connected components toolkit on ``self.graph``.

    Skipped silently when "connected_component" is not among the toolkit
    functions listed by unity. The 'component_size' result is expected to
    have exactly one row.
    """
    if "connected_component" not in get_unity().list_toolkit_functions():
        return

    components = gl.connected_components.create(self.graph)
    print(components)
    components.summary()
    print(components.get('component_id'))
    print(components.get('component_size'))
    self.assertEqual(components['component_size'].num_rows(), 1)
    self.__test_model_save_load_helper__(components)

    # Both option dictionaries are expected to be empty.
    defaults = gl.connected_components.get_default_options()
    self.assertTrue(len(defaults.keys()) == 0)

    in_effect = components.get_current_options()
    self.assertTrue(len(in_effect.keys()) == 0)
def get_runtime_config():
    """
    Returns all the GraphLab Create configuration variables that can be set
    at runtime. See :py:func:`graphlab.set_runtime_config()` to set these
    values and for documentation on the effect of each variable.

    Parameters
    ----------
    None

    Returns
    -------
    Returns a dictionary of {key:value,..}
    """
    # The True flag selects the runtime-settable globals.
    return _glconnect.get_unity().list_globals(True)
def test_shortest_path(self):
    """
    Exercise the shortest path toolkit on ``self.graph`` and small graphs.

    Skipped silently when "sssp" is not among the toolkit functions listed
    by unity. Paths are verified on a chain graph and a star graph, then the
    default and current options are checked.
    """
    if "sssp" not in get_unity().list_toolkit_functions():
        return

    first_model = gl.shortest_path.create(self.graph, source_vid=0)
    print(first_model)
    first_model.summary()
    self.__test_model_save_load_helper__(first_model)

    second_model = gl.shortest_path.create(self.graph, source_vid=0)
    print(second_model)
    self.__test_model_save_load_helper__(second_model)

    # Test get_path function on a simple chain graph and star graph
    chain_graph = gl.SGraph().add_edges(
        [gl.Edge(i, i + 1) for i in range(10)])
    chain_model = gl.shortest_path.create(chain_graph, source_vid=0)
    for target in range(10):
        self.assertSequenceEqual(
            chain_model.get_path(target),
            [(hop, float(hop)) for hop in range(target + 1)])

    star_graph = gl.SGraph().add_edges(
        [gl.Edge(0, i + 1) for i in range(10)])
    star_model = gl.shortest_path.create(star_graph, source_vid=0)
    for leaf in range(1, 11):
        self.assertSequenceEqual(star_model.get_path(leaf),
                                 [(0, 0.0), (leaf, 1.0)])

    # Test sssp ignoring the existing distance field
    star_graph.vertices['distance'] = 0
    star_model_again = gl.shortest_path.create(star_graph, source_vid=0)
    for leaf in range(1, 11):
        self.assertSequenceEqual(star_model_again.get_path(leaf),
                                 [(0, 0.0), (leaf, 1.0)])

    default_options = gl.shortest_path.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 2)
    self.assertTrue(default_options['weight_field'] == "")
    self.assertTrue(default_options['max_distance'] == 1e30)

    bounded_model = gl.shortest_path.create(chain_graph, source_vid=0, max_distance=3)
    current_options = bounded_model.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 2)
    self.assertTrue(current_options['weight_field'] == "")
    self.assertTrue(current_options['max_distance'] == 3)
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Loading through unity is tried first. A ``ToolkitError`` from that
    attempt is re-raised; any other exception triggers a fallback to
    ``gl_pickle.GLUnpickler``.

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        The loaded model.

    Raises
    ------
    ToolkitError
        If unity's ``load_model`` raises it.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')
    try:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
    except ToolkitError:
        raise
    except Exception:
        # Not a ToolkitError so try unpickling the model.
        # NOTE(review): GLUnpickler is presumably pickle-based, so this path
        # should only be used with locations from a trusted source -- confirm.
        unpickler = gl_pickle.GLUnpickler(location)
        try:
            # The archive stores the version first, then the class name.
            version = unpickler.load()
            cls_name = unpickler.load()
            cls = _get_class_from_name(cls_name)

            # Load the object with the right version.
            return cls._load_version(unpickler, version)
        finally:
            # Close the unpickler even when loading fails part-way.
            unpickler.close()
def save(self, location):
    """
    Save the model.

    The model is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method. Note that the diverse_sampler
    stores the data internally, so you can save the model, then load it
    later and sample from the loaded model immediately.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    .. sourcecode:: python

        >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
        ...                               'feature_1': [3, 1, 2],
        ...                               'feature_2': [[0, 1], [0, 1], [1, 0]]})
        >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
        ...                                                     item_id='id',
        ...                                                     quality_feature='feature_1',
        ...                                                     similarity_features=['feature_2'])
        >>> sampler.save('my_sampler')
        >>> loaded_sampler = graphlab.load_model('my_sampler')
        >>> loaded_sampler.sample(k=2)
        +-----------+------------+----+
        | feature_1 | feature_2  | id |
        +-----------+------------+----+
        |     2     | [0.0, 1.0] | 1  |
        |     1     | [1.0, 0.0] | 2  |
        +-----------+------------+----+
    """
    # Record the call under "<module of this class>.save".
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.save')

    unity = glconnect.get_unity()
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return unity.save_model(self.__proxy__, destination, wrapper)
def save(self, location):
    """
    Save this object through unity's ``save_model``.

    The object's proxy, the internal form of ``location`` and the wrapper
    from ``self._get_wrapper()`` are passed to ``save_model``.

    Parameters
    ----------
    location: str
        Filename.

    Returns
    -------
    out
        Whatever unity's ``save_model`` returns.
        NOTE(review): previously documented as None -- confirm.
    """
    unity = glconnect.get_unity()
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return unity.save_model(self.__proxy__, destination, wrapper)
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    unity = glconnect.get_unity()
    internal_url = _make_internal_url(location)
    return unity.load_model(internal_url)
def load_model(location):
    """
    Load any graphlab model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    unity = glconnect.get_unity()
    internal_url = make_internal_url(location)
    return unity.load_model(internal_url)
def test_shortest_path(self):
    """
    Exercise the shortest path toolkit on ``self.graph`` and small graphs.

    Skipped silently when "sssp" is not among the toolkit functions listed
    by unity. Paths are verified on a chain graph and a star graph, then the
    default and current options are checked.
    """
    if "sssp" not in get_unity().list_toolkit_functions():
        return

    model_a = gl.shortest_path.create(self.graph, source_vid=0)
    print(model_a)
    model_a.summary()
    self.__test_model_save_load_helper__(model_a)

    model_b = gl.shortest_path.create(self.graph, source_vid=0)
    print(model_b)
    self.__test_model_save_load_helper__(model_b)

    # Test get_path function on a simple chain graph and star graph
    chain_graph = gl.SGraph().add_edges([gl.Edge(i, i + 1) for i in range(10)])
    chain_paths = gl.shortest_path.create(chain_graph, source_vid=0)
    for end_vertex in range(10):
        self.assertSequenceEqual(
            chain_paths.get_path(end_vertex),
            [(step, float(step)) for step in range(end_vertex + 1)])

    star_graph = gl.SGraph().add_edges([gl.Edge(0, i + 1) for i in range(10)])
    star_paths = gl.shortest_path.create(star_graph, source_vid=0)
    for spoke in range(1, 11):
        self.assertSequenceEqual(star_paths.get_path(spoke), [(0, 0.0), (spoke, 1.0)])

    # Test sssp ignoring the existing distance field
    star_graph.vertices['distance'] = 0
    star_paths_rerun = gl.shortest_path.create(star_graph, source_vid=0)
    for spoke in range(1, 11):
        self.assertSequenceEqual(star_paths_rerun.get_path(spoke), [(0, 0.0), (spoke, 1.0)])

    default_options = gl.shortest_path.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 2)
    self.assertTrue(default_options['weight_field'] == "")
    self.assertTrue(default_options['max_distance'] == 1e30)

    limited = gl.shortest_path.create(chain_graph, source_vid=0, max_distance=3)
    current_options = limited.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 2)
    self.assertTrue(current_options['weight_field'] == "")
    self.assertTrue(current_options['max_distance'] == 3)
def test_pagerank(self):
    """
    Exercise the pagerank toolkit with default and custom reset probability.

    Skipped silently when "pagerank" is not among the toolkit functions
    listed by unity.
    """
    if "pagerank" not in get_unity().list_toolkit_functions():
        return

    # Model built with default options.
    baseline = gl.pagerank.create(self.graph)
    print(baseline)
    baseline.summary()
    baseline_shape = (baseline.get('pagerank').num_rows(),
                      baseline.get('pagerank').num_cols())
    self.assertEqual(baseline_shape, (self.graph.summary()['num_vertices'], 3))
    self.assertAlmostEqual(baseline['pagerank']['pagerank'].sum(), 2727.5348, delta=1e-3)
    self.__test_model_save_load_helper__(baseline)

    # Model built with a custom reset probability.
    tuned = gl.pagerank.create(self.graph, reset_probability=0.5)
    print(tuned)
    tuned_shape = (tuned.get('pagerank').num_rows(),
                   tuned.get('pagerank').num_cols())
    self.assertEqual(tuned_shape, (self.graph.summary()['num_vertices'], 3))
    self.assertAlmostEqual(tuned['pagerank']['pagerank'].sum(), 7087.0791, delta=1e-3)

    # The frame comparison is expected to raise for the two models.
    with self.assertRaises(Exception):
        assert_frame_equal(baseline.get('pagerank').topk('pagerank'),
                           tuned.get('pagerank').topk('pagerank'))
    self.__test_model_save_load_helper__(tuned)

    default_options = gl.pagerank.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 3)
    self.assertTrue(default_options['reset_probability'] == 0.15)
    self.assertTrue(default_options['threshold'] == 1e-2)
    self.assertTrue(default_options['max_iterations'] == 20)

    current_options = tuned.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 3)
    self.assertTrue(current_options['reset_probability'] == 0.5)
    self.assertTrue(current_options['threshold'] == 1e-2)
    self.assertTrue(current_options['max_iterations'] == 20)
def test_kcore(self):
    """
    Exercise the kcore toolkit and check the most populated core.

    Skipped silently when "kcore" is not among the toolkit functions listed
    by unity.
    """
    if "kcore" not in get_unity().list_toolkit_functions():
        return

    decomposition = gl.kcore.create(self.graph)
    print(decomposition)
    decomposition.summary()

    # Count vertices per core id and keep the core with the highest count.
    core_counts = decomposition['core_id'].groupby('core_id', gl.aggregate.COUNT)
    biggest_core = core_counts.topk('Count').head(1)
    self.assertEqual(biggest_core['core_id'][0], 6)
    self.assertEqual(biggest_core['Count'][0], 4492)
    self.__test_model_save_load_helper__(decomposition)

    default_options = gl.kcore.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 2)
    self.assertTrue(default_options['kmin'] == 0)
    self.assertTrue(default_options['kmax'] == 10)

    bounded = gl.kcore.create(self.graph, kmin=1, kmax=5)
    current_options = bounded.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 2)
    self.assertTrue(current_options['kmin'] == 1)
    self.assertTrue(current_options['kmax'] == 5)
def save(self, location):
    """
    Save the model.

    The model is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.save')

    unity = glconnect.get_unity()
    destination = make_internal_url(location)
    return unity.save_model(self, destination)
def test_kcore(self):
    """
    Exercise the kcore toolkit and check the most populated core.

    Skipped silently when "kcore" is not among the toolkit functions listed
    by unity.
    """
    if "kcore" not in get_unity().list_toolkit_functions():
        return

    kcore_model = gl.kcore.create(self.graph)
    print(kcore_model)
    kcore_model.summary()

    # Count vertices per core id and keep the core with the highest count.
    per_core = kcore_model['core_id'].groupby('core_id', gl.aggregate.COUNT)
    biggest_core = per_core.topk('Count').head(1)
    self.assertEqual(biggest_core['core_id'][0], 6)
    self.assertEqual(biggest_core['Count'][0], 4492)
    self.__test_model_save_load_helper__(kcore_model)

    default_options = gl.kcore.get_default_options()
    print(default_options)
    self.assertTrue(len(default_options.keys()) == 2)
    self.assertTrue(default_options['kmin'] == 0)
    self.assertTrue(default_options['kmax'] == 10)

    restricted = gl.kcore.create(self.graph, kmin=1, kmax=5)
    current_options = restricted.get_current_options()
    print(current_options)
    self.assertTrue(len(current_options.keys()) == 2)
    self.assertTrue(current_options['kmin'] == 1)
    self.assertTrue(current_options['kmax'] == 5)
def save(self, location):
    """
    Save the model.

    The model is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = graphlab.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.save')

    unity = glconnect.get_unity()
    destination = _make_internal_url(location)
    return unity.save_model(self, destination)
def save(self, location):
    """
    Save the model.

    The model is saved as a directory which can then be loaded using the
    :py:func:`~graphlab.load_model` method.

    Parameters
    ----------
    location: str
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    unity = glconnect.get_unity()
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return unity.save_model(self.__proxy__, destination, wrapper)
def get_environment_config():
    """
    Returns all the GraphLab Create configuration variables that can only
    be set via environment variables.

    - *GRAPHLAB_FILEIO_WRITER_BUFFER_SIZE*
      The file write buffer size.

    - *GRAPHLAB_FILEIO_READER_BUFFER_SIZE*
      The file read buffer size.

    - *OMP_NUM_THREADS*
      The maximum number of threads to use for parallel processing.

    Parameters
    ----------
    None

    Returns
    -------
    Returns a dictionary of {key:value,..}
    """
    # The False flag selects the globals that are not runtime-settable.
    return _glconnect.get_unity().list_globals(False)
def set_runtime_config(name, value):
    """
    Sets a runtime configuration value. These configuration values are also
    read from environment variables at program startup if available. See
    :py:func:`graphlab.get_runtime_config()` to get the current values for
    each variable.

    The default configuration is conservatively defined for machines with
    about 4-8GB of RAM.

    Note that defaults may change across GraphLab Create versions and the
    names of performance tuning constants may also change as improved
    algorithms are developed and implemented.

    **Basic Configuration Variables**

    - *GRAPHLAB_CACHE_FILE_LOCATIONS*
      The directory in which intermediate SFrames/SArray are stored.
      For instance "/var/tmp". Multiple directories can be specified
      separated by a colon (ex: "/var/tmp:/tmp") in which case intermediate
      SFrames will be striped across both directories (useful for specifying
      multiple disks). Defaults to /var/tmp if the directory exists, /tmp
      otherwise.

    - *GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY*
      The maximum amount of memory which will be occupied by *all*
      intermediate SFrames/SArrays. Once this limit is exceeded,
      SFrames/SArrays will be flushed out to temporary storage (as specified
      by `GRAPHLAB_CACHE_FILE_LOCATIONS`). On large systems increasing this
      as well as `GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE` can
      improve performance significantly. Defaults to 2147483648 bytes (2GB).

    - *GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE*
      The maximum amount of memory which will be occupied by any individual
      intermediate SFrame/SArray. Once this limit is exceeded, the
      SFrame/SArray will be flushed out to temporary storage (as specified
      by `GRAPHLAB_CACHE_FILE_LOCATIONS`). On large systems, increasing this
      as well as `GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY` can improve
      performance significantly for large SFrames. Defaults to 134217728
      bytes (128MB).

    **ODBC Configuration**

    - *GRAPHLAB_LIBODBC_PREFIX*
      A directory containing libodbc.so. Also see
      :func:`graphlab.set_libodbc_path` and :func:`graphlab.connect_odbc`

    - *GRAPHLAB_ODBC_BUFFER_MAX_ROWS*
      The number of rows to read from ODBC in each batch. Increasing this
      may give better performance at increased memory consumption. Defaults
      to 2000.

    - *GRAPHLAB_ODBC_BUFFER_SIZE*
      The maximum ODBC buffer size in bytes when reading. Increasing this
      may give better performance at increased memory consumption. Defaults
      to 3GB.

    **Sort Performance Configuration**

    - *GRAPHLAB_SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE*
      The number of random rows to sample from the SFrame to estimate the
      sort pivots used to partition the sort. Defaults to 2000000.

    - *GRAPHLAB_SFRAME_SORT_BUFFER_SIZE*
      The maximum estimated memory consumption sort is allowed to use.
      Increasing this will increase the size of each sort partition, and
      will increase performance with increased memory consumption. Defaults
      to 2GB.

    **Join Performance Configuration**

    - *GRAPHLAB_SFRAME_JOIN_BUFFER_NUM_CELLS*
      The maximum number of cells to buffer in memory. Increasing this will
      increase the size of each join partition and will increase performance
      with increased memory consumption. If you have very large cells (very
      long strings for instance), decreasing this value will help decrease
      memory consumption. Defaults to 52428800.

    **Groupby Aggregate Performance Configuration**

    - *GRAPHLAB_SFRAME_GROUPBY_BUFFER_NUM_ROWS*
      The number of groupby keys cached in memory. Increasing this will
      increase performance with increased memory consumption. Defaults to
      1048576.

    **Advanced Configuration Variables**

    - *GRAPHLAB_SFRAME_FILE_HANDLE_POOL_SIZE*
      The maximum number of file handles to use when reading
      SFrames/SArrays. Once this limit is exceeded, file handles will be
      recycled, reducing performance. This limit should be rarely approached
      by most SFrame/SArray operations. Large SGraphs however may create a
      large a number of SFrames in which case increasing this limit may
      improve performance (You may also need to increase the system file
      handle limit with "ulimit -n"). Defaults to 128.

    Parameters
    ----------
    name:
        A string referring to runtime configuration variable.

    value:
        The value to set the variable to.

    Returns
    -------
    Nothing

    Raises
    ------
    A RuntimeError if the key does not exist, or if the value cannot be
    changed to the requested value.
    """
    # set_global reports failure through a non-empty message string.
    error_message = _glconnect.get_unity().set_global(name, value)
    if error_message != "":
        raise RuntimeError(error_message)
def test_exception(self):
    """Check that writing to ``self.url`` through unity raises ``IOError``."""
    with self.assertRaises(IOError):
        glconnect.get_unity().__write__(self.url, '.....')