Example #1
    def test_parallel_evaluation(self):
        xin = 33
        repeat = 8
        # execute the task bulk using one process to get a baseline
        start_time = time.time()
        glconnect.get_unity().eval_lambda(lambda x: [fib(i) for i in x],
                                          [xin for i in range(repeat)])
        single_thread_time = time.time() - start_time
        logging.info("Single thread lambda eval takes %s secs" %
                     single_thread_time)

        # execute the task in parallel
        start_time = time.time()
        ans_list = glconnect.get_unity().parallel_eval_lambda(
            lambda x: fib(x), [xin for i in range(repeat)])
        multi_thread_time = time.time() - start_time
        logging.info("Multi thread lambda eval takes %s secs" %
                     multi_thread_time)

        # test the speed up by running in parallel
        nproc = multiprocessing.cpu_count()
        if (nproc > 1 and multi_thread_time > (single_thread_time / 1.5)):
            logging.warning(
                "Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs"
                % (single_thread_time, nproc, multi_thread_time))

        # test accuracy
        ans = fib(xin)
        for a in ans_list:
            self.assertEqual(a, ans)
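
The lambda-evaluation tests in this listing call a module-level fib helper and assume the usual imports, which the snippets omit. A minimal sketch of that assumed scaffolding (the glconnect import path is a guess, not taken from the original code):

# Assumed, not part of the original snippet: imports and the fib helper the tests call.
import time
import logging
import multiprocessing

import graphlab.connect.main as glconnect   # import path is an assumption


def fib(n):
    # Naive recursion keeps each lambda evaluation CPU-bound, which is what
    # makes the single-thread vs. parallel timing comparison meaningful.
    return 1 if n <= 2 else fib(n - 1) + fib(n - 2)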
Example #2
 def test_simple_evaluation(self):
     x = 3
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 0), 3)
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 1), 4)
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda x: x.upper(), 'abc'), 'ABC')
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda x: x.lower(), 'ABC'), 'abc')
     self.assertEqual(glconnect.get_unity().eval_lambda(fib, 1), 1)
Example #3
 def _test_read_write_helper(self, url, content):
     url = graphlab.util._make_internal_url(url)
     glconnect.get_unity().__write__(url, content)
     content_read = glconnect.get_unity().__read__(url)
     self.assertEqual(content_read, content)
     if os.path.exists(url):
         os.remove(url)
Example #4
 def _test_read_write_helper(self, url, content_expected):
     s3url = graphlab.util._make_internal_url(url)
     glconnect.get_unity().__write__(s3url, content_expected)
     content_read = glconnect.get_unity().__read__(s3url)
     self.assertEqual(content_read, content_expected)
     (status, output) = commands.getstatusoutput('aws s3 rm --region us-west-2 ' + url)
     if status != 0:
         logging.getLogger(__name__).warning("Cannot remove file: " + url)
Example #5
 def test_exception(self):
     x = 3
     self.assertRaises(RuntimeError,
                       glconnect.get_unity().eval_lambda, lambda y: x / y,
                       0)
     self.assertRaises(RuntimeError,
                       glconnect.get_unity().parallel_eval_lambda,
                       lambda y: x / y, [0 for i in range(10)])
Example #6
 def _test_read_write_helper(self, url, content_expected):
     url = graphlab.util._make_internal_url(url)
     glconnect.get_unity().__write__(url, content_expected)
     content_read = glconnect.get_unity().__read__(url)
     self.assertEqual(content_read, content_expected)
     # clean up the file we wrote
     status, output = commands.getstatusoutput('hadoop fs -test -e ' + url)
     if status == 0:
         commands.getstatusoutput('hadoop fs -rm ' + url)
Example #7
 def test_exception(self):
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("/root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("/root/tmp.model"))
Example #8
 def test_exception(self):
     self.assertRaises(ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world'))
     self.assertRaises(ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("remote:///root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("remote:///root/tmp.model"))
Example #9
    def test_crash_recovery(self):
        ls = range(1000)

        def good_fun(x):
            return x

        def bad_fun(x):
            if (x % 251 == 0):
                cy_test_utils.force_exit_fun()  # this will force the worker process to exit
            return x
        self.assertRaises(RuntimeError, lambda: glconnect.get_unity().parallel_eval_lambda(lambda x: bad_fun(x), ls))
        glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), ls)
Example #10
 def test_simple_evaluation(self):
     x = 3
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 0),
                      3)
     self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 1),
                      4)
     self.assertEqual(
         glconnect.get_unity().eval_lambda(lambda x: x.upper(), 'abc'),
         'ABC')
     self.assertEqual(
         glconnect.get_unity().eval_lambda(lambda x: x.lower(), 'ABC'),
         'abc')
     self.assertEqual(glconnect.get_unity().eval_lambda(fib, 1), 1)
Example #11
 def test_exception(self):
     bad_url = "hdfs:///root/"
     if self.has_hdfs:
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(bad_url + "x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(bad_url + "mygraph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(bad_url + "x.model"))
     else:
         logging.getLogger(__name__).info("No hdfs avaiable. Test pass.")
Example #12
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')

        """
        # Save to a temporary pickle file.
        temp_file = tempfile.mktemp()
        self._save_to_pickle(temp_file)

        # Write the pickle file to an OARC
        if not self.__proxy__:
            self.__proxy__ = _gl.extensions._PythonModel()

        # The proxy contains the file.
        self.__proxy__.temp_file = temp_file
        wrapper = self._get_wrapper()
        return glconnect.get_unity().save_model(self.__proxy__,
                          _make_internal_url(location), wrapper)
Example #13
    def test_crash_recovery(self):
        ls = range(1000)

        def good_fun(x):
            return x

        def bad_fun(x):
            if (x % 251 == 0):
                cy_test_utils.force_exit_fun()  # this will force the worker process to exit
            return x

        self.assertRaises(
            RuntimeError, lambda: glconnect.get_unity().parallel_eval_lambda(
                lambda x: bad_fun(x), ls))
        glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), ls)
Example #14
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')

        """
        # Save to a temporary pickle file.
        temp_file = tempfile.mktemp()
        self._save_to_pickle(temp_file)

        # Write the pickle file to an OARC
        if not self.__proxy__:
            self.__proxy__ = _gl.extensions._PythonModel()

        # The proxy contains the file.
        self.__proxy__.temp_file = temp_file
        wrapper = self._get_wrapper()
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                wrapper)
Example #15
 def __test_model_save_load_helper__(self, model):
     with util.TempDirectory() as f:
         model.save(f)
         m2 = get_unity().load_model(f)
         self.assertItemsEqual(model.list_fields(), m2.list_fields())
         for key in model.list_fields():
             if type(model.get(key)) is SGraph:
                 self.assertItemsEqual(
                     model.get(key).summary(),
                     m2.get(key).summary())
                 self.assertItemsEqual(
                     model.get(key).get_fields(),
                     m2.get(key).get_fields())
             elif type(model.get(key)) is SFrame:
                 sf1 = model.get(key)
                 sf2 = m2.get(key)
                 self.assertEqual(len(sf1), len(sf2))
                 self.assertItemsEqual(sf1.column_names(),
                                       sf2.column_names())
                 df1 = sf1.to_dataframe()
                 print df1
                 df2 = sf2.to_dataframe()
                 print df2
                 df1 = df1.set_index(df1.columns[0])
                 df2 = df2.set_index(df2.columns[0])
                 assert_frame_equal(df1, df2)
             else:
                 if (type(model.get(key)) is pd.DataFrame):
                     assert_frame_equal(model.get(key), m2.get(key))
                 else:
                     self.assertEqual(model.get(key), m2.get(key))
Example #16
    def save(self, location):
        """
        Save the transformer into a GraphLab archive. The object is saved as a
        directory which can then be loaded using the
        :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> model.save('my_model_file')
            >>> loaded_model = gl.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                             _make_internal_url(location), self._get_wrapper())
Example #17
    def test_pagerank(self):
        if "pagerank" in get_unity().list_toolkit_functions():
            m = gl.pagerank.create(self.graph)
            print m
            m.summary()
            self.assertEqual((m.get('pagerank').num_rows(), m.get('pagerank').num_cols()),
                             (self.graph.summary()['num_vertices'], 3))
            self.assertAlmostEqual(m['pagerank']['pagerank'].sum(), 2727.5348, delta=1e-3)
            self.__test_model_save_load_helper__(m)

            m2 = gl.pagerank.create(self.graph, reset_probability=0.5)
            print m2
            self.assertEqual((m2.get('pagerank').num_rows(), m2.get('pagerank').num_cols()),
                             (self.graph.summary()['num_vertices'], 3))
            self.assertAlmostEqual(m2['pagerank']['pagerank'].sum(), 7087.0791, delta=1e-3)
            with self.assertRaises(Exception):
                assert_frame_equal(m.get('pagerank').topk('pagerank'), m2.get('pagerank').topk('pagerank'))
            self.__test_model_save_load_helper__(m2)

            default_options = gl.pagerank.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 3)
            self.assertTrue(default_options['reset_probability'] == 0.15)
            self.assertTrue(default_options['threshold'] == 1e-2)
            self.assertTrue(default_options['max_iterations'] == 20)

            current_options = m2.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 3)
            self.assertTrue(current_options['reset_probability'] == 0.5)
            self.assertTrue(current_options['threshold'] == 1e-2)
            self.assertTrue(current_options['max_iterations'] == 20)
Example #18
    def save(self, location):
        """
        Save the transformer into a GraphLab archive. The object is saved as a
        directory which can then be loaded using the
        :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote
            URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> model.save('my_model_file')
            >>> loaded_model = gl.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #19
 def __test_model_save_load_helper__(self, model):
     with util.TempDirectory() as f:
         model.save(f)
         m2 = get_unity().load_model(f)
         self.assertItemsEqual(model.list_fields(), m2.list_fields())
         for key in model.list_fields():
             if type(model.get(key)) is SGraph:
                 self.assertItemsEqual(model.get(key).summary(), m2.get(key).summary())
                 self.assertItemsEqual(model.get(key).get_fields(), m2.get(key).get_fields())
             elif type(model.get(key)) is SFrame:
                 sf1 = model.get(key)
                 sf2 = m2.get(key)
                 self.assertEqual(len(sf1), len(sf2))
                 self.assertItemsEqual(sf1.column_names(), sf2.column_names())
                 df1 = sf1.to_dataframe()
                 print df1
                 df2 = sf2.to_dataframe()
                 print df2
                 df1 = df1.set_index(df1.columns[0])
                 df2 = df2.set_index(df2.columns[0])
                 assert_frame_equal(df1, df2)
             else:
                 if (type(model.get(key)) is pd.DataFrame):
                     assert_frame_equal(model.get(key), m2.get(key))
                 else:
                     self.assertEqual(model.get(key), m2.get(key))
Example #20
 def test_exception(self):
     if self.has_s3:
         bad_bucket = "i_am_a_bad_bucket"
         prefix = "s3://" + bad_bucket
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(prefix + "/x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(prefix + "/x.model"))
     else:
         logging.getLogger(__name__).info("No s3 bucket avaiable. Test pass.")
Example #21
def get_graphlab_object_type(url):
    '''
    Given the url where a GraphLab Create object is persisted, return the GraphLab
    Create object type: 'model', 'graph', 'sframe', or 'sarray'
    '''
    ret = _glconnect.get_unity().get_graphlab_object_type(_make_internal_url(url))

    # to be consistent, we use sgraph instead of graph here
    if ret == 'graph':
        ret = 'sgraph'
    return ret
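
A brief usage sketch for the helper above, assuming the graphlab module is imported as elsewhere in this listing ('my_frame' is an illustrative path, not from the original code):

sf = graphlab.SFrame({'x': [1, 2, 3]})
sf.save('my_frame')                            # SFrames are saved as directories
print get_graphlab_object_type('my_frame')     # expected: 'sframe'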
Example #22
    def save(self, url):
        """
        Save the neuralnet to url.

        Parameters
        ----------
        url : str
            The URL to save the network.

        Examples
        --------
        >>> import graphlab as gl
        >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
        >>> net.save('mnist.conf')

        See Also
        --------
        graphlab.deeplearning.load
        """
        _gl_connect.get_unity().__write__(_make_internal_url(url), self.__config_str__())
Example #23
def get_graphlab_object_type(url):
    '''
    Given the url where a GraphLab Create object is persisted, return the GraphLab
    Create object type: 'model', 'graph', 'sframe', or 'sarray'
    '''
    ret = _glconnect.get_unity().get_graphlab_object_type(_make_internal_url(url))

    # to be consistent, we use sgraph instead of graph here
    if ret == 'graph':
        ret = 'sgraph'
    return ret
    def save(self, url):
        """
        Save the neuralnet to url.

        Parameters
        ----------
        url : str
            The URL to save the network.

        Examples
        --------
        >>> import graphlab as gl
        >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
        >>> net.save('mnist.conf')

        See Also
        --------
        graphlab.deeplearning.load
        """
        _gl_connect.get_unity().__write__(_make_internal_url(url),
                                          self.__config_str__())
Example #25
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive, if not, use glunpickler to load
    # as pure python model

    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is a http location, skip the check, and directly proceed
    # to load model as dir_archive. This is because
    # 1) exists() does not work with http protocol, and
    # 2) GLUnpickler does not support http
    if (not file_util.get_protocol(location) in ['http', 'https']) and \
            (not file_util.exists(location +  '/dir_archive.ini')):
        # Not a ToolkitError so try unpickling the model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()

        # Return the model
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
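
A hedged sketch of how the two branches above get exercised; 'graph' is a placeholder SGraph and 'my_python_model' an illustrative path, neither taken from the original code:

# Native toolkit model: saved as a dir_archive directory, so load_model takes
# the dir_archive branch and hands the URL to the unity server.
m = graphlab.pagerank.create(graph)
m.save('pr_model')
m_native = load_model('pr_model')

# A pure-Python model saved with gl_pickle has no dir_archive.ini, so load_model
# falls back to GLUnpickler and rebuilds the object via cls._load_version().
m_python = load_model('my_python_model')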
Example #26
    def test_graph_coloring(self):
        if "graph_coloring" in get_unity().list_toolkit_functions():
            m = gl.graph_coloring.create(self.graph)
            print m
            m.summary()
            # coloring is non-deterministic, so we cannot verify the result here
            self.__test_model_save_load_helper__(m)

            default_options = gl.graph_coloring.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #27
    def test_triangle_counting(self):
        if "triangle_counting" in get_unity().list_toolkit_functions():
            m = gl.triangle_counting.create(self.graph)
            print m
            m.summary()
            self.__test_model_save_load_helper__(m)
            self.assertEqual(m.get('num_triangles'), 934)

            default_options = gl.triangle_counting.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #28
    def test_triangle_counting(self):
        if "triangle_counting" in get_unity().list_toolkit_functions():
            m = gl.triangle_counting.create(self.graph)
            print m
            m.summary()
            self.__test_model_save_load_helper__(m)
            self.assertEqual(m.get('num_triangles'), 934)

            default_options = gl.triangle_counting.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #29
    def test_graph_coloring(self):
        if "graph_coloring" in get_unity().list_toolkit_functions():
            m = gl.graph_coloring.create(self.graph)
            print m
            m.summary()
            # coloring is non-deterministic, so we cannot verify the result here
            self.__test_model_save_load_helper__(m)

            default_options = gl.graph_coloring.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #30
    def test_parallel_evaluation(self):
        xin = 33
        repeat = 8
        # execute the task bulk using one process to get a baseline
        start_time = time.time()
        glconnect.get_unity().eval_lambda(lambda x: [fib(i) for i in x], [xin for i in range(repeat)])
        single_thread_time = time.time() - start_time
        logging.info("Single thread lambda eval takes %s secs" % single_thread_time)

        # execute the task in parallel
        start_time = time.time()
        ans_list = glconnect.get_unity().parallel_eval_lambda(lambda x: fib(x), [xin for i in range(repeat)])
        multi_thread_time = time.time() - start_time
        logging.info("Multi thread lambda eval takes %s secs" % multi_thread_time)

        # test the speed up by running in parallel
        nproc = multiprocessing.cpu_count()
        if (nproc > 1 and multi_thread_time > (single_thread_time / 1.5)):
            logging.warning("Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs" % (single_thread_time, nproc, multi_thread_time))

        # test accuracy
        ans = fib(xin)
        for a in ans_list:
            self.assertEqual(a, ans)
Example #31
    def test_connected_component(self):
        if "connected_component" in get_unity().list_toolkit_functions():
            m = gl.connected_components.create(self.graph)
            print m
            m.summary()
            print m.get('component_id')
            print m.get('component_size')
            self.assertEqual(m['component_size'].num_rows(), 1)
            self.__test_model_save_load_helper__(m)

            default_options = gl.connected_components.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #32
    def test_connected_component(self):
        if "connected_component" in get_unity().list_toolkit_functions():
            m = gl.connected_components.create(self.graph)
            print m
            m.summary()
            print m.get('component_id')
            print m.get('component_size')
            self.assertEqual(m['component_size'].num_rows(), 1)
            self.__test_model_save_load_helper__(m)

            default_options = gl.connected_components.get_default_options()
            self.assertTrue(len(default_options.keys()) == 0)

            current_options = m.get_current_options()
            self.assertTrue(len(current_options.keys()) == 0)
Example #33
def get_runtime_config():
    """
    Returns all the GraphLab Create configuration variables that can be set
    at runtime. See :py:func:`graphlab.set_runtime_config()` to set these
    values and for documentation on the effect of each variable.

    Parameters
    ----------
    None

    Returns
    -------
    A dictionary of {key: value, ...} pairs.
    """
    unity = _glconnect.get_unity()
    return unity.list_globals(True)
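
For reference, a small usage sketch; the key name follows the variables documented under set_runtime_config in Example #51, and the printed default may differ by version:

cfg = get_runtime_config()
print len(cfg)                                            # number of tunable variables
print cfg.get('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY')   # e.g. 2147483648 (2GB)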
Example #34
def get_runtime_config():
    """
    Returns all the GraphLab Create configuration variables that can be set
    at runtime. See :py:func:`graphlab.set_runtime_config()` to set these
    values and for documentation on the effect of each variable.

    Parameters
    ----------
    None

    Returns
    -------
    A dictionary of {key: value, ...} pairs.
    """
    unity = _glconnect.get_unity()
    return unity.list_globals(True)
Example #35
    def test_shortest_path(self):
        if "sssp" in get_unity().list_toolkit_functions():
            m = gl.shortest_path.create(self.graph, source_vid=0)
            print m
            m.summary()
            self.__test_model_save_load_helper__(m)

            m2 = gl.shortest_path.create(self.graph, source_vid=0)
            print m2
            self.__test_model_save_load_helper__(m2)

            # Test get_path function on a simple chain graph and star graph
            chain_graph = gl.SGraph().add_edges(
                [gl.Edge(i, i + 1) for i in range(10)])
            m3 = gl.shortest_path.create(chain_graph, source_vid=0)
            for i in range(10):
                self.assertSequenceEqual(m3.get_path(i),
                                         [(j, float(j)) for j in range(i + 1)])

            star_graph = gl.SGraph().add_edges(
                [gl.Edge(0, i + 1) for i in range(10)])
            m4 = gl.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m4.get_path(i), [(0, 0.0), (i, 1.0)])

            # Test sssp ignoring the existing distance field
            star_graph.vertices['distance'] = 0
            m5 = gl.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m5.get_path(i), [(0, 0.0), (i, 1.0)])

            default_options = gl.shortest_path.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 2)
            self.assertTrue(default_options['weight_field'] == "")
            self.assertTrue(default_options['max_distance'] == 1e30)

            m6 = gl.shortest_path.create(chain_graph,
                                         source_vid=0,
                                         max_distance=3)
            current_options = m6.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 2)
            self.assertTrue(current_options['weight_field'] == "")
            self.assertTrue(current_options['max_distance'] == 3)
Example #36
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    try:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
    except Exception as e:
        if isinstance(e, ToolkitError):
            raise
        else:
            # Not a ToolkitError so try unpickling the model.
            unpickler = gl_pickle.GLUnpickler(location)

            # Get the version
            version = unpickler.load()

            # Load the class name.
            cls_name = unpickler.load()
            cls = _get_class_from_name(cls_name)

            # Load the object with the right version.
            model = cls._load_version(unpickler, version)

            unpickler.close()

            # Return the model
            return model
Example #37
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Note that the diverse_sampler stores the data internally, so you can
        save the model, then load it later and sample from the loaded model
        immediately.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
                                              'feature_1': [3, 1, 2],
                                              'feature_2': [[0, 1], [0, 1], [1, 0]]})
            >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
                                                                    item_id='id',
                                                                    quality_feature='feature_1',
                                                                    similarity_features=['feature_2'])
            >>> sampler.save('my_sampler')
            >>> loaded_sampler = graphlab.load_model('my_sampler')
            >>> loaded_sampler.sample(k=2)
            +-----------+------------+----+
            | feature_1 | feature_2  | id |
            +-----------+------------+----+
            |     2     | [0.0, 1.0] | 1  |
            |     1     | [1.0, 0.0] | 2  |
            +-----------+------------+----+

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #38
  def save(self, location):
    """
    Save the model. The model is saved as a directory which can then be
    loaded using the :py:func:`~graphlab.load_model` method.

    Note that the diverse_sampler stores the data internally, so you can
    save the model, then load it later and sample from the loaded model
    immediately.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    .. sourcecode:: python

        >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
                                          'feature_1': [3, 1, 2],
                                          'feature_2': [[0, 1], [0, 1], [1, 0]]})
        >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
                                                                item_id='id',
                                                                quality_feature='feature_1',
                                                                similarity_features=['feature_2'])
        >>> sampler.save('my_sampler')
        >>> loaded_sampler = graphlab.load_model('my_sampler')
        >>> loaded_sampler.sample(k=2)
        +-----------+------------+----+
        | feature_1 | feature_2  | id |
        +-----------+------------+----+
        |     2     | [0.0, 1.0] | 1  |
        |     1     | [1.0, 0.0] | 2  |
        +-----------+------------+----+

    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
    return glconnect.get_unity().save_model(self.__proxy__,
                           _make_internal_url(location), self._get_wrapper())
Example #39
    def save(self, location):
        """

        Parameters
        ----------
        location: str
            Filename.

        Returns
        -------
        out: None

        Examples
        --------

        """

        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #40
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    return glconnect.get_unity().load_model(_make_internal_url(location))
Example #41
def load_model(location):
    """
    Load any graphlab model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    return glconnect.get_unity().load_model(make_internal_url(location))
Example #42
    def test_shortest_path(self):
        if "sssp" in get_unity().list_toolkit_functions():
            m = gl.shortest_path.create(self.graph, source_vid=0)
            print m
            m.summary()
            self.__test_model_save_load_helper__(m)

            m2 = gl.shortest_path.create(self.graph, source_vid=0)
            print m2
            self.__test_model_save_load_helper__(m2)

            # Test get_path function on a simple chain graph and star graph
            chain_graph = gl.SGraph().add_edges([gl.Edge(i, i + 1) for i in range(10)])
            m3 = gl.shortest_path.create(chain_graph, source_vid=0)
            for i in range(10):
                self.assertSequenceEqual(m3.get_path(i), [(j, float(j)) for j in range(i + 1)])

            star_graph = gl.SGraph().add_edges([gl.Edge(0, i + 1) for i in range(10)])
            m4 = gl.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m4.get_path(i), [(0, 0.0), (i, 1.0)])

            # Test sssp ignoring the existing distance field
            star_graph.vertices['distance'] = 0
            m5 = gl.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m5.get_path(i), [(0, 0.0), (i, 1.0)])

            default_options = gl.shortest_path.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 2)
            self.assertTrue(default_options['weight_field'] == "")
            self.assertTrue(default_options['max_distance'] == 1e30)

            m6 = gl.shortest_path.create(chain_graph, source_vid=0, max_distance=3)
            current_options = m6.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 2)
            self.assertTrue(current_options['weight_field'] == "")
            self.assertTrue(current_options['max_distance'] == 3)
Example #43
    def test_pagerank(self):
        if "pagerank" in get_unity().list_toolkit_functions():
            m = gl.pagerank.create(self.graph)
            print m
            m.summary()
            self.assertEqual(
                (m.get('pagerank').num_rows(), m.get('pagerank').num_cols()),
                (self.graph.summary()['num_vertices'], 3))
            self.assertAlmostEqual(m['pagerank']['pagerank'].sum(),
                                   2727.5348,
                                   delta=1e-3)
            self.__test_model_save_load_helper__(m)

            m2 = gl.pagerank.create(self.graph, reset_probability=0.5)
            print m2
            self.assertEqual(
                (m2.get('pagerank').num_rows(), m2.get('pagerank').num_cols()),
                (self.graph.summary()['num_vertices'], 3))
            self.assertAlmostEqual(m2['pagerank']['pagerank'].sum(),
                                   7087.0791,
                                   delta=1e-3)
            with self.assertRaises(Exception):
                assert_frame_equal(
                    m.get('pagerank').topk('pagerank'),
                    m2.get('pagerank').topk('pagerank'))
            self.__test_model_save_load_helper__(m2)

            default_options = gl.pagerank.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 3)
            self.assertTrue(default_options['reset_probability'] == 0.15)
            self.assertTrue(default_options['threshold'] == 1e-2)
            self.assertTrue(default_options['max_iterations'] == 20)

            current_options = m2.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 3)
            self.assertTrue(current_options['reset_probability'] == 0.5)
            self.assertTrue(current_options['threshold'] == 1e-2)
            self.assertTrue(current_options['max_iterations'] == 20)
Example #44
    def test_kcore(self):
        if "kcore" in get_unity().list_toolkit_functions():
            m = gl.kcore.create(self.graph)
            print m
            m.summary()
            biggest_core = m['core_id'].groupby('core_id', gl.aggregate.COUNT).topk('Count').head(1)
            self.assertEqual(biggest_core['core_id'][0], 6)
            self.assertEqual(biggest_core['Count'][0], 4492)
            self.__test_model_save_load_helper__(m)

            default_options = gl.kcore.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 2)
            self.assertTrue(default_options['kmin'] == 0)
            self.assertTrue(default_options['kmax'] == 10)

            m2 = gl.kcore.create(self.graph, kmin = 1, kmax = 5)
            current_options = m2.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 2)
            self.assertTrue(current_options['kmin'] == 1)
            self.assertTrue(current_options['kmax'] == 5)
Example #45
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track('toolkit.model.save')
        return glconnect.get_unity().save_model(self, make_internal_url(location))
Example #46
    def test_kcore(self):
        if "kcore" in get_unity().list_toolkit_functions():
            m = gl.kcore.create(self.graph)
            print m
            m.summary()
            biggest_core = m['core_id'].groupby(
                'core_id', gl.aggregate.COUNT).topk('Count').head(1)
            self.assertEqual(biggest_core['core_id'][0], 6)
            self.assertEqual(biggest_core['Count'][0], 4492)
            self.__test_model_save_load_helper__(m)

            default_options = gl.kcore.get_default_options()
            print default_options
            self.assertTrue(len(default_options.keys()) == 2)
            self.assertTrue(default_options['kmin'] == 0)
            self.assertTrue(default_options['kmax'] == 10)

            m2 = gl.kcore.create(self.graph, kmin=1, kmax=5)
            current_options = m2.get_current_options()
            print current_options
            self.assertTrue(len(current_options.keys()) == 2)
            self.assertTrue(current_options['kmin'] == 1)
            self.assertTrue(current_options['kmax'] == 5)
Example #47
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = graphlab.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track('toolkit.model.save')
        return glconnect.get_unity().save_model(self,
                                                _make_internal_url(location))
Example #48
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location: str
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')
        """

        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #49
def get_environment_config():
    """
    Returns all the GraphLab Create configuration variables that can only
    be set via environment variables.

    - *GRAPHLAB_FILEIO_WRITER_BUFFER_SIZE*
      The file write buffer size.

    - *GRAPHLAB_FILEIO_READER_BUFFER_SIZE*
      The file read buffer size.

    - *OMP_NUM_THREADS*
      The maximum number of threads to use for parallel processing.

    Parameters
    ----------
    None

    Returns
    -------
    A dictionary of {key: value, ...} pairs.
    """
    unity = _glconnect.get_unity()
    return unity.list_globals(False)
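
A brief usage sketch, assuming the keys listed in the docstring above appear in the returned dictionary:

env = get_environment_config()
for key in ('GRAPHLAB_FILEIO_WRITER_BUFFER_SIZE',
            'GRAPHLAB_FILEIO_READER_BUFFER_SIZE',
            'OMP_NUM_THREADS'):
    print key, env.get(key)   # read-only at runtime; change these via environment variables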
Example #50
def get_environment_config():
    """
    Returns all the GraphLab Create configuration variables that can only
    be set via environment variables.

    - *GRAPHLAB_FILEIO_WRITER_BUFFER_SIZE*
      The file write buffer size.

    - *GRAPHLAB_FILEIO_READER_BUFFER_SIZE*
      The file read buffer size.

    - *OMP_NUM_THREADS*
      The maximum number of threads to use for parallel processing.

    Parameters
    ----------
    None

    Returns
    -------
    A dictionary of {key: value, ...} pairs.
    """
    unity = _glconnect.get_unity()
    return unity.list_globals(False)
Example #51
def set_runtime_config(name, value):
    """
    Sets a runtime configuration value. These configuration values are also
    read from environment variables at program startup if available. See
    :py:func:`graphlab.get_runtime_config()` to get the current values for
    each variable.

    The default configuration is conservatively defined for machines with about
    4-8GB of RAM.

    Note that defaults may change across GraphLab Create versions and the names
    of performance tuning constants may also change as improved algorithms
    are developed and implemented.

    **Basic Configuration Variables**

    - *GRAPHLAB_CACHE_FILE_LOCATIONS*
     The directory in which intermediate SFrames/SArray are stored.
     For instance "/var/tmp".  Multiple directories can be specified separated
     by a colon (ex: "/var/tmp:/tmp") in which case intermediate SFrames will
     be striped across both directories (useful for specifying multiple disks).
     Defaults to /var/tmp if the directory exists, /tmp otherwise.

    - *GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY*
     The maximum amount of memory which will be occupied by *all* intermediate
     SFrames/SArrays. Once this limit is exceeded, SFrames/SArrays will be
     flushed out to temporary storage (as specified by
     `GRAPHLAB_CACHE_FILE_LOCATIONS`). On large systems increasing this as well
     as `GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE` can improve performance
     significantly. Defaults to 2147483648 bytes (2GB).

    - *GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE*
     The maximum amount of memory which will be occupied by any individual
     intermediate SFrame/SArray. Once this limit is exceeded, the
     SFrame/SArray will be flushed out to temporary storage (as specified by
     `GRAPHLAB_CACHE_FILE_LOCATIONS`). On large systems, increasing this as well
     as `GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY` can improve performance
     significantly for large SFrames. Defaults to 134217728 bytes (128MB).

    **ODBC Configuration**

    - *GRAPHLAB_LIBODBC_PREFIX*
     A directory containing libodbc.so. Also see :func:`graphlab.set_libodbc_path`
     and :func:`graphlab.connect_odbc`

    - *GRAPHLAB_ODBC_BUFFER_MAX_ROWS*
     The number of rows to read from ODBC in each batch. Increasing this
     may give better performance at increased memory consumption. Defaults to
     2000.

    - *GRAPHLAB_ODBC_BUFFER_SIZE*
     The maximum ODBC buffer size in bytes when reading. Increasing this may
     give better performance at increased memory consumption. Defaults to 3GB.

    **Sort Performance Configuration**

    - *GRAPHLAB_SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE*
     The number of random rows to sample from the SFrame to estimate the
     sort pivots used to partition the sort. Defaults to 2000000.

    - *GRAPHLAB_SFRAME_SORT_BUFFER_SIZE*
     The maximum estimated memory consumption sort is allowed to use. Increasing
     this will increase the size of each sort partition, and will increase
     performance with increased memory consumption. Defaults to 2GB.

    **Join Performance Configuration**

    - *GRAPHLAB_SFRAME_JOIN_BUFFER_NUM_CELLS*
     The maximum number of cells to buffer in memory. Increasing this will
     increase the size of each join partition and will increase performance
     with increased memory consumption.
     If you have very large cells (very long strings for instance),
     decreasing this value will help decrease memory consumption.
     Defaults to 52428800.

    **Groupby Aggregate Performance Configuration**

    - *GRAPHLAB_SFRAME_GROUPBY_BUFFER_NUM_ROWS*
     The number of groupby keys cached in memory. Increasing this will increase
     performance with increased memory consumption. Defaults to 1048576.

    **Advanced Configuration Variables**

    - *GRAPHLAB_SFRAME_FILE_HANDLE_POOL_SIZE*
     The maximum number of file handles to use when reading SFrames/SArrays.
     Once this limit is exceeded, file handles will be recycled, reducing
     performance. This limit should be rarely approached by most SFrame/SArray
     operations. Large SGraphs, however, may create a large number of SFrames
     in which case increasing this limit may improve performance (You may
     also need to increase the system file handle limit with "ulimit -n").
     Defaults to 128.

    Parameters
    ----------
    name: A string referring to a runtime configuration variable.

    value: The value to set the variable to.

    Returns
    -------
    Nothing

    Raises
    ------
    A RuntimeError if the key does not exist, or if the value cannot be
    changed to the requested value.

    """
    unity = _glconnect.get_unity()
    ret = unity.set_global(name, value)
    if ret != "":
        raise RuntimeError(ret)
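
A hedged usage sketch of the set/get round trip described above (the value is illustrative):

# Raise the per-file cache limit to 256MB, then read it back.
set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 268435456)
print get_runtime_config()['GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE']

# Unknown keys are rejected with a RuntimeError, as documented above.
try:
    set_runtime_config('NOT_A_REAL_VARIABLE', 1)
except RuntimeError as e:
    print e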
Example #52
 def test_exception(self):
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(self.url, '.....'))