def train(fpath):
    """Fit a GraphLab factorization recommender on a ratings CSV and return it.

    The file is loaded with pandas, its ``DateTime`` column is dropped and
    every user that appears in fewer than 10 rows is removed.  The remaining
    rows are split with ``random_split_by_user``
    (``item_test_proportion=0.2``, ``random_seed=1234``); the model is
    trained on the first part and its evaluation on the second part is
    printed.

    Parameters
    ----------
    fpath : str
        Location of the CSV file (anything ``pandas.read_csv`` accepts).
        It must provide ``DateTime``, ``SubId``, ``UserId`` and ``Rating``
        columns.

    Returns
    -------
    The trained factorization recommender.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Remove users with fewer than 10 ratings.  The counts are computed once
    # and rows are filtered with a vectorised mask rather than a Python-level
    # loop doing one label lookup per row.
    rating_counts = df.UserId.value_counts()
    sparse_users = set(rating_counts[rating_counts < 10].index)
    df = df[~df.UserId.isin(sparse_users)]

    sf = gl.SFrame(df)
    # Single-argument print calls behave identically on Python 2 and 3.
    print('finished reading in data')

    training, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=1234)
    rcmder = gl.recommender.factorization_recommender.create(
        training,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        regularization=1e-5)
    print('finished training model')
    print(rcmder.evaluate(test, target='Rating'))
    return rcmder
def train(fpath):
    """Grid-search and fit a GraphLab item-similarity recommender.

    The CSV is loaded with pandas, its ``DateTime`` column is dropped and
    every user that appears in fewer than 10 rows is removed.  The data is
    split twice with ``random_split_by_user`` into training, validation and
    test parts.  Every combination of similarity type
    (``jaccard``/``cosine``/``pearson``) and threshold (``10**-8`` ..
    ``10**0``) is trained on the training part and scored by overall RMSE on
    the validation part; the best combination is then retrained on
    training + validation and its RMSE evaluation on the test part is
    printed.

    Parameters
    ----------
    fpath : str
        Location of the CSV file (anything ``pandas.read_csv`` accepts).
        It must provide ``DateTime``, ``SubId``, ``UserId`` and ``Rating``
        columns.

    Returns
    -------
    The recommender trained with the best (similarity type, threshold) pair.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Remove users with fewer than 10 ratings.  The counts are computed once
    # and rows are filtered with a vectorised mask rather than a Python-level
    # loop doing one label lookup per row.
    rating_counts = df.UserId.value_counts()
    sparse_users = set(rating_counts[rating_counts < 10].index)
    df = df[~df.UserId.isin(sparse_users)]

    sf = gl.SFrame(df)
    # Single-argument print calls behave identically on Python 2 and 3.
    print('finished reading in data')

    dataset, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.25,
        random_seed=3456)

    stype = ['jaccard', 'cosine', 'pearson']
    thres = [10 ** e for e in range(-8, 1)]
    res = {}
    min_rmse = 99999.0
    coor_min_rmse = (stype[0], thres[0])
    for j in stype:
        for i in thres:
            rcmder = gl.recommender.item_similarity_recommender.create(
                training,
                user_id='UserId',
                item_id='SubId',
                target='Rating',
                threshold=i,
                similarity_type=j)
            rmse = rcmder.evaluate(
                validate, metric='rmse', target='Rating')['rmse_overall']
            res[(j, i)] = rmse
            # Strict comparison: the first combination reaching the minimum
            # is the one that is kept.
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (j, i)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse,
                                                       min_rmse))

    # Retrain on training + validation with the winning hyper-parameters.
    rcmder = gl.recommender.item_similarity_recommender.create(
        dataset,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        threshold=coor_min_rmse[1],
        similarity_type=coor_min_rmse[0])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder
def test_for_object_scalar_creation(self, level=rlevel):
    """Ticket #816

    ``np.object_`` must hand back the plain Python object for scalar input
    (``None`` when called without arguments) and an object-dtype ndarray for
    sequence input.
    """
    # Build every value up front, then check them.
    nothing = np.object_()
    from_int = np.object_(3)
    from_float = np.object_(3.0)
    from_list = np.object_([4,5])
    from_mixed = np.object_([None, {}, []])

    assert nothing is None
    expected_types = (
        (from_int, int),
        (from_float, float),
        (from_list, np.ndarray),
    )
    for created, expected in expected_types:
        assert type(created) is expected
    for array in (from_list, from_mixed):
        assert array.dtype == object
def test_for_object_scalar_creation(self): import numpy as np import sys a = np.object_() b = np.object_(3) b2 = np.object_(3.0) c = np.object_([4, 5]) d = np.array([None])[0] assert a is None assert type(b) is int assert type(b2) is float assert type(c) is np.ndarray assert c.dtype == object assert type(d) is type(None) if '__pypy__' in sys.builtin_module_names: skip('not implemented yet') e = np.object_([None, {}, []]) assert e.dtype == object
def test_isscalar_numpy_array_scalars(self):
    """``is_scalar`` must report True for each kind of NumPy array scalar."""

    def numpy_scalars():
        # A generator keeps construction lazy: each scalar is built right
        # before it is checked.
        yield np.int64(1)
        yield np.float64(1.)
        yield np.int32(1)
        yield np.object_('foobar')
        yield np.str_('foobar')
        yield np.unicode_(u('foobar'))
        yield np.bytes_(b'foobar')
        yield np.datetime64('2014-01-01')
        yield np.timedelta64(1, 'h')

    for scalar in numpy_scalars():
        self.assertTrue(is_scalar(scalar))
def test_isscalar_numpy_array_scalars(self):
    """``lib.isscalar`` must report True for each kind of NumPy array scalar."""

    def numpy_scalars():
        # A generator keeps construction lazy: each scalar is built right
        # before it is checked.
        yield np.int64(1)
        yield np.float64(1.0)
        yield np.int32(1)
        yield np.object_("foobar")
        yield np.str_("foobar")
        yield np.unicode_(u("foobar"))
        yield np.bytes_(b"foobar")
        yield np.datetime64("2014-01-01")
        yield np.timedelta64(1, "h")

    for scalar in numpy_scalars():
        self.assertTrue(lib.isscalar(scalar))
def test_generic_roundtrip(self):
    """Round-trip a set of NumPy scalars and check value and type survive."""
    samples = [
        np.int_(1),
        np.int32(-2),
        np.float_(2.5),
        np.nan,
        -np.inf,
        np.inf,
        np.datetime64('2014-01-01'),
        np.str_('foo'),
        np.unicode_('bar'),
        np.object_({'a': 'b'}),
        np.complex_(1 - 2j)
    ]
    for original in samples:
        restored = self.roundtrip(original)
        assert_equal(restored, original)
        # The restored object must be an instance of the original's type.
        expected_type = type(original)
        self.assertTrue(isinstance(restored, expected_type))
def test_generic_roundtrip():
    """Round-trip a set of NumPy scalars and check value and type survive."""
    samples = [
        np.int_(1),
        np.int32(-2),
        np.float_(2.5),
        np.nan,
        -np.inf,
        np.inf,
        np.datetime64('2014-01-01'),
        np.str_('foo'),
        np.unicode_('bar'),
        np.object_({'a': 'b'}),
        np.complex_(1 - 2j),
    ]
    for original in samples:
        restored = roundtrip(original)
        assert_equal(restored, original)
        # The restored object must be an instance of the original's type.
        expected_type = type(original)
        assert isinstance(restored, expected_type)
def test_generic_roundtrip(self):
    """Round-trip a set of NumPy scalars and check value and type survive.

    The test is skipped when ``self.should_skip`` is set.
    """
    if self.should_skip:
        return self.skip('numpy is not importable')

    samples = [
        np.int_(1),
        np.int32(-2),
        np.float_(2.5),
        np.nan,
        -np.inf,
        np.inf,
        np.datetime64('2014-01-01'),
        np.str_('foo'),
        np.unicode_('bar'),
        np.object_({'a': 'b'}),
        np.complex_(1 - 2j),
    ]
    for original in samples:
        restored = self.roundtrip(original)
        assert_equal(restored, original)
        # The restored object must be an instance of the original's type.
        expected_type = type(original)
        self.assertTrue(isinstance(restored, expected_type))
def test_generic_roundtrip(self):
    """Round-trip a set of NumPy scalars and check value and type survive.

    The test is skipped when ``self.should_skip`` is set.
    """
    if self.should_skip:
        return self.skip("numpy is not importable")

    samples = [
        np.int_(1),
        np.int32(-2),
        np.float_(2.5),
        np.nan,
        -np.inf,
        np.inf,
        np.datetime64("2014-01-01"),
        np.str_("foo"),
        np.unicode_("bar"),
        np.object_({"a": "b"}),
        np.complex_(1 - 2j),
    ]
    for original in samples:
        restored = self.roundtrip(original)
        assert_equal(restored, original)
        # The restored object must be an instance of the original's type.
        expected_type = type(original)
        self.assertTrue(isinstance(restored, expected_type))
" - Enter for yes, continue\n" " - n then Enter for no, abort\n" ">>> ".format(muteswanfile.name)) if ohgoonthen == "n": quit() birdtable = [row for row in birdreader] if birdtable[-1][0] != "END": ohgoonthen = input("END row not found ({} may be malformed), continue?\n" " - Enter for yes, continue\n" " - n then Enter for no, abort\n" ">>> ".format(birdfile.name)) if ohgoonthen == "n": quit() # Extract column headers (and prints them, to make sure they're what you expect) muteswanheader = np.object_(muteswantable[0]) print("\n" + str(muteswanheader), "\n") birdheader = np.object_(birdtable[0]) print("\n" + str(birdheader), "\n") # Extract body of the data table, to be indexed by [row, column] (here removes headers and "END" row) muteswandata = np.object_(muteswantable[1:-1]) birddata = np.object_(birdtable[1:-1]) # Extract data of location column muteswanloc = muteswandata[:, 0] birdloc = birddata[:, 0] # Extract population data, to be indexed by [site, year] muteswanpop = muteswandata[:, 1:-5] birdpop = birddata[:, 1:-5] # Function that finds all the strings containing non-numeric characters, vectorised for use on the data nonnumericmatcher = np.vectorize(lambda x: bool(re.compile("[^0-9]").search(x)))
def assert_equal_matlab_format(a, b):
    """Assert that ``a`` is the MATLAB-compatible representation of ``b``.

    ``b`` is always the original object and ``a`` the value checked against
    it.  Nothing is returned; a mismatch surfaces as ``AssertionError``.

    Parameters
    ----------
    a : object
        The value under test; expected to be at least two dimensional.
    b : object
        The original value.

    Raises
    ------
    AssertionError
        If ``a`` is not the expected representation of ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read. If it is empty, it has shape (1, 0). A numpy.str_ has all
    # of its strings per row compacted together. A numpy.bytes_ string
    # has to have the same thing done, but then it needs to be converted
    # up to UTF-32 and to numpy.str_ through uint32.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.

    # NumPy 2.0 removed the ``np.unicode_`` alias; on Python 3 it was the
    # same type as ``np.str_``, so fall back to that when it is missing.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict:
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            assert_equal_matlab_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_matlab_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert type(a) == np.ndarray
            assert a.dtype == np.dtype('float64')
            assert a.shape == (1, 0)
        elif (sys.hexversion >= 0x03000000
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, (bytes, unicode, bytearray))):
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'))
            elif isinstance(b, (bytes, bytearray)):
                assert_equal(a, np.atleast_2d(np_unicode(b.decode())))
            else:
                assert_equal(a, np.atleast_2d(np_unicode(b)))
        else:
            assert_equal(a, np.atleast_2d(np.array(b)))
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple()
                                    or b.shape == (0, )):
                    assert_equal(a, np.zeros(shape=(1, 0), dtype='U'))
                elif b.dtype.char == 'U':
                    # Merge the last axis into one string per row
                    # (4 bytes per UTF-32 character).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype(
                        'U' + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    # Merge the last axis into one byte string per row,
                    # then widen each byte to uint32 and reinterpret the
                    # result as UTF-32 text.
                    c = np.atleast_1d(b)
                    c = c.view(np.dtype(
                        'S' + str(c.shape[-1]*c.dtype.itemsize)))
                    c = np.uint32(c.view(np.dtype('uint8')))
                    c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    c = np.atleast_2d(b)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert set(a.dtype.names) == set(b.dtype.names)
                    assert a.dtype.names == b.dtype.names
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            assert_equal_from_matlab(a[k][index],
                                                     b[k][index])
        else:
            c = np.atleast_2d(b)
            assert a.dtype == c.dtype
            assert a.shape == c.shape
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index])
def assert_equal_none_format(a, b):
    """Assert that ``a`` is the expected representation of the original ``b``.

    ``b`` is always the original object and ``a`` the value checked against
    it.  Nothing is returned; a mismatch surfaces as ``AssertionError``.

    Parameters
    ----------
    a : object
        The value under test.
    b : object
        The original value.

    Raises
    ------
    AssertionError
        If ``a`` is not the expected representation of ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes_ type, the dtype, shape, and
    # elements must all be the same. If it is string_ type, we must
    # convert to uint32 and then everything can be compared.

    # NumPy 2.0 removed the ``np.unicode_`` alias; on Python 3 it was the
    # same type as ``np.str_``, so fall back to that when it is missing.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict:
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            assert_equal_none_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_none_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.float64([])
            assert type(a) == np.ndarray
            assert a.dtype == np.float64([]).dtype
            assert a.shape == (0, )
        elif isinstance(b, (bytes, bytearray)):
            # The original tested the same types under both Python major
            # versions, so no version check is needed here.
            assert a == np.bytes_(b)
        elif (sys.hexversion >= 0x03000000
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, unicode)):
            assert_equal_none_format(a, np_unicode(b))
        else:
            # ``[()]`` turns the 0-d array into a NumPy scalar.
            assert_equal_none_format(a, np.array(b)[()])
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    assert_equal(a,
                                 np.zeros(shape=tuple(), dtype=b.dtype.char))
                elif b.dtype.char == 'U':
                    # Unicode text is compared as its UTF-32 code units.
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype == b.dtype
                    assert a.shape == b.shape
                    npt.assert_equal(a, b)
            else:
                assert a.dtype == b.dtype
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert np.prod(a.shape) == np.prod(b.shape)
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                npt.assert_equal(a, b)
        else:
            assert a.dtype == b.dtype
            assert a.shape == b.shape
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index])
def assert_equal_none_format(a, b, options=None):
    """Assert that ``a`` is the expected representation of the original ``b``.

    ``b`` is always the original object and ``a`` the value checked against
    it.  Nothing is returned; a mismatch surfaces as ``AssertionError``.

    Parameters
    ----------
    a : object
        The value under test.
    b : object
        The original value.
    options : object, optional
        Passed through to every recursive/helper call.  Its
        ``dict_like_keys_name`` and ``dict_like_values_name`` attributes are
        read when ``b`` is a dictionary whose keys cannot all be used as
        field names, so it must not be ``None`` in that case.

    Raises
    ------
    AssertionError
        If ``a`` is not the expected representation of ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes_ type, the dtype, shape, and
    # elements must all be the same. If it is string_ type, we must
    # convert to uint32 and then everything can be compared. Big longs
    # and ints get written as numpy.bytes_.

    # NumPy 2.0 removed the ``np.unicode_`` alias; on Python 3 it was the
    # same type as ``np.str_``, so fall back to that when it is missing.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert type(a) == np.ndarray
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                          np_unicode: lambda x: str(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                          np_unicode: lambda x: unicode(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            try:
                # Only whether the conversion succeeds matters here.
                tp_conv(k)
            except Exception:
                all_str_keys = False
                break
        if all_str_keys:
            assert set(a.dtype.names) == set([tp_conv_str(k)
                                              for k in b.keys()])
            for k in b:
                assert_equal_none_format(a[tp_conv_str(k)][0],
                                         b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]]
            values = a[names[1]]
            assert_equal_none_format(keys, tuple(b.keys()), options)
            assert_equal_none_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_none_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.float64([])
            assert type(a) == np.ndarray
            assert a.dtype == np.float64([]).dtype
            assert a.shape == (0, )
        elif isinstance(b, (bytes, bytearray)):
            # The original tested the same types under both Python major
            # versions, so no version check is needed here.
            assert a == np.bytes_(b)
        elif (sys.hexversion >= 0x03000000
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, unicode)):
            assert_equal_none_format(a, np_unicode(b), options)
        elif (sys.hexversion >= 0x03000000
                and type(b) == int) \
                or (sys.hexversion < 0x03000000
                    and type(b) == long):
            # NOTE(review): b == 2**63 takes the np.int64 branch although
            # int64 cannot hold it; confirm these bounds mirror the writer.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal_none_format(a, np.bytes_(b), options)
            else:
                assert_equal_none_format(a, np.int64(b), options)
        else:
            # ``[()]`` turns the 0-d array into a NumPy scalar.
            assert_equal_none_format(a, np.array(b)[()], options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    assert_equal(a,
                                 np.zeros(shape=tuple(), dtype=b.dtype.char),
                                 options)
                elif b.dtype.char == 'U':
                    # Unicode text is compared as its UTF-32 code units.
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype == b.dtype
                    assert a.shape == b.shape
                    npt.assert_equal(a, b)
            else:
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert np.prod(a.shape) == np.prod(b.shape)
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                # If there was a null in the dtype, then it was written
                # as a Group so the field order could have changed.
                if '\\x00' in str(b.dtype):
                    assert set(a.dtype.descr) == set(b.dtype.descr)
                    # Reorder the fields of a.
                    c = np.empty(shape=b.shape, dtype=b.dtype)
                    for n in b.dtype.names:
                        c[n] = a[n]
                else:
                    c = a
                assert c.dtype == b.dtype
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', RuntimeWarning)
                    npt.assert_equal(c, b)
        else:
            assert a.dtype == b.dtype
            assert a.shape == b.shape
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index], options)
def assert_equal_matlab_format(a, b, options=None):
    """Assert that ``a`` is the MATLAB-compatible representation of ``b``.

    ``b`` is always the original object and ``a`` the value checked against
    it.  Nothing is returned; a mismatch surfaces as ``AssertionError``.

    Parameters
    ----------
    a : object
        The value under test; expected to be at least two dimensional.
    b : object
        The original value.
    options : object, optional
        Passed through to every recursive/helper call.  Its
        ``dict_like_keys_name`` and ``dict_like_values_name`` attributes are
        read when ``b`` is a dictionary whose keys cannot all be used as
        field names, so it must not be ``None`` in that case.

    Raises
    ------
    AssertionError
        If ``a`` is not the expected representation of ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read unless they were stored as a numpy.bytes_ due to having
    # non-ASCII characters. If it is empty, it has shape (1, 0). A
    # numpy.str_ has all of its strings per row compacted together. A
    # numpy.bytes_ string has to have the same thing done, but then it
    # needs to be converted up to UTF-32 and to numpy.str_ through
    # uint32. Big longs and ints end up getting converted to UTF-16
    # uint16's when written and read back as UTF-32 numpy.unicode_.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.

    # NumPy 2.0 removed the ``np.unicode_`` alias; on Python 3 it was the
    # same type as ``np.str_``, so fall back to that when it is missing.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert type(a) == np.ndarray
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                          np_unicode: lambda x: str(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                          np_unicode: lambda x: unicode(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            try:
                # Only whether the conversion succeeds matters here.
                tp_conv(k)
            except Exception:
                all_str_keys = False
                break
        if all_str_keys:
            assert set(a.dtype.names) == set([tp_conv_str(k)
                                              for k in b.keys()])
            for k in b:
                assert_equal_matlab_format(a[tp_conv_str(k)][0],
                                           b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]][0]
            values = a[names[1]][0]
            assert_equal_matlab_format(keys, tuple(b.keys()), options)
            assert_equal_matlab_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_matlab_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert type(a) == np.ndarray
            assert a.dtype == np.dtype('float64')
            assert a.shape == (1, 0)
        elif (sys.hexversion >= 0x03000000
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, (bytes, unicode, bytearray))):
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'), options)
            elif isinstance(b, (bytes, bytearray)):
                # Pure-ASCII bytes are expected back as text; anything
                # else is expected to have stayed a byte string.
                try:
                    c = np_unicode(b.decode('ASCII'))
                except Exception:
                    c = np.bytes_(b)
                assert_equal(a, np.atleast_2d(c), options)
            else:
                assert_equal(a, np.atleast_2d(np_unicode(b)), options)
        elif (sys.hexversion >= 0x03000000
                and type(b) == int) \
                or (sys.hexversion < 0x03000000
                    and type(b) == long):
            # NOTE(review): b == 2**63 takes the np.int64 branch although
            # int64 cannot hold it; confirm these bounds mirror the writer.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal(a, np.atleast_2d(np_unicode(b)), options)
            else:
                assert_equal(a, np.atleast_2d(np.int64(b)), options)
        else:
            assert_equal(a, np.atleast_2d(np.array(b)), options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple()
                                    or b.shape == (0, )):
                    assert_equal(a, np.zeros(shape=(1, 0), dtype='U'),
                                 options)
                elif b.dtype.char == 'U':
                    # Merge the last axis into one string per row
                    # (4 bytes per UTF-32 character).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype(
                        'U' + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    c = np.atleast_1d(b).view(np.ndarray)
                    # Only pure-ASCII byte strings are widened to UTF-32
                    # text; others are compared as bytes.
                    if np.all(c.view(np.uint8) < 128):
                        c = c.view(np.dtype(
                            'S' + str(c.shape[-1]*c.dtype.itemsize)))
                        c = c.view(np.dtype('uint8'))
                        c = np.uint32(c.view(np.dtype('uint8')))
                        c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    c = np.atleast_2d(b)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert set(a.dtype.names) == set(b.dtype.names)
                    # The ordering of fields must be preserved if the
                    # MATLAB_fields attribute could be used, which can
                    # only be done if there are no non-ascii characters
                    # in any of the field names.
                    if sys.hexversion >= 0x03000000:
                        allfields = ''.join(b.dtype.names)
                    else:
                        allfields = unicode('').join(
                            [nm.decode('UTF-8')
                             for nm in b.dtype.names])
                    if np.all(np.array([ord(ch) < 128
                                        for ch in allfields])):
                        assert a.dtype.names == b.dtype.names
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            assert_equal_from_matlab(a[k][index],
                                                     b[k][index],
                                                     options)
        else:
            c = np.atleast_2d(b)
            assert a.dtype == c.dtype
            assert a.shape == c.shape
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index], options)
def vcf2numpy(filename,
              return_mat=True,
              return_taxa=True,
              return_vrnt_chrgrp=True,
              return_vrnt_phypos=True,
              return_vrnt_name=True):
    """
    Extract information from a vcf file.

    Parameters
    ----------
    filename : str
        String indicating VCF file name.
    return_mat : bool
        Whether to return the genotype matrix.
        The genotype matrix is formatted as (m x n x p) where m is the number
        of chromosome phases (2, diploid for almost all cases), n is the
        number of taxa/individuals, p is the number of genomic
        markers/variants.
    return_taxa : bool
        Whether to return the taxa/individual name array.
    return_vrnt_chrgrp : bool
        Whether to return the variant chromosome number array.
        This is the chromosome number the marker/variant is assigned.
    return_vrnt_phypos : bool
        Whether to return the variant chromosome physical position array.
        This is the physical position on the assigned chromosome for the
        marker/variant.
    return_vrnt_name : bool
        Whether to return the variant name array.

    Returns
    -------
    out : dict
        A dictionary containing the desired data types.
        Possible fields are:
            Field         | Data type     | Description
            --------------|---------------|----------------
            "mat"         | numpy.int8    | genotype matrix
            "taxa"        | numpy.object_ | taxa/individual name array
            "vrnt_chrgrp" | numpy.int64   | variant chromosome number array
            "vrnt_phypos" | numpy.int64   | variant chromosome physical position array
            "vrnt_name"   | numpy.object_ | variant name array

    Raises
    ------
    ValueError
        If ``return_vrnt_chrgrp`` is set and a variant's ``CHROM`` value
        cannot be converted with ``int()``.
    """
    # make VCF iterator
    # (the original passed the undefined name ``fname`` here, which raised
    # NameError on every call)
    vcf = cyvcf2.VCF(filename)

    # extract taxa names from vcf header
    taxa = vcf.samples

    # make empty lists to store extracted values
    mat = []
    vrnt_chrgrp = []
    vrnt_phypos = []
    vrnt_name = []

    # iterate through VCF file and accumulate variants
    for variant in vcf:
        if return_vrnt_chrgrp:
            # append chromosome integer
            vrnt_chrgrp.append(int(variant.CHROM))

        if return_vrnt_phypos:
            # append variant position coordinates
            vrnt_phypos.append(variant.POS)

        if return_vrnt_name:
            # append marker name
            vrnt_name.append(str(variant.ID))

        if return_mat:
            # extract allele states + whether they are phased or not
            phases = numpy.int8(variant.genotypes)

            # append genotype states (first two columns only, copied so the
            # slice does not keep the full ``phases`` array alive)
            mat.append(phases[:, 0:2].copy())

    # construct a dictionary of values
    out_dict = {}

    if return_mat:
        # convert and transpose genotype matrix:
        # (variant, taxa, phase) -> (phase, taxa, variant)
        out_dict["mat"] = numpy.int8(mat).transpose(2, 1, 0)
    if return_taxa:
        # convert to object array
        out_dict["taxa"] = numpy.object_(taxa)
    if return_vrnt_chrgrp:
        # convert to int64 array
        out_dict["vrnt_chrgrp"] = numpy.int64(vrnt_chrgrp)
    if return_vrnt_phypos:
        # convert to int64 array
        out_dict["vrnt_phypos"] = numpy.int64(vrnt_phypos)
    if return_vrnt_name:
        # convert to object array
        out_dict["vrnt_name"] = numpy.object_(vrnt_name)

    # return output dictionary
    return out_dict
# For Done, 0 represents incomplete, 1 represents complete for count check, lasers check and cover check coldefaults = {"Frame": np.arange(capn), **{col: np.nan for col in dfenv.columns}, "ScaleOK": -1, **{i: np.nan for i in scalestats}, **{name: 0 for name in faunanames}, **{covertype: 0 for covertype in covertypes}, "Done": 0, "LastEdited": "nan"} try: # Filenames are of format "AllData-Z.csv", where Z is the version nr csvnames = sorted(glob("AllData-*.csv"), key=getfileno) if len(csvnames) == 0: csvinname = "Matching Files" raise FileNotFoundError csvinname = csvnames[-1] dfout = pd.read_csv(csvinname) csvoutname = "AllData-{}.csv".format(getfileno(csvinname) + 1) coldefaultskeys = np.object_(list(coldefaults.keys())) colsinmask = np.isin(coldefaultskeys, dfout.columns) if not np.all(colsinmask): print(coldefaultskeys[~colsinmask], "not found, adding") dfout = dfout.assign(**{key: coldefaults[key] for key in coldefaultskeys[~colsinmask]}) print("Loaded in", csvinname) except FileNotFoundError: csvoutname = "AllData-0.csv" print("No {} found, creating new DataFrame".format(csvinname)) print("{} frames displayed and in".format(np.ceil(capn / skipspeed)), csvoutname) dfout = pd.DataFrame(coldefaults) dataoutindex = np.argwhere(dfout["Frame"] == pos)[0][0] # TODO: (After assignment) add help interface, automatic graph scaling, etc try: while True:
'/..' '/..') table = read_html(table.get_attribute("innerHTML"))[0] for i in range(3): for j in range(5): backyearbutton.click() table = driver.find_element_by_xpath('//table[@class="maintable"]' '/tbody[@id="wr_webs_report"]' '/..' '/..') table = read_html(table.get_attribute("innerHTML"))[0] muteswantable = pd.concat([table, table[table.columns[2:7]]], axis=1) cols = muteswantable.columns.tolist() cols = [cols[0]] + cols[12:] + cols[2:7] + cols[8:11] muteswantable = muteswantable[cols] muteswantable = np.object_(muteswantable) for row in muteswantable: muteswanwriter.writerow(row) if muteswanpage == totalmuteswanpages: muteswanwriter.writerow(["END"] * 12) break else: nextpagebutton.click() muteswanpage += 1 # winsound.Beep(2500, 1000) muteswanfile.close() print(" - Data table extracted.\n\n", muteswantable, "\n") # Finds the location dropdown menu and clicks it
maskname="at sites with WWT centres") showsave("WWT site " + goosenames[k]) for k in range(len(ducknames)): plotpop(ducknames[k], duckpops[k], ducklocs[k], selectsites(ducklocs[k], WWTsitenames), maskname="at sites with WWT centres") showsave("WWT site " + ducknames[k]) # Plot the population of all swan species at each site with a WWT centre if "WWTcombinedpop" in whichplots: for sitename in WWTsitenames: plotpop("Swan", np.object_([ swanpops[k][swanlocs[k] == sitename] for k in range(len(swannames)) ]), swannames, maskname="at " + sitename) showsave("Swan Species at " + sitename) plotpop("Goose", np.object_([ goosepops[k][gooselocs[k] == sitename] for k in range(len(goosenames)) ]), goosenames, maskname="at " + sitename) showsave("Goose Species at " + sitename) plotpop("Duck", np.object_([ duckpops[k][ducklocs[k] == sitename]
def assert_equal_matlab_format(a, b, options=None):
    """Assert that ``a`` equals ``b`` as read back in MATLAB-compatible form.

    ``b`` is always the original value and ``a`` is what was read back.

    * If ``b`` is a ``dict``/``OrderedDict``, ``a`` must be a structured
      ndarray. When every key can be converted to ``str`` the field
      names must be the keys; otherwise the fields must be the two names
      given by ``options.dict_like_keys_name`` and
      ``options.dict_like_values_name``. The values are then compared
      recursively.
    * If ``b`` is a collection type (list, tuple, set, frozenset, or
      deque), the comparison is made with ``b`` converted to an object
      array.
    * If ``b`` is not a numpy type (is not and does not inherit from
      ``np.generic`` or ``np.ndarray``), it is converted to the
      appropriate numpy type first. Otherwise both are supposed to be
      numpy types.
    * Object arrays are compared element by element.
    * Non-string arrays must have the same dtype, shape, and elements.
    * All strings are converted to ``numpy.str_`` on read unless they
      were stored as ``numpy.bytes_`` due to having non-ASCII
      characters. An empty string has shape ``(1, 0)``. A ``numpy.str_``
      has all of its strings per row compacted together. A
      ``numpy.bytes_`` string has the same thing done, but is then
      converted up to UTF-32 and to ``numpy.str_`` through uint32.
    * Python ints (``long`` on Python 2) that do not fit in an int64 are
      expected back as their text representation.

    In all cases, things are expected to be at least two dimensional
    arrays.

    Parameters
    ----------
    a : the value that was read back.
    b : the original value that was written.
    options : object forwarded to the nested comparison helpers; only
        ``dict_like_keys_name`` and ``dict_like_values_name`` are read
        here, and only for dicts whose keys are not all string-like.

    Raises
    ------
    AssertionError
        From the ``assert`` statements and ``npt.assert_equal`` when the
        two values differ (presumably also from the ``assert_equal*``
        helpers defined elsewhere in this module).
    """
    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert_equal_nose(type(a), np.ndarray)
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np.unicode_: lambda x: str(x),
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np.unicode_: lambda x: unicode(x),
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            # A key of a supported type can still fail to decode as
            # UTF-8; that also disqualifies the "keys as field names"
            # layout.
            try:
                tp_conv(k)
            except UnicodeError:
                all_str_keys = False
                break

        if all_str_keys:
            assert_equal_nose(set(a.dtype.names),
                              set([tp_conv_str(k) for k in b.keys()]))
            for k in b:
                assert_equal_matlab_format(a[tp_conv_str(k)][0],
                                           b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert_equal_nose(set(a.dtype.names), set(names))
            keys = a[names[0]][0]
            values = a[names[1]][0]
            assert_equal_matlab_format(keys, tuple(b.keys()), options)
            assert_equal_matlab_format(values, tuple(b.values()),
                                       options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_matlab_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert_equal_nose(type(a), np.ndarray)
            assert_equal_nose(a.dtype, np.dtype('float64'))
            assert_equal_nose(a.shape, (1, 0))
        elif (sys.hexversion >= 0x03000000
              and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, (bytes, unicode, bytearray))):
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'),
                             options)
            elif isinstance(b, (bytes, bytearray)):
                # Pure ASCII byte strings are expected back as unicode;
                # anything else stays as bytes.
                try:
                    c = np.unicode_(b.decode('ASCII'))
                except UnicodeDecodeError:
                    c = np.bytes_(b)
                assert_equal(a, np.atleast_2d(c), options)
            else:
                assert_equal(a, np.atleast_2d(np.unicode_(b)), options)
        elif (sys.hexversion >= 0x03000000 and type(b) == int) \
                or (sys.hexversion < 0x03000000 and type(b) == long):
            # int64 covers [-2**63, 2**63 - 1]. Only values outside that
            # range are expected back as text; everything inside it
            # (both end points included) must compare as int64.
            if b > 2**63 - 1 or b < -(2**63):
                assert_equal(a, np.atleast_2d(np.unicode_(b)), options)
            else:
                assert_equal(a, np.atleast_2d(np.int64(b)), options)
        else:
            assert_equal(a, np.atleast_2d(np.array(b)), options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple()
                                    or b.shape == (0, )):
                    assert_equal(a, np.zeros(shape=(1, 0), dtype='U'),
                                 options)
                elif b.dtype.char == 'U':
                    # Compact each row into a single string (4 bytes per
                    # UTF-32 code point).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype(
                        'U' + str(c.shape[-1] * c.dtype.itemsize // 4))))
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
                else:
                    # b.dtype.char == 'S'
                    c = np.atleast_1d(b).view(np.ndarray)
                    if np.all(c.view(np.uint8) < 128):
                        # All ASCII: compact each row, then widen the
                        # bytes to uint32 and reinterpret as UTF-32.
                        c = c.view(np.dtype(
                            'S' + str(c.shape[-1] * c.dtype.itemsize)))
                        c = np.uint32(c.view(np.dtype('uint8')))
                        c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert_equal_nose(set(a.dtype.names),
                                      set(b.dtype.names))
                    # The ordering of fields must be preserved if the
                    # MATLAB_fields attribute could be used, which can
                    # only be done if there are no non-ascii characters
                    # in any of the field names.
                    if sys.hexversion >= 0x03000000:
                        allfields = ''.join(b.dtype.names)
                    else:
                        allfields = unicode('').join(
                            [nm.decode('UTF-8')
                             for nm in b.dtype.names])
                    if np.all(np.array([ord(ch) < 128
                                        for ch in allfields])):
                        assert_equal_nose(a.dtype.names, b.dtype.names)
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            assert_equal_from_matlab(a[k][index],
                                                     b[k][index],
                                                     options)
        else:
            c = np.atleast_2d(b)
            assert_equal_nose(a.dtype, c.dtype)
            assert_equal_nose(a.shape, c.shape)
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index], options)
o["data-reg"], o["data-migratoriness"], o["data-ranginess"]] for o in iddatasoup.find_all("option")[1:]] print(" - - Writing site names to 'birdnames.csv'...") # Save these to the file for later use birdnameswriter = csv.writer(birdiddatafile) birdnameswriter.writerow(["Name", "ID", "Taxon", "IsSummerMigrant", "MigrantStatus", "Range"]) for row in birdiddata: print(row) birdnameswriter.writerow(row) else: # Otherwise read the names from a previously saved file birdiddatafile = open("birdnames.csv", "r", encoding="utf-8", newline="") birdiddata = [row for row in csv.reader(birdiddatafile)] # Always sure to close the file afterwards (Although python would probably clean up otherwise anyway) birdiddatafile.close() birdiddata = np.object_(birdiddata[1:]) print(birdiddata) birdnames = birdiddata[:, 0] print(" - - Bird names etc ready.\n\n", birdnames, "\n - Bird names etc extracted.") while True: # Take input of regex to match a set of bird names whose tables to download birdname = input("\nEnter bird name or regex pattern, case insensitive\n>>> ") print("Finding bird name matches...") matcher = re.compile(birdname, re.IGNORECASE) chosenbirdnames = list(filter(matcher.search, birdnames)) if len(chosenbirdnames) == 0: print(" - No bird name matches found.") # Simply go back and ask again if none match the request continue print(" - Bird name matches found ({}).\n\n".format(len(chosenbirdnames)), "\n".join(chosenbirdnames), sep="")
def assert_equal_none_format(a, b, options=None):
    """Assert that ``a`` equals ``b`` as read back without a MATLAB format.

    ``b`` is always the original value and ``a`` is what was read back.

    * If ``b`` is a ``dict``/``OrderedDict``, ``a`` must be a structured
      ndarray. When every key can be converted to ``str`` the field
      names must be the keys; otherwise the fields must be the two names
      given by ``options.dict_like_keys_name`` and
      ``options.dict_like_values_name``. The values are then compared
      recursively.
    * If ``b`` is a collection type (list, tuple, set, frozenset, or
      deque), the comparison is made with ``b`` converted to an object
      array.
    * If ``b`` is not a numpy type (is not and does not inherit from
      ``np.generic`` or ``np.ndarray``), it is converted to the
      appropriate numpy type first. Otherwise both are supposed to be
      numpy types.
    * Object arrays are compared element by element.
    * Non-string arrays must have the same dtype, shape, and elements.
    * An empty string would have been stored as just a null byte
      (handled by recursing into the generic comparison).
    * For a ``bytes_`` type, the dtype, shape, and elements must all be
      the same. A unicode string type is converted to uint32 before
      everything is compared.
    * Python ints (``long`` on Python 2) that do not fit in an int64 are
      expected back as ``numpy.bytes_``.

    Parameters
    ----------
    a : the value that was read back.
    b : the original value that was written.
    options : object forwarded to the nested comparison helpers; only
        ``dict_like_keys_name`` and ``dict_like_values_name`` are read
        here, and only for dicts whose keys are not all string-like.

    Raises
    ------
    AssertionError
        From the ``assert`` statements and ``npt.assert_equal`` when the
        two values differ (presumably also from the ``assert_equal*``
        helpers defined elsewhere in this module).
    """
    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert_equal_nose(type(a), np.ndarray)
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np.unicode_: lambda x: str(x),
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np.unicode_: lambda x: unicode(x),
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            # A key of a supported type can still fail to decode as
            # UTF-8; that also disqualifies the "keys as field names"
            # layout.
            try:
                tp_conv(k)
            except UnicodeError:
                all_str_keys = False
                break

        if all_str_keys:
            assert_equal_nose(set(a.dtype.names),
                              set([tp_conv_str(k) for k in b.keys()]))
            for k in b:
                assert_equal_none_format(a[tp_conv_str(k)][0],
                                         b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]]
            values = a[names[1]]
            assert_equal_none_format(keys, tuple(b.keys()), options)
            assert_equal_none_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_none_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.float64([])
            assert_equal_nose(type(a), np.ndarray)
            assert_equal_nose(a.dtype, np.float64([]).dtype)
            assert_equal_nose(a.shape, (0, ))
        elif isinstance(b, (bytes, bytearray)):
            # Same check on Python 2 and 3, so no version switch needed.
            assert_equal_nose(a, np.bytes_(b))
        elif (sys.hexversion >= 0x03000000 and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000
                    and isinstance(b, unicode)):
            assert_equal_none_format(a, np.unicode_(b), options)
        elif (sys.hexversion >= 0x03000000 and type(b) == int) \
                or (sys.hexversion < 0x03000000 and type(b) == long):
            # int64 covers [-2**63, 2**63 - 1]. Only values outside that
            # range are expected back as bytes; everything inside it
            # (both end points included) must compare as int64.
            if b > 2**63 - 1 or b < -(2**63):
                assert_equal_none_format(a, np.bytes_(b), options)
            else:
                assert_equal_none_format(a, np.int64(b), options)
        else:
            assert_equal_none_format(a, np.array(b)[()], options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    assert_equal(a,
                                 np.zeros(shape=tuple(),
                                          dtype=b.dtype.char),
                                 options)
                elif b.dtype.char == 'U':
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
                else:
                    assert_equal_nose(a.dtype, b.dtype)
                    assert_equal_nose(a.shape, b.shape)
                    npt.assert_equal(a, b)
            else:
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert_equal_nose(np.prod(a.shape), np.prod(b.shape))
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                # If there was a null in the dtype, then it was written
                # as a Group so the field order could have changed.
                if '\\x00' in str(b.dtype):
                    assert_equal_nose(set(a.dtype.descr),
                                      set(b.dtype.descr))
                    # Reorder the fields of a.
                    c = np.empty(shape=b.shape, dtype=b.dtype)
                    for n in b.dtype.names:
                        c[n] = a[n]
                else:
                    c = a
                assert_equal_nose(c.dtype, b.dtype)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', RuntimeWarning)
                    npt.assert_equal(c, b)
        else:
            assert_equal_nose(a.dtype, b.dtype)
            assert_equal_nose(a.shape, b.shape)
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index], options)
def train(fpath):
    """Tune, train and evaluate a factorization recommender on a ratings CSV.

    The CSV at ``fpath`` must contain ``DateTime``, ``SubId``, ``UserId``
    and ``Rating`` columns. The pipeline is:

    1. Drop ``DateTime`` and coerce ``SubId``/``UserId`` to object and
       ``Rating`` to int64.
    2. Remove users with fewer than 50 ratings, then remove items with
       fewer than 50 ratings (counted after the user filter).
    3. Split by user into ``dataset``/``test``
       (``item_test_proportion=0.2``) and split ``dataset`` again into
       ``training``/``validate`` (``item_test_proportion=0.25``).
    4. Grid-search ``num_factors`` x ``regularization`` on
       ``training``, scoring each model by overall RMSE on ``validate``.
    5. Refit on ``dataset`` with the best pair, print the RMSE
       evaluation on ``test`` and return the model.

    Parameters
    ----------
    fpath : path (or anything ``pd.read_csv`` accepts) of the ratings CSV.

    Returns
    -------
    The recommender fitted on ``dataset`` with the selected
    hyper-parameters.
    """
    min_ratings = 50

    def fit(data, num_factors, regularization):
        # Single place for the model configuration shared by the grid
        # search and the final refit.
        return gl.recommender.factorization_recommender.create(
            data, user_id='UserId', item_id='SubId', target='Rating',
            regularization=regularization, num_factors=num_factors)

    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Remove users, then items, with fewer than `min_ratings` ratings.
    # The item counts are taken on the already user-filtered frame, so
    # the order of the two passes matters.
    for column in ('UserId', 'SubId'):
        counts = df[column].value_counts()
        rare = counts[counts < min_ratings].index
        df = df[~df[column].isin(rare)]

    sf = gl.SFrame(df)
    # Single-argument print(...) produces the same output on Python 2
    # and is also valid on Python 3.
    print('finished reading in data')

    dataset, test = gl.recommender.util.random_split_by_user(
        sf, user_id='UserId', item_id='SubId',
        item_test_proportion=0.2, random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset, user_id='UserId', item_id='SubId',
        item_test_proportion=0.25, random_seed=3456)

    numf = [2 ** e for e in range(3, 8)]
    regl = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]

    # Explicit loop (rather than min() over the dict) so that ties keep
    # the first combination met in grid order, and the first grid point
    # remains the fallback if no score compares below the start value.
    res = {}
    min_rmse = float('inf')
    coor_min_rmse = (numf[0], regl[0])
    for num_factors in numf:
        for regularization in regl:
            candidate = fit(training, num_factors, regularization)
            rmse = candidate.evaluate(
                validate, metric='rmse', target='Rating')['rmse_overall']
            res[(num_factors, regularization)] = rmse
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (num_factors, regularization)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse,
                                                       min_rmse))

    rcmder = fit(dataset, coor_min_rmse[0], coor_min_rmse[1])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder
np.timedelta64(np.iinfo(np.int64).min + 1, "ms"), np.timedelta64(42, "us"), np.timedelta64(np.iinfo(np.int64).max, "us"), np.timedelta64(np.iinfo(np.int64).min + 1, "us"), np.timedelta64(42, "ns"), np.timedelta64(np.iinfo(np.int64).max, "ns"), np.timedelta64(np.iinfo(np.int64).min + 1, "ns"), "", "one", "1", True, False, np.bool_(True), np.bool_(False), np.str_("asdf"), np.object_("asdf"), ] DECIMAL_VALUES = [ Decimal("100"), Decimal("0.0042"), Decimal("1.0042"), ] @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_scalar_host_initialization(value): s = cudf.Scalar(value) np.testing.assert_equal(s.value, value) assert s.is_valid() is True
def train(fpath):
    """Tune, train and evaluate a factorization recommender on a ratings CSV.

    The CSV at ``fpath`` must contain ``DateTime``, ``SubId``, ``UserId``
    and ``Rating`` columns. The pipeline is:

    1. Drop ``DateTime`` and coerce ``SubId``/``UserId`` to object and
       ``Rating`` to int64.
    2. Remove users with fewer than 50 ratings, then remove items with
       fewer than 50 ratings (counted after the user filter).
    3. Split by user into ``dataset``/``test``
       (``item_test_proportion=0.2``) and split ``dataset`` again into
       ``training``/``validate`` (``item_test_proportion=0.25``).
    4. Grid-search ``num_factors`` x ``regularization`` on
       ``training``, scoring each model by overall RMSE on ``validate``.
    5. Refit on ``dataset`` with the best pair, print the RMSE
       evaluation on ``test`` and return the model.

    Parameters
    ----------
    fpath : path (or anything ``pd.read_csv`` accepts) of the ratings CSV.

    Returns
    -------
    The recommender fitted on ``dataset`` with the selected
    hyper-parameters.
    """
    min_ratings = 50

    def fit(data, num_factors, regularization):
        # Single place for the model configuration shared by the grid
        # search and the final refit.
        return gl.recommender.factorization_recommender.create(
            data, user_id='UserId', item_id='SubId', target='Rating',
            regularization=regularization, num_factors=num_factors)

    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Remove users, then items, with fewer than `min_ratings` ratings.
    # The item counts are taken on the already user-filtered frame, so
    # the order of the two passes matters.
    for column in ('UserId', 'SubId'):
        counts = df[column].value_counts()
        rare = counts[counts < min_ratings].index
        df = df[~df[column].isin(rare)]

    sf = gl.SFrame(df)
    # Single-argument print(...) produces the same output on Python 2
    # and is also valid on Python 3.
    print('finished reading in data')

    dataset, test = gl.recommender.util.random_split_by_user(
        sf, user_id='UserId', item_id='SubId',
        item_test_proportion=0.2, random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset, user_id='UserId', item_id='SubId',
        item_test_proportion=0.25, random_seed=3456)

    numf = [2 ** e for e in range(3, 8)]
    regl = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]

    # Explicit loop (rather than min() over the dict) so that ties keep
    # the first combination met in grid order, and the first grid point
    # remains the fallback if no score compares below the start value.
    res = {}
    min_rmse = float('inf')
    coor_min_rmse = (numf[0], regl[0])
    for num_factors in numf:
        for regularization in regl:
            candidate = fit(training, num_factors, regularization)
            rmse = candidate.evaluate(
                validate, metric='rmse', target='Rating')['rmse_overall']
            res[(num_factors, regularization)] = rmse
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (num_factors, regularization)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse,
                                                       min_rmse))

    rcmder = fit(dataset, coor_min_rmse[0], coor_min_rmse[1])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder