    '''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
       Ignores everything else. E.g.:

       >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
       True
       >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
       True'''

    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.tiny = tiny.Tzer(ngram)
        self.icu = ICU(ngram)

    def tokenize_real(self, text):
        ws_tokens = text.split()
        tokens = []
        for ws_token in ws_tokens:
            if (is_latin(ws_token)):
                tokens.extend(self.icu.tokenize(ws_token))
            elif (is_japanese(ws_token)):
                tokens.extend(self.tiny.tokenize(ws_token))
        return tokens

# Test-Depends: manual icu
testable.register('''
>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
''')
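# is_latin() and is_japanese() are defined elsewhere in this module and are
# not shown here. Purely as an illustration of the per-word dispatch above --
# this is NOT the module's actual logic, and real code would consult proper
# Unicode script tables -- crude whole-word checks could look like this:

import unicodedata

def is_latin_sketch(word):
    # Treat a word as Latin if every alphabetic character has a LATIN name.
    return all('LATIN' in unicodedata.name(c, '') or not c.isalpha()
               for c in word)

def is_japanese_sketch(word):
    # Treat a word as Japanese if any character is kana or a CJK ideograph.
    return any(unicodedata.name(c, '').startswith(('HIRAGANA', 'KATAKANA',
                                                   'CJK'))
               for c in word)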
    def table_exists_p(self, table):
        return (1 == len(self.sql("""SELECT 1 FROM sqlite_master
                                     WHERE name = ?""", (table,))))

    def table_ct(self):
        'Return the number of tables in the database.'
        return self.sql("SELECT count(*) FROM sqlite_master")[0][0]


testable.register('''
# FIXME: the kludge to silence SpatiaLite fails with AttributeError when
# run under doctest. Therefore, we test without SpatiaLite for now.

# Initialize an in-memory database
>>> db = DB(':memory:', create=True, spatialite=False)
>>> db.is_empty()
True
>>> db.create_table('foo', { 'a': 'int' })

# Does table_ct() work?
>>> db.table_ct()
1
>>> db.is_empty()
False
''')
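# is_empty() is not part of this fragment; the doctest above only exercises
# it. A minimal sketch in terms of table_ct() -- an assumption, not
# necessarily the class's actual definition:

def is_empty(self):
    'Return True if the database contains no tables.'
    return self.table_ct() == 0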
                        for gmm in self.all_gmms])
        if self.verbose:
            for (fv, fi) in self.feature_alphabet.iteritems():
                l.debug('feature weight %s=%g' % (fv, res.x[fi]))
            for (t, w) in di.iteritems():
                l.debug('token weight %s=%s' % (t, str(w)))
        # clean up
        for g in self.all_gmms:
            g.feature_vector = None
        return di


# test that self.all_gmms has stable order
testable.register('''
>>> import gmm
>>> import random
>>> def test_random():
...    u.rand = random.Random(123)
...    gmm.Token.parms_init({})
...    mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...    m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...    m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...    m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...    m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...               [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...               misc_feature=False)
...    return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in xrange(100)))
True
''')
    'tweet_id': -1,
    'created_at': datetime.now(),
    'text': 'a b',
    'user_screen_name': 'c',
    'user_description': 'd',
    'user_lang': 'e',
    'user_location': 'f',
    'user_time_zone': 'g',
    'geom': None,
    'geom_src': None })

T_TW_JSON_CO = r'''{"text":"Guantes, bufanda, tenis y chamarra :) #Viena","id_str":"186339941163339776","contributors":null,"in_reply_to_status_id_str":null,"geo":{"type":"Point","coordinates":[48.24424304,16.37778864]},"retweet_count":0,"in_reply_to_status_id":null,"favorited":false,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitter.com\/#!\/download\/iphone\" rel=\"nofollow\"\u003ETwitter for iPhone\u003C\/a\u003E","created_at":"Sun Apr 01 06:31:18 +0000 2012","in_reply_to_user_id_str":null,"truncated":false,"entities":{"urls":[],"hashtags":[{"text":"Viena","indices":[38,44]}],"user_mentions":[]},"coordinates":{"type":"Point","coordinates":[16.37778864,48.24424304]},"place":{"country":"Austria","place_type":"city","url":"http:\/\/api.twitter.com\/1\/geo\/id\/9f659d51e5c5deae.json","country_code":"AT","bounding_box":{"type":"Polygon","coordinates":[[[16.182302,48.117666],[16.577511,48.117666],[16.577511,48.322574],[16.182302,48.322574]]]},"attributes":{},"full_name":"Vienna, Vienna","name":"Vienna","id":"9f659d51e5c5deae"},"in_reply_to_screen_name":null,"user":{"profile_background_color":"8B542B","id_str":"249409866","profile_background_tile":true,"screen_name":"montse_moso","listed_count":3,"time_zone":"Mexico City","profile_sidebar_fill_color":"ffffff","description":"you It's exhausting being this Juicy \u2764","default_profile":false,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","created_at":"Wed Feb 09 00:21:15 +0000 2011","profile_sidebar_border_color":"f03368","is_translator":false,"contributors_enabled":false,"geo_enabled":true,"url":null,"profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2003516916\/image_normal.jpg","follow_request_sent":null,"profile_use_background_image":true,"lang":"es","verified":false,"profile_text_color":"333333","protected":false,"default_profile_image":false,"show_all_inline_media":false,"notifications":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","location":"","name":"Montse Alcaraz ","favourites_count":415,"profile_link_color":"9D582E","id":249409866,"statuses_count":5252,"following":null,"utc_offset":-21600,"friends_count":368,"followers_count":191,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2003516916\/image_normal.jpg"},"retweeted":false,"id":186339941163339776}'''

# FIXME: add test tweets for the other geotag sources

testable.register('''
# Make sure we don't drop anything through all the parsing and unparsing.
>>> a = from_json(T_TW_JSON_CO)
>>> a.geom_src
'co'
>>> a.created_at
datetime.datetime(2012, 4, 1, 6, 31, 18, tzinfo=<UTC>)
>>> a.day
'2012-04-01'
>>> a == Tweet.from_list(a.to_list())
True
>>> a == Tweet.from_dict(a.to_dict())
True
''')
testable.register('''

# Make sure random seed is set to a known value
>>> rand.random()
0.40224696110279223

# Memoized function fails with TypeError if passed an unhashable argument.
>>> @memoize
... def f(x):
...    return x*2
>>> f(dict())
Traceback (most recent call last):
  ...
TypeError: unhashable type: 'dict'

# Check that memoized reset() works by looking at exposed cache.
>>> f(1)
2
>>> f.cache
{(1,): 2}
>>> f.reset()
>>> f.cache
{}

# More slices. Basically, we want (almost) the same behavior as if we had
# typed the slice into the Python interpreter. The "and None" trick is simply
# to suppress output if the expression is true, so we don't have to keep
# typing "True".
>>> a = [0, 1, 2, 3, 4]
>>> (a[slp(':')] == a) and None
>>> (a[slp('0')] == [a[0]]) and None
>>> (a[slp('4')] == [a[4]]) and None
>>> a[slp('5')]
[]
>>> (a[slp('-1')] == [a[-1]]) and None
>>> (a[slp('-2')] == [a[-2]]) and None
>>> (a[slp('-5')] == [a[-5]]) and None
>>> a[slp('-6')]
[]
>>> (a[slp('1:')] == a[1:]) and None
>>> (a[slp(':1')] == a[:1]) and None
>>> (a[slp('-2:')] == a[-2:]) and None
>>> (a[slp(':-2')] == a[:-2]) and None
>>> (a[slp('1::')] == a[1::]) and None
>>> (a[slp('::1')] == a[::1]) and None
>>> (a[slp('2::')] == a[2::]) and None
>>> (a[slp('::2')] == a[::2]) and None
>>> (a[slp('-1::')] == a[-1::]) and None
>>> (a[slp('::-1')] == a[::-1]) and None

# More unioned slices
>>> pprint(sl_union(10))                        # no slices
set()
>>> pprint(sl_union(0, slp('1')))               # empty list
set()
>>> pprint(sl_union(10, slp('1:4')))            # one slice
set([1, 2, 3])
>>> pprint(sl_union(10, slp('1:4'), slp('3')))  # overlapping slices
set([1, 2, 3])
>>> pprint(sl_union(10, slp('10')))             # fully out of bounds
set()
>>> pprint(sl_union(10, slp('9:11')))           # partly out of bounds
set([9])
>>> pprint(sl_union(10, slp('9'), slp('10')))   # one in, one out
set([9])
''')
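# memoize itself is defined elsewhere in this module; the doctests above pin
# down its observable behavior: a .cache dict keyed by the argument tuple and
# a .reset() method. A minimal sketch consistent with that behavior -- an
# illustration, not the module's actual implementation:

def memoize_sketch(f):
    def wrapper(*args):
        # Hashing the argument tuple raises TypeError for unhashable args,
        # matching the doctest above.
        if args not in wrapper.cache:
            wrapper.cache[args] = f(*args)
        return wrapper.cache[args]
    wrapper.cache = {}
    wrapper.reset = wrapper.cache.clear
    return wrapper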
                            max(self.min_value, gmm.score))
                        for gmm in self.all_gmms])
        if self.verbose:
            for (fv, fi) in self.feature_alphabet.iteritems():
                l.debug('feature weight %s=%g' % (fv, res.x[fi]))
            for (t, w) in di.iteritems():
                l.debug('token weight %s=%s' % (t, str(w)))
        # clean up
        for g in self.all_gmms:
            g.feature_vector = None
        return di


# test that self.all_gmms has stable order
testable.register('''
>>> import gmm
>>> import random
>>> def test_random():
...    u.rand = random.Random(123)
...    gmm.Token.parms_init({})
...    mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...    m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...    m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...    m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...    m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...               [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...               misc_feature=False)
...    return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in xrange(100)))
True
''')
testable.register('''

# Make sure the SRIDs we're interested in are available.
>>> for srid in (4326, 54003, 540033, 540036, 54009, 540093, 540096):
...    if not isinstance(SRS[srid], gdal.SpatialReference): srid

# Test that we can transform to and from the custom SRSes.
>>> a = geos.Point(1, 2, srid=SRID_WGS84)
>>> b = transform(a, 540036)
>>> a.srid
4326
>>> b.coords
(0.111..., 0.220...)
>>> b.srid
540036
>>> c = transform(b, 4326)
>>> c.srid
4326
>>> [round(x, 4) for x in c.coords]
[1.0, 2.0]

# geodesic_area() should except if we give it a bogus geometry type.
>>> geodesic_area(geos.Point(0,0))
Traceback (most recent call last):
  ...
TypeError: need Polygon or MultiPolygon, not Point

# inbounds_p() should work north/south and on an SRS that requires a transform
>>> inbounds_p(geos.Point(0, 89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, 90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, -89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, -90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, 14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, 14671436.1, srid=54003))
False
>>> inbounds_p(geos.Point(0, -14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, -14671436.1, srid=54003))
False

# Ensure that trim() works on multi-part geometries (here, a MultiPoint).
>>> yo = 15e6
>>> yi = 14e6
>>> mp = geos.MultiPoint([geos.Point(0, yi), geos.Point(0, yo)], srid=54003)
>>> trim(mp).coords
(0.0, 14000000.0)
''')
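# transform() is the module's own helper; the doctests above only fix its
# observable behavior (it returns a copy in the target SRS and leaves the
# input geometry untouched). Assuming the geos/gdal names here are Django's
# GIS bindings, as the doctest suggests, a rough sketch could be -- the SRS
# lookup table and clone semantics are assumptions, not the module's code:

from django.contrib.gis import gdal

def transform_sketch(geom, srid_to):
    # Build a coordinate transform from the geometry's SRS to the target one,
    # then apply it to a clone so the original keeps its SRID and coordinates.
    ct = gdal.CoordTransform(SRS[geom.srid], SRS[srid_to])
    return geom.transform(ct, clone=True)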
class Token_All_Pipeline(pipeline.Model):

    def __init__(self, token_iterator):
        assert False, 'unimplemented'
        pipeline.Model.__init__(self, [Token(token_iterator),
                                       All_Tweets(token_iterator)])


### Tests ###

# Test passes as of sklearn.13-git
testable.register('''
# Test that fitting respects consistent random state.
>>> def test_r():
...    r = np.random.mtrand.RandomState(1234)
...    m = sklearn.mixture.GMM(n_components=2, random_state=r)
...    m.fit([1, 1.1, 2, 2.2])
...    return m.sample(10, r)
>>> all((test_r().tolist() == test_r().tolist() for i in xrange(100)))
True
''')

def test_interactive():
    import cProfile
    #prof = cProfile.Profile()
    #prof.enable()
    u.logging_init('inter', verbose_=True)
    test_error_metrics()
    test_interactive_real()
                u.fmt_sparsearray(self.data))

    def save(self, ignore=-1):
        self.total_update()
        if (self.total < ignore):
            return False
        if (self.total <= FRAGMENT_TOTAL_ZMAX):
            data = zlib.compress(self.data.data, ZLEVEL)
        else:
            data = self.data.data
        if (self.source == Fragment_Source.NEW):
            self.group.db.sql("""INSERT INTO data%d (name, dtype, total, data)
                                 VALUES (?, ?, ?, ?)""" % self.shard,
                              (self.name, self.data.dtype.char,
                               self.total, data))
        else:
            self.group.db.sql("""UPDATE data%d SET dtype=?, total=?, data=?
                                 WHERE name=?""" % self.shard,
                              (self.data.dtype.char, self.total,
                               data, self.name))
        return True

    def total_update(self):
        # np.sum() returns a NumPy data type, which confuses SQLite somehow.
        # Therefore, use a plain Python float.
        self.total = float(np.nansum(np.abs(self.data)))


testable.register()
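# Only the write path is shown above: fragments with small totals are
# zlib-compressed before being stored. Whatever reads these rows back has to
# undo that conditional compression before rebuilding the array. A sketch of
# that inverse step under the same convention -- names and details here are
# assumptions, not the module's actual loader:

import zlib
import numpy as np

def fragment_data_decode(blob, dtype_char, total):
    # Mirror save(): blobs whose total is at or below the threshold were
    # compressed; everything else was stored raw.
    raw = zlib.decompress(blob) if total <= FRAGMENT_TOTAL_ZMAX else bytes(blob)
    return np.frombuffer(raw, dtype=np.dtype(dtype_char))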
    def __init__(self, token_iterator):
        assert False, 'unimplemented'
        pipeline.Model.__init__(
            self, [Token(token_iterator), All_Tweets(token_iterator)])


### Tests ###

# Test passes as of sklearn.13-git
testable.register('''
# Test that fitting respects consistent random state.
>>> def test_r():
...    r = np.random.mtrand.RandomState(1234)
...    m = sklearn.mixture.GMM(n_components=2, random_state=r)
...    m.fit([1, 1.1, 2, 2.2])
...    return m.sample(10, r)
>>> all((test_r().tolist() == test_r().tolist() for i in xrange(100)))
True
''')

def test_interactive():
    import cProfile
    #prof = cProfile.Profile()
    #prof.enable()
    u.logging_init('inter', verbose_=True)
    test_error_metrics()
    test_interactive_real()
testable.register('''

# Make sure random seed is set to a known value
>>> rand.random()
0.40224696110279223

# Memoized function fails with TypeError if passed an unhashable argument.
>>> @memoize
... def f(x):
...    return x*2
>>> f(dict())
Traceback (most recent call last):
  ...
TypeError: unhashable type: 'dict'

# Check that memoized reset() works by looking at exposed cache.
>>> f(1)
2
>>> f.cache
{(1,): 2}
>>> f.reset()
>>> f.cache
{}

# More slices. Basically, we want (almost) the same behavior as if we had
# typed the slice into the Python interpreter. The "and None" trick is simply
# to suppress output if the expression is true, so we don't have to keep
# typing "True".
>>> a = [0, 1, 2, 3, 4]
>>> (a[slp(':')] == a) and None
>>> (a[slp('0')] == [a[0]]) and None
>>> (a[slp('4')] == [a[4]]) and None
>>> a[slp('5')]
[]
>>> (a[slp('-1')] == [a[-1]]) and None
>>> (a[slp('-2')] == [a[-2]]) and None
>>> (a[slp('-5')] == [a[-5]]) and None
>>> a[slp('-6')]
[]
>>> (a[slp('1:')] == a[1:]) and None
>>> (a[slp(':1')] == a[:1]) and None
>>> (a[slp('-2:')] == a[-2:]) and None
>>> (a[slp(':-2')] == a[:-2]) and None
>>> (a[slp('1::')] == a[1::]) and None
>>> (a[slp('::1')] == a[::1]) and None
>>> (a[slp('2::')] == a[2::]) and None
>>> (a[slp('::2')] == a[::2]) and None
>>> (a[slp('-1::')] == a[-1::]) and None
>>> (a[slp('::-1')] == a[::-1]) and None

# More unioned slices
>>> sl_union(10)                              # no slices
set()
>>> sl_union(0, slp('1'))                     # empty list
set()
>>> sorted(sl_union(10, slp('1:4')))          # one slice
[1, 2, 3]
>>> sorted(sl_union(10, slp('1:4'), slp('3')))   # overlapping slices
[1, 2, 3]
>>> sl_union(10, slp('10'))                   # fully out of bounds
set()
>>> sl_union(10, slp('9:11'))                 # partly out of bounds
{9}
>>> sl_union(10, slp('9'), slp('10'))         # one in, one out
{9}
''')
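# slp() and sl_union() are defined earlier in the module; the doctests above
# fix their contract: slp() parses a string such as '1:4' or '-2' into a
# slice selecting those elements, and sl_union() returns the set of indices
# (within a given length) selected by any of the slices. A behavioral sketch
# under those assumptions -- not necessarily the real implementations:

def slp_sketch(text):
    parts = text.split(':')
    if len(parts) == 1:
        # a bare index selects a one-element slice, e.g. '4' -> 4:5
        i = int(parts[0])
        return slice(i, i + 1 if i != -1 else None)
    return slice(*(int(p) if p else None for p in parts))

def sl_union_sketch(length, *slices):
    out = set()
    for sl in slices:
        out.update(range(*sl.indices(length)))
    return out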
class TSV_Output_Job(Job):

    '''Mixin for TSV UTF-8 text output. :meth:`reduce_write()` expects a
       sequence of stringifiable objects.'''

    def reduce_open_output(self):
        assert False, 'unimplemented'

    def reduce_write(self, item):
        self.outfp.writerow(item)


testable.register(r'''

# Test data passing from mapper to reducer.
>>> import io
>>> buf = io.BytesIO()
>>> job = Test_Job()
>>> job.outfp = buf
>>> for kv in [(1, -1), (2, -2), (2, -3), (3, -4), (3, -5), (3, -6)]:
...    job.map_write(*kv)
>>> buf.getvalue()
b'1\tgASVBgAAAAAAAABK/////y4=\n2\tgASVBgAAAAAAAABK/v///y4=\n2\tgASVBgAAAAAAAABK/f///y4=\n3\tgASVBgAAAAAAAABK/P///y4=\n3\tgASVBgAAAAAAAABK+////y4=\n3\tgASVBgAAAAAAAABK+v///y4=\n'
>>> buf.seek(0)
0
>>> job.infp = buf
>>> [(k, list(v)) for (k, v) in job.reduce_inputs()]
[('1', [-1]), ('2', [-2, -3]), ('3', [-4, -5, -6])]
''')
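# map_write() and reduce_inputs() belong to the Job base class and are not
# shown here. The expected output above implies the wire format: one
# "<key>\t<base64(pickle(value))>\n" record per call, with reduce_inputs()
# grouping already-sorted records by key. A sketch of that round trip under
# those assumptions (not the class's actual code):

import base64
import pickle
from itertools import groupby

def encode_record(key, value):
    return (str(key).encode('utf8') + b'\t'
            + base64.b64encode(pickle.dumps(value)) + b'\n')

def decode_records(fp):
    # Assumes records arrive grouped by key, as after a MapReduce sort phase.
    pairs = (line.rstrip(b'\n').split(b'\t', 1) for line in fp)
    for (key, group) in groupby(pairs, key=lambda kv: kv[0]):
        yield (key.decode('utf8'),
               (pickle.loads(base64.b64decode(v)) for (_, v) in group))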
    def close(self):
        if (self.writable and self.locked):
            u.lock_release(self.filename)

    def commit(self):
        'Write data to disk.'
        assert (self.writable)
        fp = io.open(self.filename, mode='wb')
        pickle.dump(self.data, fp, pickle.HIGHEST_PROTOCOL)


testable.register('''
>>> import os
>>> import tempfile
>>> testfile = tempfile.mktemp()
>>> a = File(testfile, default=[1,2,3], writable=True)
>>> a.data
[1, 2, 3]
>>> a.data.append(4)
>>> a.data
[1, 2, 3, 4]
>>> a.commit()
>>> del a
>>> b = File(testfile)
>>> b.data
[1, 2, 3, 4]
>>> os.unlink(testfile)
''')
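# The constructor is not part of this fragment; the doctest implies that it
# loads the pickle if the file exists and otherwise falls back to `default`.
# A sketch consistent with that behavior (the real class also appears to take
# a file lock when opened writable, which is omitted here; attribute names
# beyond those used above are assumptions):

import io
import os
import pickle

class File_sketch:

    def __init__(self, filename, default=None, writable=False):
        self.filename = filename
        self.writable = writable
        self.locked = False
        if os.path.exists(filename):
            with io.open(filename, mode='rb') as fp:
                self.data = pickle.load(fp)
        else:
            self.data = default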
testable.register('''

# test that Date_Vector objects can be pickled
>>> import pickle
>>> a = Date_Vector('2013-06-02', np.arange(2, 7))
>>> b = pickle.loads(pickle.dumps(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# make sure repr() objects really can be eval()'ed
>>> b = eval(repr(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# do methods that should return scalars do so?
>>> c = np.arange(2, 7)
>>> c.sum()
20
>>> type(c.sum())
<class 'numpy.int64'>
>>> a.sum()
20
>>> type(a.sum())
<class 'numpy.int64'>
''')
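# Pickling an ndarray subclass does not round-trip extra attributes such as
# first_day unless the subclass arranges it; the tests above imply Date_Vector
# does. The standard NumPy recipe, shown here as a generic sketch rather than
# Date_Vector's actual code, extends __reduce__/__setstate__ and uses
# __array_finalize__ so views and slices keep the attribute too:

import numpy as np

class Tagged_Array(np.ndarray):

    def __new__(cls, tag, data):
        obj = np.asarray(data).view(cls)
        obj.tag = tag
        return obj

    def __array_finalize__(self, obj):
        # Called for views/slices as well as explicit construction.
        self.tag = getattr(obj, 'tag', None)

    def __reduce__(self):
        (ctor, args, state) = super().__reduce__()
        return (ctor, args, state + (self.tag,))

    def __setstate__(self, state):
        self.tag = state[-1]
        super().__setstate__(state[:-1])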
class LocalTimezone(datetime.tzinfo):

    def utcoffset(self, dt):
        if self._isdst(dt):
            return DSTOFFSET
        else:
            return STDOFFSET

    def dst(self, dt):
        if self._isdst(dt):
            return DSTDIFF
        else:
            return ZERO

    def tzname(self, dt):
        return time.tzname[self._isdst(dt)]

    def _isdst(self, dt):
        tt = (dt.year, dt.month, dt.day,
              dt.hour, dt.minute, dt.second,
              dt.weekday(), 0, 0)
        stamp = time.mktime(tt)
        tt = time.localtime(stamp)
        return tt.tm_isdst > 0

local_tz = LocalTimezone()

testable.register('')
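# STDOFFSET, DSTOFFSET, DSTDIFF, and ZERO are defined elsewhere in the module.
# This class follows the LocalTimezone example in the standard library's
# datetime documentation, which defines those constants roughly as below (a
# sketch of that convention, not necessarily this module's exact code):

import time
from datetime import timedelta

ZERO = timedelta(0)
STDOFFSET = timedelta(seconds=-time.timezone)
DSTOFFSET = timedelta(seconds=-time.altzone) if time.daylight else STDOFFSET
DSTDIFF = DSTOFFSET - STDOFFSET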
def init(core_ct_):
    '''This is here because doctest is not able to set module globals without
       fooling around (this is by design). Perhaps in the future it will have
       a real purpose as well. You do not need to call it, as there are
       sensible defaults (in particular, core_ct = 1 -- you must ask for
       parallelism).'''
    assert (core_ct_ >= 1)
    global core_ct
    core_ct = core_ct_


testable.register('''

# Does require_multicore work?
>>> init(1)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)])
[15, 51]
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
Traceback (most recent call last):
  ...
ValueError: multicore forced, but core_ct == 1
>>> init(2)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
[15, 51]

# Don't crash if the length of every is less than core_ct
>>> init(4)
>>> do(f_test, (1, 2), [(1, 1), (2, 2), (3, 3)])
[5, 7, 9]
''')
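# do() itself is defined earlier in the module; the doctests pin down its
# interface: a shared argument tuple is prepended to each per-call tuple in
# `every`, execution is serial when core_ct == 1, and a ValueError is raised
# when multicore is required but unavailable. A sketch of that dispatch under
# those assumptions (chunking and error handling in the real function may
# differ):

import multiprocessing

def do_sketch(func, shared, every, require_multicore=False):
    if core_ct == 1:
        if require_multicore:
            raise ValueError('multicore forced, but core_ct == 1')
        return [func(*(shared + args)) for args in every]
    # Never ask for more workers than there are work items (or zero workers).
    with multiprocessing.Pool(min(core_ct, len(every)) or 1) as pool:
        return pool.starmap(func, [shared + args for args in every])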
T_TW_SIMPLE = Tweet.from_dict({ 'tweet_id': -1,
                                'created_at': datetime.now(),
                                'text': 'a b',
                                'user_screen_name': 'c',
                                'user_description': 'd',
                                'user_lang': 'e',
                                'user_location': 'f',
                                'user_time_zone': 'g',
                                'geom': None,
                                'geom_src': None })

T_TW_JSON_CO = r'''{"text":"Guantes, bufanda, tenis y chamarra :) #Viena","id_str":"186339941163339776","contributors":null,"in_reply_to_status_id_str":null,"geo":{"type":"Point","coordinates":[48.24424304,16.37778864]},"retweet_count":0,"in_reply_to_status_id":null,"favorited":false,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitter.com\/#!\/download\/iphone\" rel=\"nofollow\"\u003ETwitter for iPhone\u003C\/a\u003E","created_at":"Sun Apr 01 06:31:18 +0000 2012","in_reply_to_user_id_str":null,"truncated":false,"entities":{"urls":[],"hashtags":[{"text":"Viena","indices":[38,44]}],"user_mentions":[]},"coordinates":{"type":"Point","coordinates":[16.37778864,48.24424304]},"place":{"country":"Austria","place_type":"city","url":"http:\/\/api.twitter.com\/1\/geo\/id\/9f659d51e5c5deae.json","country_code":"AT","bounding_box":{"type":"Polygon","coordinates":[[[16.182302,48.117666],[16.577511,48.117666],[16.577511,48.322574],[16.182302,48.322574]]]},"attributes":{},"full_name":"Vienna, Vienna","name":"Vienna","id":"9f659d51e5c5deae"},"in_reply_to_screen_name":null,"user":{"profile_background_color":"8B542B","id_str":"249409866","profile_background_tile":true,"screen_name":"montse_moso","listed_count":3,"time_zone":"Mexico City","profile_sidebar_fill_color":"ffffff","description":"you It's exhausting being this Juicy \u2764","default_profile":false,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","created_at":"Wed Feb 09 00:21:15 +0000 2011","profile_sidebar_border_color":"f03368","is_translator":false,"contributors_enabled":false,"geo_enabled":true,"url":null,"profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2003516916\/image_normal.jpg","follow_request_sent":null,"profile_use_background_image":true,"lang":"es","verified":false,"profile_text_color":"333333","protected":false,"default_profile_image":false,"show_all_inline_media":false,"notifications":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","location":"","name":"Montse Alcaraz ","favourites_count":415,"profile_link_color":"9D582E","id":249409866,"statuses_count":5252,"following":null,"utc_offset":-21600,"friends_count":368,"followers_count":191,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2003516916\/image_normal.jpg"},"retweeted":false,"id":186339941163339776}'''

# FIXME: add test tweets for the other geotag sources

testable.register('''
# Make sure we don't drop anything through all the parsing and unparsing.
>>> a = from_json(T_TW_JSON_CO)
>>> a.geom_src
'co'
>>> a.created_at
datetime.datetime(2012, 4, 1, 6, 31, 18, tzinfo=<UTC>)
>>> a.day
'2012-04-01'
>>> a == Tweet.from_list(a.to_list())
True
>>> a == Tweet.from_dict(a.to_dict())
True
''')
    u'''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
        Ignores everything else. E.g.:

        >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
        True
        >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
        True'''

    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.tiny = tiny.Tzer(ngram)
        self.icu = ICU(ngram)

    def tokenize_real(self, text):
        ws_tokens = text.split()
        tokens = []
        for ws_token in ws_tokens:
            if (is_latin(ws_token)):
                tokens.extend(self.icu.tokenize(ws_token))
            elif (is_japanese(ws_token)):
                tokens.extend(self.tiny.tokenize(ws_token))
        return tokens

testable.register(u'''
>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
''')
testable.register(u'''

# FIXME: I haven't figured out how to print the actual Unicode characters in
# order to test them in a natural way. For example, letting the doctest
# "shell" print a Unicode string gets you a heavily encoded string full of
# "\u79c1" escape sequences rather than the characters themselves (you can use
# print to make an individual string work, but that doesn't help for
# sequences). Hence all the tests against True rather than a list.

# Tokenizers should return the empty sequence in some cases
>>> Whitespace(1).tokenize(None)
[]
>>> Whitespace(1).tokenize('')
[]

# ngram < 1 is an error
>>> Whitespace(0).tokenize(None)
Traceback (most recent call last):
  ...
ValueError: ngram must be >= 1, but 0 given

# Test ngrams
>>> Whitespace(1).tokenize('a b c')
['a', 'b', 'c']
>>> Whitespace(2).tokenize('a b c')
['a', 'b', 'c', 'a b', 'b c']
>>> Whitespace(3).tokenize('a b c')
['a', 'b', 'c', 'a b', 'b c', 'a b c']
''')
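# The n-gram expansion itself is handled by the base.Tzer wrapper, not shown
# here. The doctests above imply its behavior: unigrams first, then each
# higher-order n-gram joined with a space. A sketch of that expansion under
# those assumptions (the real wrapper may differ in details):

def ngrams_sketch(unigrams, n):
    out = list(unigrams)
    for size in range(2, n + 1):
        out.extend(' '.join(unigrams[i:i + size])
                   for i in range(len(unigrams) - size + 1))
    return out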
            else:
                tokens.append(cand.lower())
        return tokens


testable.register(u"""
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.DISCARD_SCRIPTS])
True
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.JP_SCRIPTS])
True
>>> UP_Tiny(1).tokenize(base.T_EN) == base.T_EN_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_FR) == base.T_FR_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_JP) == base.T_JP_TOKS
True
>>> (UP_Tiny(1).tokenize(base.T_JP + ' ' + base.T_FR)
...  == base.T_JP_TOKS + base.T_FR_TOKS)
True
>>> UP_Tiny(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_WEIRD) == base.T_WEIRD_TOKS
True
""")
import tinysegmenter

from . import base
import testable


class Tzer(base.Tzer):

    '''A wrapper for the TinySegmenter tokenizer for Japanese. e.g.:

       >>> Tzer(1).tokenize(base.T_JP) == base.T_JP_TOKS
       True'''

    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.seg = tinysegmenter.TinySegmenter()

    def tokenize_real(self, text):
        return [i.lower() for i in self.seg.tokenize(text)]


testable.register('')
    4    5
    dtype: float64
    >>> (X_, y_) = trim_for_fit(X, y, minfinite=1)
    Traceback (most recent call last):
      ...
    Degenerate_Fit_Error: 1 rows left, min=2
    >>> (X_, y_) = trim_for_fit(X, y, minfinite=1, minrows=1)
    >>> X_
       a  b  c
    4  5  5  5
    >>> y_
    4    5
    dtype: float64"""
    assert len(X) == len(y)
    y_keep = pd.notnull(y)
    X_keep = (((X != 0) & X.notnull()).astype(int).sum(axis=1)
              >= minfinite * len(X.columns))
    mask = y_keep & X_keep
    X = X.loc[mask]
    y = y.loc[mask]
    assert len(X) == len(y)
    if len(X) < minrows:
        raise Degenerate_Fit_Error("%d rows left, min=%d" % (len(X), minrows))
    return (X, y)

# Since this stuff is experimental, we don't make the standard test suite
# depend on it.
#
# Test-Depends: manual
testable.register()
            if (key[0] in self.JP_SCRIPTS):
                tokens.extend(self.tiny.tokenize(cand))
            else:
                tokens.append(cand.lower())
        return tokens


testable.register(u'''
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.DISCARD_SCRIPTS])
True
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.JP_SCRIPTS])
True
>>> UP_Tiny(1).tokenize(base.T_EN) == base.T_EN_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_FR) == base.T_FR_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_JP) == base.T_JP_TOKS
True
>>> (UP_Tiny(1).tokenize(base.T_JP + ' ' + base.T_FR)
...  == base.T_JP_TOKS + base.T_FR_TOKS)
True
>>> UP_Tiny(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_WEIRD) == base.T_WEIRD_TOKS
True
''')
            continue


class TSV_Output_Job(Job):

    '''Mixin for TSV UTF-8 text output. :meth:`reduce_write()` expects a
       sequence of stringifiable objects.'''

    def reduce_open_output(self):
        assert False, 'unimplemented'

    def reduce_write(self, item):
        self.outfp.writerow(item)


testable.register(r'''

# Test data passing from mapper to reducer.
>>> from cStringIO import StringIO
>>> buf = StringIO()
>>> job = Test_Job()
>>> job.outfp = buf
>>> for kv in [(1, -1), (2, -2), (2, -3), (3, -4), (3, -5), (3, -6)]:
...    job.map_write(*kv)
>>> buf.getvalue()
'1\tgAJK/////y4=\n2\tgAJK/v///y4=\n2\tgAJK/f///y4=\n3\tgAJK/P///y4=\n3\tgAJK+////y4=\n3\tgAJK+v///y4=\n'
>>> buf.seek(0)
>>> job.infp = buf
>>> [(k, list(v)) for (k, v) in job.reduce_inputs()]
[(u'1', [-1]), (u'2', [-2, -3]), (u'3', [-4, -5, -6])]
''')