示例#1
0
         check.'''
        for tag in self.out_tags:
            if (self.fs.exists(tag)):
                raise ValueError('tag %s already exists' % (tag))

    def run(self, **kw):
        '''Run the job, forcing required_modules to ship the mr_path module.

           Raises ValueError if the caller passes required_modules itself,
           because this subclass owns that keyword.'''
        if 'required_modules' in kw:
            raise ValueError(
                'required_modules is not supported by this subclass')
        module_file = os.path.abspath(mr_path.__file__)
        kw['required_modules'] = [('mr_path', module_file)]
        super(Job, self).run(**kw)


# Some mixins for various handy behavior.


class TSV_Reader_Job(object):
    '''Mixin supplying a map_reader that yields tab-separated fields.'''

    @staticmethod
    def map_reader(fp, size, url, params):
        # fp appears to be an ordinary open file here, at least on
        # single-node runs (not documented upstream); behavior when nodes
        # exchange data over HTTP has not been verified.
        text_stream = io.open(fp.fileno(), encoding='utf8')
        for record in text_stream:
            yield record.split('\t')


# Register an (empty) manual-only doctest string with the testable framework.
testable.manualonly_register('')
示例#2
0
文件: mr_base.py 项目: aronwc/quac
      '''Raise ValueError if any of the output tags already exist. Note that
         there's a race condition here, so it's nothing more than a sanity
         check.'''
      for tag in self.out_tags:
         if (self.fs.exists(tag)):
            raise ValueError('tag %s already exists' % (tag))

   def run(self, **kw):
      '''Run the job, forcing required_modules to ship the mr_path module.

         Raises ValueError if the caller passes required_modules itself,
         because this subclass owns that keyword.'''
      if 'required_modules' in kw:
         raise ValueError('required_modules is not supported by this subclass')
      module_file = os.path.abspath(mr_path.__file__)
      kw['required_modules'] = [('mr_path', module_file)]
      super(Job, self).run(**kw)


# Some mixins for various handy behavior.

class TSV_Reader_Job(object):
   '''Mixin supplying a map_reader that yields tab-separated fields.'''

   @staticmethod
   def map_reader(fp, size, url, params):
      # fp appears to be an ordinary open file here, at least on
      # single-node runs (not documented upstream); behavior when nodes
      # exchange data over HTTP has not been verified.
      text_stream = io.open(fp.fileno(), encoding='utf8')
      for record in text_stream:
         yield record.split('\t')


# Register an (empty) manual-only doctest string with the testable framework.
testable.manualonly_register('')
示例#3
0
class Tiny_ICU(base.Tzer):
    u'''Whitespace-splitting tokenizer that routes Latin chunks to ICU and
       Japanese chunks to Tiny; chunks in any other script are dropped. E.g.:

       >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
       True
       >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
       True'''

    def __init__(self, ngram):
        '''Set up the n-gram base class and both delegate tokenizers.'''
        base.Tzer.__init__(self, ngram)
        self.tiny = tiny.Tzer(ngram)
        self.icu = ICU(ngram)

    def tokenize_real(self, text):
        '''Split text on whitespace and tokenize each chunk with the
           appropriate delegate; non-Latin, non-Japanese chunks yield
           nothing.'''
        result = []
        for chunk in text.split():
            if is_latin(chunk):
                result.extend(self.icu.tokenize(chunk))
            elif is_japanese(chunk):
                result.extend(self.tiny.tokenize(chunk))
        return result


# Manual-only doctest registration (presumably run on demand via the
# testable framework rather than at import time — TODO confirm semantics).
testable.manualonly_register(u'''

>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True

''')
示例#4
0
   u'''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
       Ignores everything else. E.g.:

       >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
       True
       >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
       True'''

   def __init__(self, ngram):
      '''Set up the n-gram base class and both delegate tokenizers.'''
      base.Tzer.__init__(self, ngram)
      self.tiny = tiny.Tzer(ngram)
      self.icu = ICU(ngram)

   def tokenize_real(self, text):
      '''Split text on whitespace and tokenize each chunk with the
         appropriate delegate: ICU for Latin, Tiny for Japanese. Chunks in
         any other script contribute no tokens.'''
      result = []
      for chunk in text.split():
         if is_latin(chunk):
            result.extend(self.icu.tokenize(chunk))
         elif is_japanese(chunk):
            result.extend(self.tiny.tokenize(chunk))
      return result


# Manual-only doctest registration (presumably run on demand via the
# testable framework rather than at import time — TODO confirm semantics).
testable.manualonly_register(u'''

>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True

''')
示例#5
0
文件: optimize.py 项目: bussiere/quac
                 for gmm in self.all_gmms])
      if self.verbose:
         for (fv,fi) in self.feature_alphabet.items():
            l.debug('feature weight %s=%g' % (fv,res.x[fi]))
         for (t,w) in di.items():
            l.debug('token weight %s=%s'%(t,str(w)))
      # clean up
      for g in self.all_gmms:
         g.feature_vector = None
      return di

# Manual-only doctest: checks that self.all_gmms keeps a stable order
# across repeated, identical constructions of Weight.
# Disabled for now (see issue #100).
testable.manualonly_register('''
>>> import random
>>> from . import gmm
>>> def test_random():
...   u.rand = random.Random(123)
...   gmm.Token.parms_init({})
...   mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...   m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...   m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...   m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...   m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...            [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...            misc_feature=False)
...   return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in range(100)))
True
''')
示例#6
0
文件: optimize.py 项目: subgiant/quac
        if self.verbose:
            for (fv, fi) in self.feature_alphabet.items():
                l.debug('feature weight %s=%g' % (fv, res.x[fi]))
            for (t, w) in di.items():
                l.debug('token weight %s=%s' % (t, str(w)))
        # clean up
        for g in self.all_gmms:
            g.feature_vector = None
        return di


# Manual-only doctest: checks that self.all_gmms keeps a stable order
# across repeated, identical constructions of Weight.
# Disabled for now (see issue #100).
testable.manualonly_register('''
>>> import random
>>> from . import gmm
>>> def test_random():
...   u.rand = random.Random(123)
...   gmm.Token.parms_init({})
...   mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...   m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...   m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...   m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...   m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...            [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...            misc_feature=False)
...   return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in range(100)))
True
''')