#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals

from collections import Counter

from noaho import NoAho  # multi-pattern (Aho-Corasick) string matching

trie = NoAho()
trie.add('hehe')
trie.add('py')
trie.add('python')
trie.compile()  # the trie must be compiled before any search

# Sample text; the Chinese lines read roughly "Who I am doesn't matter,
# what matters is that you learn python" and "Xiaomi Technology Co., Ltd."
txt = """
我是谁不重要,重要的是你要学会python,
hehe我是谁不重要,重要的是你要学会python
小米科技有限公司
"""

# Collect the longest non-overlapping matches and count each keyword.
words = [txt[start:end] for (start, end, _) in trie.findall_long(txt)]
wc = Counter(words)
for word, count in wc.items():
    print(word, count)
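# A minimal sketch (added for illustration, not in the original script) of
# NoAho's payload mechanism, which the test suite further below exercises:
# add() takes an optional second argument that comes back as the third
# element of each match tuple.
payload_trie = NoAho()
payload_trie.add('py', 'language-prefix')
payload_trie.add('python', 'language')
payload_trie.compile()
# find_long() prefers the longest key starting at the first match position,
# so 'python' wins over 'py' here.
start, end, payload = payload_trie.find_long('I use python daily')
print(start, end, payload)  # -> 6 12 language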
import csv
import cPickle
import logging
import re

from django.conf import settings  # settings.PROJECTION_SRID is project-defined
from django.contrib.gis.gdal import DataSource
from django.contrib.gis.geos import (GEOSGeometry, LinearRing, LineString,
                                     MultiPolygon, Point, Polygon)
from noaho import NoAho

# ETRS89 / GK25FIN, the plane coordinate system used by the City of Helsinki.
GK25_SRID = 3879


class AhjoGeocoder(object):
    PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
    PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.no_match_addresses = []
        self.no_match_plans = []
        self.no_match_plan_units = []
        self.plan_map = {}
        self.plan_unit_map = {}
        self.property_map = {}
        self.street_tree = None
        self.matches = 0

    def convert_from_gk25(self, north, east):
        pnt = Point(east, north, srid=GK25_SRID)
        pnt.transform(settings.PROJECTION_SRID)
        return pnt

    def geocode_address(self, text):
        if not self.street_tree:
            return {}
        STREET_SUFFIXES = ('katu', 'tie', 'kuja', 'polku', 'kaari', 'linja',
                           'raitti', 'rinne', 'penger', 'ranta', u'väylä')
        # Log street-like tokens that are missing from the address database.
        for sfx in STREET_SUFFIXES:
            m = re.search(r'([A-Z]\w+%s)\s+(\d+)' % sfx, text)
            if not m:
                continue
            street_name = m.groups()[0].lower()
            if street_name not in self.street_hash:
                print "Street name not found: %s" % street_name.encode('utf8')
                self.no_match_addresses.append('%s %s' % (m.groups()[0], m.groups()[1]))

        textl = text.lower()
        geometries = {}
        for street_match in self.street_tree.findall_long(textl):
            (start, end) = street_match[0:2]
            street_name = textl[start:end]
            # Check for the address number after the street name.
            m = re.match(r'\s*(\d+)', text[end:])
            if not m:
                #print "\tno address: %s" % text[start:]
                continue
            num = int(m.groups()[0])
            e_list = self.street_hash[street_name]
            for e in e_list:
                if num == e['num']:
                    break
                if e['num_end'] and e['num'] < num <= e['num_end']:
                    break
            else:
                self.logger.warning("No match found for '%s %d'" % (street_name, num))
                s = '%s %d' % (e['street'], num)
                if s not in self.no_match_addresses:
                    self.no_match_addresses.append(s)
                continue
            pnt = self.convert_from_gk25(e['coord_n'], e['coord_e'])
            geom = {'name': '%s %d' % (e['street'], num), 'geometry': pnt,
                    'type': 'address', 'text': text}
            geom_id = "%s/%s" % (geom['type'], geom['name'])
            geometries[geom_id] = geom
        return geometries

    def geocode_plan(self, plan_id):
        plan = self.plan_map.get(plan_id)
        if not plan:
            if plan_id not in self.no_match_plans:
                self.logger.warning("No plan found for plan id %s" % plan_id)
                self.no_match_plans.append(plan_id)
            return None
        return {'name': plan_id, 'geometry': plan['geometry'], 'type': 'plan'}

    def geocode_plan_unit(self, text, context):
        m = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        if m:
            # If there is more than one '/' character, it's not a plan unit.
            if text.count('/') > 1:
                return None
            block_id, unit_id, rest = m.groups()
            block_id = int(block_id)
            unit_id = int(unit_id)
            district_id = block_id // 1000
            block_id %= 1000
            # TODO: Code the logic to extract and use unit
            # ids from the rest of the match.
            # if rest:
            #     if rest[0].lower() in ('a', 'b', 'c', 'd', 'e'):
            #         rest = rest[1:]
            #     rest = rest.strip()
            #     if rest and rest[0] == '-':
            #         range_end = int(re.match('-\s?(\d)+', rest).groups()[0])
            #     elif rest.startswith('ja'):
            #         range_end = int(rest[2:])
            #     elif rest.lower().startswith('.a'):  # Ksv notation
            #         pass
            #     elif rest.startswith(':'):  # ???
            #         pass

            # Check for the '161/3.A' style: take the district id from a
            # '<number>.ko' line elsewhere in the document.
            if not district_id:
                for l in context['all_text']:
                    m = re.match(r'(\d+)\.ko', l, re.I)
                    if not m:
                        continue
                    district_id = int(m.groups()[0])
                    break
                if not district_id:
                    self.logger.warning("No district id found for '%s'" % text)
                    return None
        else:
            m = re.match(self.PLAN_UNIT_LONG_MATCH, text)
            district_id, block_id, unit_id = [int(x) for x in m.groups()[0:3]]
            rest = m.groups()[3]

        jhs_id = '091%03d%04d%04d' % (district_id, block_id, unit_id)
        name = '91-%d-%d-%d' % (district_id, block_id, unit_id)
        plan_unit = self.plan_unit_map.get(jhs_id, None)
        prop = self.property_map.get(jhs_id, None)
        if plan_unit:
            geometry = plan_unit['geometry']
        elif prop:
            geometry = prop['geometry']
        else:
            self.logger.warning("No geometry found for '%s'" % jhs_id)
            self.no_match_plan_units.append([text, jhs_id])
            return None
        self.matches += 1
        return {'name': name, 'type': 'plan_unit', 'geometry': geometry}

    def geocode_district(self, text):
        # Not implemented yet.
        return

    def geocode_from_text(self, text, context):
        text = text.strip()
        if not isinstance(text, unicode):
            text = unicode(text)
        geometries = {}
        # Check for plan unit IDs.
        m1 = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        m2 = re.match(self.PLAN_UNIT_LONG_MATCH, text)
        if m1 or m2:
            geom = self.geocode_plan_unit(text, context)
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
            return geometries
        # Check for plan IDs of the form '12345.P'.
        m = re.match(r'^(\d{3,5})\.[pP]$', text)
        if m:
            geom = self.geocode_plan(m.groups()[0])
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
        geometries.update(self.geocode_address(text))
        return geometries

    def geocode_from_text_list(self, text_list):
        geometries = {}
        context = {'all_text': text_list}
        for text in text_list:
            g = self.geocode_from_text(text, context)
            geometries.update(g)
        return [geom for geom_id, geom in geometries.iteritems()]

    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()  # skip the header row
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {'muni': muni_name, 'street': street, 'num': num,
                 'num_end': num2, 'letter': letter,
                 'coord_n': coord_n, 'coord_e': coord_e}
            # Decode before lowercasing so that non-ASCII characters
            # (e.g. 'Ä') are lowercased correctly.
            street = street.decode('utf8').lower()
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if (e['num'] == s['num'] and e['num_end'] == s['num_end']
                        and e['letter'] == s['letter']):
                    break
            else:
                num_list.append(e)
        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)
        self.street_tree.compile()  # NoAho requires compile() before searching

    def _load_mapinfo(self, ds, id_field_name, id_fixer=None):
        geom_map = {}
        lyr = ds[0]
        for idx, feat in enumerate(lyr):
            origin_id = feat[id_field_name].as_string().strip()
            if id_fixer:
                origin_id = id_fixer(origin_id)
            geom = feat.geom
            geom.srid = GK25_SRID
            geom.transform(settings.PROJECTION_SRID)
            if origin_id not in geom_map:
                plan = {'geometry': None}
                geom_map[origin_id] = plan
            else:
                plan = geom_map[origin_id]
            poly = GEOSGeometry(geom.wkb, srid=geom.srid)
            if isinstance(poly, LineString):
                try:
                    ring = LinearRing(poly.tuple)
                except Exception:
                    # If the LineString doesn't form a closed ring, skip it.
                    self.logger.error("Skipping plan %s, its LineString doesn't close." % origin_id)
                    continue
                poly = Polygon(ring)
            if plan['geometry']:
                if isinstance(plan['geometry'], Polygon):
                    plan['geometry'] = MultiPolygon(plan['geometry'])
                if isinstance(poly, MultiPolygon):
                    plan['geometry'].extend(poly)
                else:
                    plan['geometry'].append(poly)
            else:
                plan['geometry'] = poly
        for key, e in geom_map.items():
            geom = e['geometry']
            if not geom.valid:
                self.logger.warning("geometry for %s not OK, fixing" % key)
                geom = geom.simplify()
                assert geom.valid
            e['geometry'] = geom
        return geom_map

    def load_plans(self, plan_file, in_effect):
        if getattr(self, 'all_plans_loaded', False):
            return
        if not in_effect:
            # Okay, this is hacky!
            try:
                # Pickle files must be opened in binary mode.
                picklef = open('plans.pickle', 'rb')
                self.plan_map = cPickle.load(picklef)
                self.all_plans_loaded = True
                print "%d pickled plans loaded" % len(self.plan_map)
                return
            except IOError:
                pass
        ds = DataSource(plan_file, encoding='iso8859-1')
        plan_map = self._load_mapinfo(ds, 'kaavatunnus')
        print "%d plans imported" % len(plan_map)
        self.plan_map.update(plan_map)
        if in_effect:
            picklef = open('plans.pickle', 'wb')
            cPickle.dump(self.plan_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_plan_units(self, plan_unit_file):
        try:
            picklef = open('plan_units.pickle', 'rb')
            self.plan_unit_map = cPickle.load(picklef)
            print "%d plan units loaded" % len(self.plan_unit_map)
            return
        except IOError:
            pass
        ds = DataSource(plan_unit_file, encoding='iso8859-1')
        self.plan_unit_map = self._load_mapinfo(ds, 'jhstunnus')
        print "%d plan units imported" % len(self.plan_unit_map)
        picklef = open('plan_units.pickle', 'wb')
        cPickle.dump(self.plan_unit_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_properties(self, property_file):
        try:
            picklef = open('geo_properties.pickle', 'rb')
            self.property_map = cPickle.load(picklef)
            print "%d properties loaded" % len(self.property_map)
            return
        except IOError:
            pass

        def fix_property_id(s):
            # Property ids in the source data may lack the leading zero of
            # the municipality code ('091').
            if s[0] != '0':
                return '0' + s
            return s

        ds = DataSource(property_file, encoding='iso8859-1')
        self.property_map = self._load_mapinfo(ds, 'Kiinteistotunnus',
                                               id_fixer=fix_property_id)
        print "%d properties imported" % len(self.property_map)
        picklef = open('geo_properties.pickle', 'wb')
        cPickle.dump(self.property_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)
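# A minimal, self-contained sketch (added for illustration, not part of the
# original module) of how the two plan-unit patterns above decompose
# identifiers; the sample strings are hypothetical.
from __future__ import print_function
import re

PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'

# Short form: district and block packed into one number, unit after the slash.
m = re.match(PLAN_UNIT_SHORT_MATCH, '45123/8')
block_id, unit_id = int(m.group(1)), int(m.group(2))
print(block_id // 1000, block_id % 1000, unit_id)  # -> 45 123 8

# Long form: municipality (91 = Helsinki), district, block and unit spelled out.
m = re.match(PLAN_UNIT_LONG_MATCH, '091-45-123-8')
print(tuple(int(x) for x in m.groups()[:3]))  # -> (45, 123, 8)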
import unittest

from noaho import NoAho


class AhoCorasickTest(unittest.TestCase):
    def setUp(self):
        self.tree = NoAho()

    def tearDown(self):
        self.tree = None

    def test_compile_before_use(self):
        self.tree.add('bar')
        self.assertRaises(AssertionError,
                          lambda: self.tree.find_short('xxxbaryyy'))
        self.tree.compile()
        self.tree.find_short('xxxbaryyy')
        self.assertRaises(AssertionError, lambda: self.tree.add('foo'))

    def test_keyword_as_prefix_of_another(self):
        """According to John, there's a problem with the matcher.
        This test case should expose the bug."""
        self.tree.add('foobar')
        self.tree.add('foo')
        self.tree.add('bar')
        self.tree.compile()
        self.assertEqual((3, 6, None), self.tree.find_short('xxxfooyyy'))
        self.assertEqual((0, 3, None), self.tree.find_short('foo'))
        self.assertEqual((3, 6, None), self.tree.find_short('xxxbaryyy'))

    def test_another_find(self):
        """Just to triangulate the search code.  We want to make sure
        that the implementation can do more than one search, at least."""
        self.tree.add("Python")
        self.tree.add("PLT Scheme")
        self.tree.compile()
        self.assertEqual((19, 25, None), self.tree.find_short(
            "I am learning both Python and PLT Scheme"))
        self.assertEqual((0, 10, None), self.tree.find_short(
            "PLT Scheme is an interesting language."))

    def test_simple_construction(self):
        self.tree.add("foo")
        self.tree.add("bar")
        self.tree.compile()
        self.assertEqual((10, 13, None),
                         self.tree.find_short("this is a foo message"))
        self.assertEqual(self.tree.children_count(), 6)

    def test_find_longest(self):
        self.tree.add("a")
        self.tree.add("alphabet")
        self.tree.compile()
        self.assertEqual((0, 1, None), self.tree.find_short("alphabet soup"))
        self.assertEqual((0, 8, None), self.tree.find_long("alphabet soup"))
        self.assertEqual((13, 14, None), self.tree.find_long(
            "yummy, I see an alphabet soup bowl"))

    def test_find_with_whole_match(self):
        """Make sure that shortest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_short(longString))

    def test_find_longest_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_long(longString))

    def test_find_longest_with_no_match(self):
        self.tree.add("foobar")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_long("fooba"))

    def test_with_expected_non_match(self):
        """Check to see that we don't always get a successful match."""
        self.tree.add("wise man")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_short(
            "where fools and wise men fear to tread"))

    def test_reject_empty_key(self):
        self.assertRaises(ValueError, self.tree.add, "")

    def test_empty_construction(self):
        """Make sure that we can safely construct and dealloc a tree
        with no initial keywords.  Important because the C
        implementation assumes keywords exist on its dealloc, so we
        have to do some work on the back end to avoid silly
        segmentation errors."""
        tree = NoAho()
        del tree

    def test_embedded_nulls(self):
        """Check to see if we can accept embedded nulls."""
        self.tree.add("hell\0 world")
        self.tree.compile()
        self.assertEqual((None, None, None),
                         self.tree.find_short("ello\0 world"))
        self.assertEqual((0, 11, None), self.tree.find_short("hell\0 world"))

    def test_embedded_nulls_again(self):
        self.tree.add("\0\0\0")
        self.tree.compile()
        self.assertEqual((0, 3, None), self.tree.find_short("\0\0\0\0\0\0\0\0"))

    def test_findall_and_findall_longest(self):
        self.tree.add("python")
        self.tree.add("perl")
        self.tree.add("scheme")
        self.tree.add("java")
        self.tree.add("pythonperl")
        self.tree.compile()
        self.assertEqual(
            [(0, 6, None), (6, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_short("pythonperlschemejava")))
        self.assertEqual(
            [(0, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_long("pythonperlschemejava")))
        self.assertEqual([], list(self.tree.findall_short("no pascal here")))
        self.assertEqual([], list(self.tree.findall_long("no pascal here")))

    def test_bug2_competing_longests(self):
        """Previously we'd return the /last/ key found; now we look
        forward while there are contiguous candidate keys, and
        actually return the longest."""
        self.tree.add('cisco', 'cisco')
        self.tree.add('em', 'em')
        self.tree.add('cisco systems australia', 'cisco systems')
        self.tree.compile()
        self.assertEqual([(0, 5, 'cisco'), (10, 12, 'em')],
                         list(self.tree.findall_long('cisco systems')))

    def test_bug3_false_terminal_nodes(self):
        self.tree.add('an', None)
        self.tree.add('canal', None)
        self.tree.add('e can oilfield', None)
        self.tree.compile()
        self.assertEqual([(4, 4 + 5, None)],
                         list(self.tree.findall_long('one canal')))

    def test_payload(self):
        class RandomClass(object):
            def __init__(self):
                pass
        obj = RandomClass()
        self.tree.add("python", "yes-python")
        self.tree.add("perl", "")
        self.tree.add("scheme", None)
        self.tree.add("lisp", [1, 2, 3])
        self.tree.add("C++")  # no payload, comes out None
        self.tree.add("dylan", obj)
        self.tree.compile()
        self.assertEqual((0, 6, "yes-python"), self.tree.find_short("python"))
        self.assertEqual((0, 4, ""), self.tree.find_short("perl"))
        self.assertEqual((0, 6, None), self.tree.find_short("scheme"))
        self.assertEqual((0, 4, [1, 2, 3]), self.tree.find_short("lisp"))
        self.assertEqual((0, 3, None), self.tree.find_short("C++"))
        self.assertEqual((0, 5, obj), self.tree.find_short("dylan"))

    def test_dict_style_get_and_set(self):
        self.tree['foo'] = 5
        self.assertEqual(5, self.tree['foo'])

    def test_dict_style_set_empty_key(self):
        # equivalent to self.tree[''] = None;
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, '', None)

    def test_dict_style_set_nonstring_key(self):
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, 6, None)
        self.assertRaises(ValueError, self.tree.__setitem__, None, None)
        self.assertRaises(ValueError, self.tree.__setitem__, [], None)

    def test_dict_style_get_unseen_key(self):
        # __getitem__ implements this part of the [] protocol
        self.assertRaises(KeyError, self.tree.__getitem__, 'unseen')
        self.assertRaises(KeyError, self.tree.__getitem__, '')

    def test_dict_style_containment(self):
        self.tree['foo'] = 5
        self.assertEqual(True, 'foo' in self.tree)
        self.assertEqual(False, '' in self.tree)
        self.assertEqual(False, 'fo' in self.tree)
        self.assertEqual(False, 'o' in self.tree)
        self.assertEqual(False, 'oo' in self.tree)
        self.assertEqual(False, 'f' in self.tree)

    def test_dict_style_len(self):
        self.tree['a'] = None
        self.tree['b'] = [1, 2]
        self.tree['c'] = 12
        self.assertEqual(3, len(self.tree))

    # reminder that we need to figure out which version we're in, and
    # test Python 2 unicode explicitly
    @unittest.expectedFailure
    def test_unicode_in_python2(self):
        self.assertEqual(True, False)

    # key iteration is unimplemented
    @unittest.expectedFailure
    def test_iteration(self):
        self.tree.add("Harry")
        self.tree.add("Hermione")
        self.tree.add("Ron")
        self.assertEqual(set(["Harry", "Hermione", "Ron"]),
                         set(self.tree.keys()))

    # reminder that we need to implement subset matches in findall_short
    @unittest.expectedFailure
    def test_subset(self):
        self.tree.add("he")
        self.tree.add("hers")
        self.assertEqual([(0, 2, None), (0, 4, None)],
                         list(self.tree.findall_short("hers")))


if __name__ == '__main__':
    unittest.main()
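# A small runnable sketch (not part of the test module) contrasting the two
# search modes the tests above exercise: findall_short() reports a key as
# soon as it ends, while findall_long() keeps extending through contiguous
# candidates and reports the longest match.
from __future__ import print_function
from noaho import NoAho

tree = NoAho()
tree.add('python')
tree.add('perl')
tree.add('pythonperl')
tree.compile()
print(list(tree.findall_short('pythonperl')))  # [(0, 6, None), (6, 10, None)]
print(list(tree.findall_long('pythonperl')))   # [(0, 10, None)]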
import logging
import resource

import diff_match_patch as dmp_module
from noaho import NoAho

# NOTE: clean_html(), clean() and insert() are project-local helpers (HTML
# stripping, content cleaning and action extraction); they are assumed to be
# importable from the surrounding package.


class Conversation_Constructor:
    def __init__(self):
        # Deleted comments with fewer tokens than the lower bound will not be
        # recorded, and thus not considered in comment restoration actions,
        # to reduce confusion.
        self.COMMENT_LOWERBOUND = 10
        self.COMMENT_UPPERBOUND = 1000
        self.deleted_records = {}

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        return page

    def load(self, deleted_comments):
        """Load the previous page state, deleted comments and other
        information."""
        self.deleted_records = {}
        self.previous_comments = NoAho()
        for pair in deleted_comments:
            self.previous_comments.add(pair[0], (pair[1], int(pair[2])))
            self.deleted_records[pair[1]] = True
        return

    def convert_diff_format(self, x, a, b):
        ret = x
        if x['name'] == 'insert':
            ret['tokens'] = b[x['b1']:x['b2']]
        if x['name'] == 'delete':
            ret['tokens'] = a[x['a1']:x['a2']]
        return ret

    def mydiff_toDelta(self, diffs):
        """Crush the diff into a list of dictionaries indicating the changes
        from one document to another. Each operation is a dictionary record
        with a name (insert, delete, equal) and offsets into the original
        text and the resulting text.

        Args:
          diffs: Array of diff tuples.
        Returns:
          Deltas.
        """
        a = 0
        b = 0
        DIFF_DELETE = -1
        DIFF_INSERT = 1
        DIFF_EQUAL = 0
        for (op, data) in diffs:
            if op == DIFF_INSERT:
                yield {"name": "insert", "a1": a, "a2": a,
                       "b1": b, "b2": b + len(data)}
                b += len(data)
            elif op == DIFF_DELETE:
                yield {"name": "delete", "a1": a, "a2": a + len(data),
                       "b1": b, "b2": b}
                a += len(data)
            elif op == DIFF_EQUAL:
                yield {"name": "equal", "a1": a, "a2": a + len(data),
                       "b1": b, "b2": b + len(data)}
                a += len(data)
                b += len(data)

    def clean_dict(self, page, the_dict):
        """We only store the information of actions that are currently
        'alive'. An action is alive if:
          - it was a deletion that happened recently, and hence might be
            restored later, or
          - it is still present on the page, and hence might be
            modified/removed/replied to.
        """
        keylist = list(the_dict.keys())
        ret = the_dict
        alive_actions = set(action[0] for action in page['actions'].values())
        for action in keylist:
            if not (action in alive_actions or action in self.deleted_records):
                del ret[action]
        return ret

    def process(self, page_state, latest_content, rev):
        logging.debug("DEBUGGING MODE on REVISION %s" % rev['rev_id'])
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE ANYTHING: %d KB." % memory_usage)

        # Clean the HTML format of the revision.
        rev['text'] = clean_html(rev['text'])
        # Compute the diff between the latest processed revision and the
        # current one.
        dmp = dmp_module.diff_match_patch()
        logging.debug("LENGTH: %d -> %d" % (len(latest_content), len(rev['text'])))
        diff = dmp.diff_main(latest_content, rev['text'], False)
        dmp.diff_cleanupSemantic(diff)
        delta = self.mydiff_toDelta(diff)
        rev['diff'] = sorted([self.convert_diff_format(x, latest_content, rev['text'])
                              for x in delta], key=lambda k: k['a1'])

        # Create a new page if this page was never processed before.
        if not page_state:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
            page_state = {'rev_id': int(rev['rev_id']),
                          'timestamp': rev['timestamp'],
                          'page_id': rev['page_id'],
                          'deleted_comments': [],
                          'conversation_id': {},
                          'authors': {},
                          'ancestor_id': {}}
        else:
            page_state['rev_id'] = int(rev['rev_id'])
            page_state['timestamp'] = rev['timestamp']
            old_page = page_state['page_state']

        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE PROCESSING: %d KB." % memory_usage)

        # Process the revision to get the actions and update the page state.
        actions, updated_page = insert(rev, old_page, self.previous_comments,
                                       self.COMMENT_LOWERBOUND)
        page_state['page_state'] = updated_page
        # Post-process the actions:
        for action in actions:
            # If the action adds new content:
            # - locate the conversation it belongs to,
            # - record the author's name in the comment's author list.
            if action['type'] in ('ADDITION', 'MODIFICATION', 'CREATION'):
                if action['replyTo_id'] is None:
                    page_state['conversation_id'][action['id']] = action['id']
                else:
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['replyTo_id']]
                if action['type'] == 'MODIFICATION':
                    page_state['authors'][action['id']] = \
                        set(page_state['authors'][action['parent_id']])
                    page_state['authors'][action['id']].add(
                        (action['user_id'], action['user_text']))
                    page_state['ancestor_id'][action['id']] = \
                        page_state['ancestor_id'][action['parent_id']]
                else:
                    page_state['authors'][action['id']] = \
                        set([(action['user_id'], action['user_text'])])
                    page_state['ancestor_id'][action['id']] = action['id']
            else:
                page_state['authors'][action['id']] = \
                    set(page_state['authors'][action['parent_id']])
                page_state['ancestor_id'][action['id']] = \
                    page_state['ancestor_id'][action['parent_id']]
                # Removed and restored comments are considered to belong to
                # the same conversation as their original version.
                if action['type'] == 'DELETION':
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['parent_id']]
                if action['type'] == 'RESTORATION':
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['parent_id']]
            action['conversation_id'] = page_state['conversation_id'][action['id']]
            action['authors'] = list(page_state['authors'][action['id']])
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            action['cleaned_content'] = clean(action['content'])
            action['ancestor_id'] = page_state['ancestor_id'][action['id']]
            # If a comment is deleted, add it to a list used for identifying
            # restoration actions later. Note that comments deleted more than
            # two weeks ago are removed from the list to keep memory usage
            # bounded; comments that are too long or too short are also
            # ignored here.
            if (action['type'] == 'DELETION' and
                    len(action['content']) > self.COMMENT_LOWERBOUND and
                    len(action['content']) < self.COMMENT_UPPERBOUND):
                page_state['deleted_comments'].append(
                    (''.join(action['content']), action['parent_id'],
                     action['indentation']))
                self.deleted_records[action['parent_id']] = True
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))

        page_state['conversation_id'] = self.clean_dict(
            updated_page, page_state['conversation_id'])
        page_state['authors'] = self.clean_dict(updated_page,
                                                page_state['authors'])
        # Sets are not JSON serializable, so convert the author sets to lists.
        page_state['authors'] = {
            action_id: list(authors)
            for action_id, authors in page_state['authors'].items()
        }

        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE AFTER POSTPROCESSING: %d KB." % memory_usage)
        return page_state, actions, rev['text']
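# A small self-contained sketch (added for illustration, not part of the
# original class) of the delta format mydiff_toDelta() produces, driving it
# with diff_match_patch directly; the sample strings are made up.
from __future__ import print_function
import diff_match_patch as dmp_module

dmp = dmp_module.diff_match_patch()
diff = dmp.diff_main('the quick fox', 'the slow fox', False)
dmp.diff_cleanupSemantic(diff)
for op in Conversation_Constructor().mydiff_toDelta(diff):
    print(op)
# Expected output, roughly:
#   {'name': 'equal',  'a1': 0, 'a2': 4,  'b1': 0, 'b2': 4}
#   {'name': 'delete', 'a1': 4, 'a2': 9,  'b1': 4, 'b2': 4}
#   {'name': 'insert', 'a1': 9, 'a2': 9,  'b1': 4, 'b2': 8}
#   {'name': 'equal',  'a1': 9, 'a2': 13, 'b1': 8, 'b2': 12}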