#!/usr/bin/env python
# -*- coding:utf-8 -*-

from __future__ import unicode_literals
from noaho import NoAho    # multi-pattern matching
from collections import Counter, defaultdict
trie = NoAho()
trie.add('hehe')
trie.add('py')
trie.add('python')


txt = """
我是谁不重要,重要的是你要学会python, hehe我是谁不重要,重要的是你要学会python
小米科技有限公司
"""

trie.compile()   # the trie must be compiled before searching (cf. the tests below)

c = defaultdict(int)
words = [txt[k[0]:k[1]] for k in trie.findall_long(txt)]
wc = Counter(words)

for k in trie.findall_long(txt):
    word = txt[k[0]:k[1]]
    c[word] += 1
    #print(k)
    print(txt[k[0]:k[1]])


for k, v in wc.items():
    print k, v
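
# A minimal sketch (not part of the original example): add() also takes an
# optional payload, and findall_long() returns it as the third element of each
# (start, end, payload) match tuple.
trie2 = NoAho()
trie2.add('python', 'language')
trie2.add('hehe', 'laughter')
trie2.compile()
for start, end, payload in trie2.findall_long(txt):
    print('%s -> %s' % (txt[start:end], payload))
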
Example #2
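
# Imports this excerpt appears to rely on (reconstructed assumption: a GeoDjango
# project where PROJECTION_SRID comes from Django settings; GK25_SRID is the
# ETRS-GK25 coordinate system, EPSG:3879).
import csv
import logging
import re
import cPickle
from django.conf import settings
from django.contrib.gis.gdal import DataSource
from django.contrib.gis.geos import (GEOSGeometry, LinearRing, LineString,
                                     MultiPolygon, Point, Polygon)
from noaho import NoAho

GK25_SRID = 3879
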
class AhjoGeocoder(object):
    PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
    PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'
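    # Hypothetical examples of the two notations: the short form looks like
    # '161/3' (cf. the '161/3.A' style handled below), the long form like
    # '91-1-161-3' (municipality 91, then district, block and unit).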

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.no_match_addresses = []
        self.no_match_plans = []
        self.no_match_plan_units = []
        self.plan_map = {}
        self.plan_unit_map = {}
        self.property_map = {}
        self.street_tree = None
        self.matches = 0

    def convert_from_gk25(self, north, east):
        pnt = Point(east, north, srid=GK25_SRID)
        pnt.transform(settings.PROJECTION_SRID)
        return pnt

    def geocode_address(self, text):
        if not self.street_tree:
            return {}

        STREET_SUFFIXES = ('katu', 'tie', 'kuja', 'polku', 'kaari', 'linja', 'raitti', 'rinne', 'penger', 'ranta', u'väylä')
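        # For example (hypothetical input): with sfx == 'tie' the pattern becomes
        # r'([A-Z]\w+tie)\s+(\d+)', which would match 'Mannerheimintie 13' and
        # capture the street name and house number.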
        for sfx in STREET_SUFFIXES:
            m = re.search(r'([A-Z]\w+%s)\s+(\d+)' % sfx, text)
            if not m:
                continue
            street_name = m.groups()[0].lower()
            if street_name not in self.street_hash:
                print "Street name not found: %s" % street_name.encode('utf8')
                self.no_match_addresses.append('%s %s' % (m.groups()[0], m.groups()[1]))
        textl = text.lower()
        ret = [x for x in self.street_tree.findall_long(textl)]
        geometries = {}
        for street_match in ret:
            (start, end) = street_match[0:2]
            street_name = textl[start:end]
            # check for the address number
            m = re.match(r'\s*(\d+)', text[end:])
            if not m:
                #print "\tno address: %s" % text[start:]
                continue
            num = int(m.groups()[0])

            e_list = self.street_hash[street_name]
            for e in e_list:
                if num == e['num']:
                    break
                if e['num_end'] and e['num'] < num <= e['num_end']:
                    break
            else:
                self.logger.warning("No match found for '%s %d'" % (street_name, num))
                s = '%s %d' % (e['street'], num)
                if s not in self.no_match_addresses:
                    self.no_match_addresses.append(s)
                continue

            pnt = self.convert_from_gk25(e['coord_n'], e['coord_e'])
            geom = {'name': '%s %d' % (e['street'], num), 'geometry': pnt,
                    'type': 'address', 'text': text}
            geom_id = "%s/%s" % (geom['type'], geom['name'])
            geometries[geom_id] = geom
        return geometries

    def geocode_plan(self, plan_id):
        plan = self.plan_map.get(plan_id)
        if not plan:
            if plan_id not in self.no_match_plans:
                self.logger.warning("No plan found for plan id %s" % plan_id)
                self.no_match_plans.append(plan_id)
            return
        return {'name': plan_id, 'geometry': plan['geometry'], 'type': 'plan'}

    def geocode_plan_unit(self, text, context):
        # If there is more than one '/' character, it's not a plan unit
        m = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        if m:
            if text.count('/') > 1:
                return None
            block_id, unit_id, rest = m.groups()
            block_id = int(block_id)
            unit_id = int(unit_id)
            district_id = block_id // 1000
            block_id %= 1000
            # TODO: Code the logic to extract and use unit
            #       ids from the rest of the match.
            # if rest:
            #     if rest[0].lower() in ('a', 'b', 'c', 'd', 'e'):
            #         rest = rest[1:]
            #     rest = rest.strip()
            #     if rest and rest[0] == '-':
            #         range_end = int(re.match('-\s?(\d)+', rest).groups()[0])
            #     elif rest.startswith('ja'):
            #         range_end = int(rest[2:])
            #     elif rest.lower().startswith('.a'): # Ksv notation
            #         pass
            #     elif rest.startswith(':'): # ???
            #         pass
            # check for '161/3.A' style
            if not district_id:
                for l in context['all_text']:
                    m = re.match(r'(\d+)\.ko', l, re.I)
                    if not m:
                        continue
                    district_id = int(m.groups()[0])
                    break
                if not district_id:
                    self.logger.warning("No district id found for '%s'" % text)
                    return None
        else:
            m = re.match(self.PLAN_UNIT_LONG_MATCH, text)
            district_id, block_id, unit_id = [int(x) for x in m.groups()[0:3]]
            rest = m.groups()[3]

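        # Build the 14-digit JHS property identifier; the leading '091' is the
        # Helsinki municipality code (matching the '91-' in the long notation).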
        jhs_id = '091%03d%04d%04d' % (district_id, block_id, unit_id)
        name = '91-%d-%d-%d' % (district_id, block_id, unit_id)
        plan_unit = self.plan_unit_map.get(jhs_id, None)
        prop = self.property_map.get(jhs_id, None)
        geometry = None
        if plan_unit:
            geometry = plan_unit['geometry']
        elif prop:
            geometry = prop['geometry']
        else:
            print("No geometry found for '%s'" % jhs_id)
            self.logger.warning("No geometry found for '%s'" % jhs_id)
            self.no_match_plan_units.append([text, jhs_id])
            return None

        self.matches += 1
        return {'name': name, 'type': 'plan_unit', 'geometry': geometry}

    def geocode_district(self, text):
        return

    def geocode_from_text(self, text, context):
        text = text.strip()
        if not isinstance(text, unicode):
            text = unicode(text)

        geometries = {}

        # Check for plan unit IDs
        m1 = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        m2 = re.match(self.PLAN_UNIT_LONG_MATCH, text)
        if m1 or m2:
            geom = self.geocode_plan_unit(text, context)
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
            return geometries

        m = re.match(r'^(\d{3,5})\.[pP]$', text)
        if m:
            geom = self.geocode_plan(m.groups()[0])
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom

        geometries.update(self.geocode_address(text))

        return geometries

    def geocode_from_text_list(self, text_list):
        geometries = {}
        context = {'all_text': text_list}
        for text in text_list:
            g = self.geocode_from_text(text, context)
            geometries.update(g)
        return [geom for geom_id, geom in geometries.iteritems()]

    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {'muni': muni_name, 'street': street, 'num': num, 'num_end': num2,
                 'letter': letter, 'coord_n': coord_n, 'coord_e': coord_e}
            street = street.lower().decode('utf8')
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] and e['letter'] == s['letter']:
                    break
            else:
                num_list.append(e)

        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)

    def _load_mapinfo(self, ds, id_field_name, id_fixer=None):
        geom_map = {}
        lyr = ds[0]
        for idx, feat in enumerate(lyr):
            origin_id = feat[id_field_name].as_string().strip()
            if id_fixer:
                origin_id = id_fixer(origin_id)
            geom = feat.geom
            geom.srid = GK25_SRID
            geom.transform(settings.PROJECTION_SRID)
            if origin_id not in geom_map:
                plan = {'geometry': None}
                geom_map[origin_id] = plan
            else:
                plan = geom_map[origin_id]
            poly = GEOSGeometry(geom.wkb, srid=geom.srid)
            if isinstance(poly, LineString):
                try:
                    ring = LinearRing(poly.tuple)
                except Exception:
                    self.logger.error("Skipping plan %s, its LineString doesn't close." % origin_id)
                    # if the LineString doesn't form a polygon, skip it.
                    continue
                poly = Polygon(ring)
            if plan['geometry']:
                if isinstance(plan['geometry'], Polygon):
                    plan['geometry'] = MultiPolygon(plan['geometry'])
                if isinstance(poly, MultiPolygon):
                    plan['geometry'].extend(poly)
                else:
                    plan['geometry'].append(poly)
            else:
                plan['geometry'] = poly

        for key, e in geom_map.items():
            geom = e['geometry']
            if not geom.valid:
                self.logger.warning("geometry for %s not OK, fixing" % key)
                geom = geom.simplify()
                assert geom.valid
                e['geometry'] = geom
        return geom_map

    def load_plans(self, plan_file, in_effect):
        if getattr(self, 'all_plans_loaded', False):
            return
        if not in_effect: # Okay, this is hacky!
            try:
                picklef = open('plans.pickle', 'r')
                self.plan_map = cPickle.load(picklef)
                self.all_plans_loaded = True
                print "%d pickled plans loaded" % len(self.plan_map)
                return
            except IOError:
                pass

        ds = DataSource(plan_file, encoding='iso8859-1')

        plan_map = self._load_mapinfo(ds, 'kaavatunnus')
        print "%d plans imported" % len(plan_map)
        self.plan_map.update(plan_map)

        if in_effect:
            picklef = open('plans.pickle', 'w')
            cPickle.dump(self.plan_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_plan_units(self, plan_unit_file):
        try:
            picklef = open('plan_units.pickle', 'r')
            self.plan_unit_map = cPickle.load(picklef)
            print "%d plan units loaded" % len(self.plan_unit_map)
            return
        except IOError:
            pass

        ds = DataSource(plan_unit_file, encoding='iso8859-1')

        self.plan_unit_map = self._load_mapinfo(ds, 'jhstunnus')

        print "%d plan units imported" % len(self.plan_unit_map)

        picklef = open('plan_units.pickle', 'w')
        cPickle.dump(self.plan_unit_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_properties(self, property_file):
        try:
            picklef = open('geo_properties.pickle', 'r')
            self.property_map = cPickle.load(picklef)
            print "%d properties loaded" % len(self.property_map)
            return
        except IOError:
            pass

        def fix_property_id(s):
            if s[0] != '0':
                return '0' + s
            return s

        ds = DataSource(property_file, encoding='iso8859-1')

        self.property_map = self._load_mapinfo(ds, 'Kiinteistotunnus', id_fixer=fix_property_id)

        print "%d properties imported" % len(self.property_map)

        picklef = open('geo_properties.pickle', 'w')
        cPickle.dump(self.property_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)
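
# A minimal usage sketch (not from the original; the file names below are
# placeholders): load the reference data sets, then geocode the free-text
# references found in a document.
geocoder = AhjoGeocoder()
with open('addresses.csv', 'rb') as f:            # hypothetical path
    geocoder.load_address_database(f)
geocoder.load_plan_units('plan_units.tab')        # hypothetical path
geocoder.load_properties('properties.tab')        # hypothetical path
geoms = geocoder.geocode_from_text_list(['Mannerheimintie 13', '91-1-161-3'])
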
Example #3
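# Imports assumed by this test excerpt.
import unittest
from noaho import NoAho
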
class AhoCorasickTest(unittest.TestCase):
    def setUp(self):
        self.tree = NoAho()

    def tearDown(self):
        self.tree = None

    def test_compile_before_use(self):
        self.tree.add('bar')
        self.assertRaises(AssertionError,
                          lambda: self.tree.find_short('xxxbaryyy'))
        self.tree.compile()
        self.tree.find_short('xxxbaryyy')
        self.assertRaises(AssertionError, lambda: self.tree.add('foo'))

    def test_keyword_as_prefix_of_another(self):
        """According to John, there's a problem with the matcher.
        This test case should expose the bug."""
        self.tree.add('foobar')
        self.tree.add('foo')
        self.tree.add('bar')
        self.tree.compile()
        self.assertEqual((3, 6, None), self.tree.find_short('xxxfooyyy'))
        self.assertEqual((0, 3, None), self.tree.find_short('foo'))
        self.assertEqual((3, 6, None), self.tree.find_short('xxxbaryyy'))

    def test_another_find(self):
        """Just to triangulate the search code.  We want to make sure
        that the implementation can do more than one search, at
        least."""
        self.tree.add("Python")
        self.tree.add("PLT Scheme")
        self.tree.compile()
        self.assertEqual((19, 25, None), self.tree.find_short(
            "I am learning both Python and PLT Scheme"))
        self.assertEqual((0, 10, None), self.tree.find_short(
            "PLT Scheme is an interesting language."))

    def test_simple_construction(self):
        self.tree.add("foo")
        self.tree.add("bar")
        self.tree.compile()
        self.assertEqual((10, 13, None),
                         self.tree.find_short("this is a foo message"))
        self.assertEqual(self.tree.children_count(), 6)

    def test_find_longest(self):
        self.tree.add("a")
        self.tree.add("alphabet")
        self.tree.compile()
        self.assertEqual((0, 1, None), self.tree.find_short("alphabet soup"))
        self.assertEqual((0, 8, None), self.tree.find_long("alphabet soup"))
        self.assertEqual((13, 14, None), self.tree.find_long(
            "yummy, I see an alphabet soup bowl"))

    def test_find_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_short(longString))

    def test_find_longest_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_long(longString))

    def test_find_longest_with_no_match(self):
        self.tree.add("foobar")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_long("fooba"))

    def test_with_expected_non_match(self):
        """Check to see that we don't always get a successful match."""
        self.tree.add("wise man")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_short(
            "where fools and wise men fear to tread"))

    def test_reject_empty_key(self):
        self.assertRaises(ValueError, self.tree.add, "")

    def test_empty_construction(self):
        """Make sure that we can safely construct and dealloc a tree
        with no initial keywords.  Important because the C
        implementation assumes keywords exist on its dealloc, so we
        have to do some work on the back end to avoid silly segmentation
        errors."""
        tree = NoAho()
        del tree

    def test_embedded_nulls(self):
        """Check to see if we can accept embedded nulls"""
        self.tree.add("hell\0 world")
        self.tree.compile()
        self.assertEqual((None, None, None),
                         self.tree.find_short("ello\0 world"))
        self.assertEqual((0, 11, None), self.tree.find_short("hell\0 world"))

    def test_embedded_nulls_again(self):
        self.tree.add("\0\0\0")
        self.tree.compile()
        self.assertEqual((0, 3, None),
                         self.tree.find_short("\0\0\0\0\0\0\0\0"))

    def test_findall_and_findall_longest(self):
        self.tree.add("python")
        self.tree.add("perl")
        self.tree.add("scheme")
        self.tree.add("java")
        self.tree.add("pythonperl")
        self.tree.compile()
        self.assertEqual(
            [(0, 6, None), (6, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_short("pythonperlschemejava")))
        self.assertEqual([(0, 10, None), (10, 16, None), (16, 20, None)],
                         list(self.tree.findall_long("pythonperlschemejava")))
        self.assertEqual([],
                         list(self.tree.findall_short("no pascal here")))
        self.assertEqual([],
                         list(self.tree.findall_long("no pascal here")))

    def test_bug2_competing_longests(self):
        """Previously we'd return the /last/ key found, now we look forward
        while there are contiguous candidate keys, and actually return the
        longest.
        """
        self.tree.add('cisco', 'cisco')
        self.tree.add('em', 'em')
        self.tree.add('cisco systems australia', 'cisco systems')
        self.tree.compile()
        self.assertEqual([(0, 5, 'cisco'), (10, 12, 'em')],
                         list(self.tree.findall_long('cisco systems')))

    def test_bug3_false_terminal_nodes(self):
        self.tree.add('an', None)
        self.tree.add('canal', None)
        self.tree.add('e can oilfield', None)
        self.tree.compile()
        self.assertEqual([(4, 4+5, None)],
                         list(self.tree.findall_long('one canal')))

    def test_payload(self):
        class RandomClass(object):
            def __init__(self):
                pass
        obj = RandomClass()
        self.tree.add("python", "yes-python")
        self.tree.add("perl", "")
        self.tree.add("scheme", None)
        self.tree.add("lisp", [1, 2, 3])
        # no payload, comes out None
        self.tree.add("C++")
        self.tree.add("dylan", obj)
        self.tree.compile()

        self.assertEqual((0, 6, "yes-python"), self.tree.find_short("python"))
        self.assertEqual((0, 4, ""), self.tree.find_short("perl"))
        self.assertEqual((0, 6, None), self.tree.find_short("scheme"))
        self.assertEqual((0, 4, [1, 2, 3]), self.tree.find_short("lisp"))
        self.assertEqual((0, 3, None), self.tree.find_short("C++"))
        self.assertEqual((0, 5, obj), self.tree.find_short("dylan"))

    def test_dict_style_get_and_set(self):
        self.tree['foo'] = 5
        self.assertEqual(5, self.tree['foo'])

    def test_dict_style_set_empty_key(self):
        # equivalent to self.tree[''] = None
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, '', None)

    def test_dict_style_set_nonstring_key(self):
        # equivalent to e.g. self.tree[6] = None
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, 6, None)
        self.assertRaises(ValueError, self.tree.__setitem__, None, None)
        self.assertRaises(ValueError, self.tree.__setitem__, [], None)

    def test_dict_style_get_unseen_key(self):
        # __getitem__ implements this part of the [] protocol
        self.assertRaises(KeyError, self.tree.__getitem__, 'unseen')
        self.assertRaises(KeyError, self.tree.__getitem__, '')

    def test_dict_style_containment(self):
        self.tree['foo'] = 5
        self.assertEqual(True, 'foo' in self.tree)
        self.assertEqual(False, '' in self.tree)
        self.assertEqual(False, 'fo' in self.tree)
        self.assertEqual(False, 'o' in self.tree)
        self.assertEqual(False, 'oo' in self.tree)
        self.assertEqual(False, 'f' in self.tree)

    def test_dict_style_len(self):
        self.tree['a'] = None
        self.tree['b'] = [1, 2]
        self.tree['c'] = 12
        self.assertEqual(3, len(self.tree))

    # reminder that we need to figure out which version we're in, and
    # test Python 2 unicode explicitly
    @unittest.expectedFailure
    def test_unicode_in_python2(self):
        self.assertEqual(True, False)

    # key iteration is unimplemented
    @unittest.expectedFailure
    def test_iteration(self):
        self.tree.add("Harry")
        self.tree.add("Hermione")
        self.tree.add("Ron")
        self.assertEqual(set(["Harry", "Hermione", "Ron"]),
                         set(self.tree.keys()))

    # reminder: findall_short should also report subset/overlapping matches
    @unittest.expectedFailure
    def test_subset(self):
        self.tree.add("he")
        self.tree.add("hers")
        self.assertEqual([(0, 2, None), (0, 4, None)],
                         list(self.tree.findall_short("hers")))
class Conversation_Constructor:
    def __init__(self):
        self.COMMENT_LOWERBOUND = 10
        self.COMMENT_UPPERBOUND = 1000
        # Deleted comments with fewer than COMMENT_LOWERBOUND tokens will not be
        # recorded, and thus not considered in comment restoration actions, to
        # reduce confusion.
        self.deleted_records = {}

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        return page

    def load(self, deleted_comments):
        """
          Load the previous page state, deleted comments and other information
        """
        self.deleted_records = {}
        self.previous_comments = NoAho()
        for pair in deleted_comments:
            self.previous_comments.add(pair[0], (pair[1], int(pair[2])))
            self.deleted_records[pair[1]] = True
        return

    def convert_diff_format(self, x, a, b):
        ret = x
        if x['name'] == 'insert':
            ret['tokens'] = b[x['b1']:x['b2']]
        if x['name'] == 'delete':
            ret['tokens'] = a[x['a1']:x['a2']]
        return ret

    def mydiff_toDelta(self, diffs):
        """Crush the diff into a sequence of dictionaries describing the
        changes from one document to another. Each operation is a dictionary
        with a name (insert, delete, equal) and offsets into the original
        and resulting text.

        Args:
          diffs: Array of diff tuples.
        Yields:
          Delta dictionaries.
        """
        text = []
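        # Worked example (hypothetical input): for
        #     diffs = [(0, 'ab'), (-1, 'c'), (1, 'd')]
        # this generator yields, in order,
        #     {'name': 'equal',  'a1': 0, 'a2': 2, 'b1': 0, 'b2': 2}
        #     {'name': 'delete', 'a1': 2, 'a2': 3, 'b1': 2, 'b2': 2}
        #     {'name': 'insert', 'a1': 3, 'a2': 3, 'b1': 2, 'b2': 3}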
        a = 0
        b = 0
        DIFF_DELETE = -1
        DIFF_INSERT = 1
        DIFF_EQUAL = 0

        for (op, data) in diffs:
            if op == DIFF_INSERT:
                yield ({
                    "name": "insert",
                    "a1": a,
                    "a2": a,
                    "b1": b,
                    "b2": b + len(data)
                })
                b += len(data)
            elif op == DIFF_DELETE:
                yield ({
                    "name": "delete",
                    "a1": a,
                    "a2": a + len(data),
                    "b1": b,
                    "b2": b
                })
                a += len(data)
            elif op == DIFF_EQUAL:
                yield ({
                    "name": "equal",
                    "a1": a,
                    "a2": a + len(data),
                    "b1": b,
                    "b2": b + len(data)
                })
                a += len(data)
                b += len(data)

    def clean_dict(self, page, the_dict):
        """
          We only store information about currently 'alive' actions.
          Definition of alive:
             - The action was a deletion that happened recently, hence it might be restored later.
             - The action is still present on the page, hence it might be modified/removed/replied to.
        """
        keylist = list(the_dict.keys())
        ret = the_dict
        alive_actions = set([action[0] for action in page['actions'].values()])
        for action in keylist:
            if not (action in alive_actions or action in self.deleted_records):
                del ret[action]
        return ret

    def process(self, page_state, latest_content, rev):
        logging.debug("DEBUGGING MODE on REVISION %s" % rev['rev_id'])
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE ANYTHING: %d KB." % memory_usage)
        # Clean the HTML format of the revision.
        rev['text'] = clean_html(rev['text'])
        # Compute the diff between the latest processed revision and the current
        # one.
        dmp = dmp_module.diff_match_patch()
        logging.debug("LENGTH : %d -> %d" %
                      (len(latest_content), len(rev['text'])))
        diff = dmp.diff_main(latest_content, rev['text'], False)
        dmp.diff_cleanupSemantic(diff)
        delta = self.mydiff_toDelta(diff)
        rev['diff'] = sorted([self.convert_diff_format(x, latest_content, rev['text']) \
                              for x in delta], key=lambda k: k['a1'])
        # Create a new page if this page was never processed before.
        if not page_state:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
            page_state = {'rev_id': int(rev['rev_id']), \
                          'timestamp': rev['timestamp'], \
                          'page_id': rev['page_id'], \
                          'deleted_comments': [], \
                          'conversation_id': {}, \
                          'authors': {},
                          'ancestor_id': {}}
        else:
            page_state['rev_id'] = int(rev['rev_id'])
            page_state['timestamp'] = rev['timestamp']
            old_page = page_state['page_state']
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE PROCESSING: %d KB." % memory_usage)
        # Process the revision to get the actions and update page state
        actions, updated_page = insert(rev, old_page, self.previous_comments,
                                       self.COMMENT_LOWERBOUND)
        page_state['page_state'] = updated_page
        # Post process of the actions:
        for action in actions:
            # If the action is adding new content
            # - locate which conversation does it belong to
            # - record the name of the author into the author list of the comment
            if action['type'] == 'ADDITION' or action['type'] == 'MODIFICATION' \
               or action['type'] == 'CREATION':
                if action['replyTo_id'] is None:
                    page_state['conversation_id'][action['id']] = action['id']
                else:
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['replyTo_id']]
                if action['type'] == 'MODIFICATION':
                    page_state['authors'][action['id']] = \
                        set(page_state['authors'][action['parent_id']])
                    page_state['authors'][action['id']].add(
                        (action['user_id'], action['user_text']))
                    page_state['ancestor_id'][action['id']] = \
                        page_state['ancestor_id'][action['parent_id']]
                else:
                    page_state['authors'][action['id']] = \
                        set([(action['user_id'], action['user_text'])])
                    page_state['ancestor_id'][action['id']] = action['id']
            else:
                page_state['authors'][action['id']] = \
                    set(page_state['authors'][action['parent_id']])
                page_state['ancestor_id'][action['id']] = \
                    page_state['ancestor_id'][action['parent_id']]

            # Removed and restored comments are considered to belong
            # to the same conversation as their original version.
            if action['type'] == 'DELETION':
                page_state['conversation_id'][action['id']] = \
                         page_state['conversation_id'][action['parent_id']]
            if action['type'] == 'RESTORATION':
                page_state['conversation_id'][action['id']] = \
                         page_state['conversation_id'][action['parent_id']]
            action['conversation_id'] = page_state['conversation_id'][
                action['id']]
            action['authors'] = list(page_state['authors'][action['id']])
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            action['cleaned_content'] = clean(action['content'])
            action['ancestor_id'] = page_state['ancestor_id'][action['id']]
            # If a comment is deleted, it will be added to a list used for
            # identifying restoration actions later. Note that comments that
            # deleted two weeks ago will be removed from the list to ensure
            # memory efficiency. Also comments that are too long or too short
            # are ignored in this case.
            if action['type'] == 'DELETION' and\
                len(action['content']) > self.COMMENT_LOWERBOUND and\
                len(action['content']) < self.COMMENT_UPPERBOUND:
                page_state['deleted_comments'].append(
                    (''.join(action['content']), action['parent_id'],
                     action['indentation']))
                self.deleted_records[action['parent_id']] = True
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))

        page_state['conversation_id'] = self.clean_dict(
            updated_page, page_state['conversation_id'])
        page_state['authors'] = self.clean_dict(updated_page,
                                                page_state['authors'])
        # Set is not JSON serializable.
        page_state['authors'] = {
            action_id: list(authors)
            for action_id, authors in page_state['authors'].items()
        }
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE AFTER POSTPROCESSING: %d KB." %
                      memory_usage)
        return page_state, actions, rev['text']
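

# A minimal sketch (not from the original) of the diff/delta step that process()
# performs internally: diff two revisions with diff_match_patch, then convert
# the result into the dictionary-based format produced by mydiff_toDelta() and
# convert_diff_format(). The dmp_module alias matches its usage in process();
# the import form itself is an assumption.
import diff_match_patch as dmp_module

cc = Conversation_Constructor()
old_text, new_text = 'old comment text', 'new comment text'
dmp = dmp_module.diff_match_patch()
diff = dmp.diff_main(old_text, new_text, False)
dmp.diff_cleanupSemantic(diff)
delta = [cc.convert_diff_format(x, old_text, new_text)
         for x in cc.mydiff_toDelta(diff)]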