Example #1
def main():
    page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'
    
    pageManager = PageManager()
    files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
    for the_file in files:
        if the_file.startswith('.'):
            continue
        
        with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
        
        pageManager.addPage(the_file, page_str)
    
    pageManager.learnStripes()
    (list_markup, list_names) = pageManager.learnListMarkups()
    
    rule_set = pageManager.learnAllRules()
    (markup, names) = pageManager.rulesToMarkup(rule_set)
    
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])

#     print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': '))
    
    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
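For reference, the rules learned above can be applied back to a page, just as Example #8 below does; a minimal sketch (not part of the original snippet), reusing rule_set and the last page_str read in the loop:

# apply the learned list rules back to the most recently loaded page
extraction_list = rule_set.extract(page_str)
if rule_set.validate(extraction_list):
    print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))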
Example #2
    def __init__(self, page_file_dir='/path/to/dir/', json_lines_file=None):
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()

        if json_lines_file:
            count = 0
            with codecs.open(json_lines_file, "r", "utf-8") as myfile:
                for line in myfile:
                    count += 1
                    try:
                        json_object = json.loads(line)
                        the_file = json_object['doc_id']
                        page_str = json_object['raw_content']
                        self.__page_manager.addPage(the_file, page_str)
                    except Exception:
                        print 'Unable to process line %d' % count
        else:
            files = [f for f in os.listdir(self.__page_file_dir) if os.path.isfile(os.path.join(self.__page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue

                with codecs.open(os.path.join(self.__page_file_dir, the_file), "rU", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')

                self.__page_manager.addPage(the_file, page_str)
Example #3
def run(page_file_dir, ignore_files=(), write_debug_files=False):
    test_pages = []
    pageManager = PageManager(write_debug_files)

    if os.path.isfile(page_file_dir):
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            pageManager.addPage(page_url, page_str)
            test_pages.append(page_url)
    else:
        files = [
            f for f in os.listdir(page_file_dir)
            if os.path.isfile(os.path.join(page_file_dir, f))
        ]
        for the_file in files:
            if (the_file.startswith('.') or the_file in ('markup.json', 'rules.json')
                    or the_file in ignore_files):
                continue

            with codecs.open(os.path.join(page_file_dir, the_file), "r",
                             "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')

            pageManager.addPage(the_file, page_str)
            test_pages.append(the_file)

    pageManager.learnStripes()

    ##table, ul, etc. list learning
    #         (list_markup, list_names) = pageManager.learnListMarkups()
    #         list_rules = pageManager.learnRulesFromMarkup(list_markup)

    ##div learning
    #         train_pages = {}
    #         for page_id in pageManager._pages:
    #             train_pages[page_id] = pageManager.getPage(page_id).getString()
    #         d = DivListLearner()
    #         div_rules, div_markup = d.run(train_pages)

    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)

    #         for rule in list_rules.rules:
    #             rule_set.add_rule(rule)
    #
    #         for rule in div_rules.rules:
    #             rule_set.add_rule(rule)
    return rule_set
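A hypothetical driver for run(); the directory path and ignore list are illustrative, not from the source:

rule_set = run('/path/to/pages', ignore_files=('old_rules.json',))
print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))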
Example #4
def markup_on_page():
    if request.method == 'POST':
        data = request.get_json(force=True)
        file_name = data['file_name']
        project_folder = data['project_folder']

        markup = data['markup']

        sample_file = os.path.join(app.static_folder, 'project_folders', project_folder, file_name)
        
        with codecs.open(sample_file, "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')

        page_manager = PageManager()
        page_manager.addPage(file_name, page_str)
        shortest_pairs = page_manager.getPossibleLocations(file_name, markup)
        return jsonify(shortest_pairs=shortest_pairs)
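A sketch of the request this handler expects; the route path and host are assumptions, since only the handler body is shown:

import requests  # hypothetical client for the endpoint above
resp = requests.post('http://localhost:5000/markup_on_page', json={
    'file_name': 'page1.html',    # a file under static/project_folders/<project_folder>/
    'project_folder': 'demo',
    'markup': {'title': {}},      # slot markup to locate on the page (shape assumed)
})
print resp.json()['shortest_pairs']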
Example #6
def do_learning():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as myfile:
            json_str = myfile.read().encode('utf-8')
        markup = json.loads(json_str)

        pageManager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)

        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)

        pageManager.learnStripes(markup)
        rule_set = pageManager.learnRulesFromMarkup(markup)

        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as myfile:
            myfile.write(json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(rules=json.loads(rule_set.toJson()))
    abort(404)
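The markup file read by this handler is keyed by page filename, plus two bookkeeping entries that are popped before learning. A hypothetical minimal learning/markup.json, with its shape inferred from this handler and Example #20:

# learning/markup.json -- hypothetical contents:
# {
#   "__SCHEMA__": [ ... ],                  # jstree slot schema; see Example #20
#   "__URLS__":   { "page1.html": ... },    # keys name the page files to load
#   "page1.html": { "title": { ... } }      # per-page slot markup
# }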
Example #7
    def induce_template(self, cluster_members):
        sub_page_mgr = PageManager()
        for id in cluster_members:
            curr_page = self.__page_manager.getPage(id)
            sub_page_mgr.addPage(id, curr_page.string)
        sub_page_mgr.learnStripes()
        return sub_page_mgr.getStripes()
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.error, msg:
            raise Usage(msg)
        
        #read the directory location from arg0
        page_file_dir = args[0]
        
        pageManager = PageManager()
        page_str_array = []
        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                
                with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    page_str_array.append(page_str)
                    
                pageManager.addPage(the_file, page_str)
                
        pageManager.learnStripes()
        
        #Read the markups from a file...
        markups_file = args[1]
        with codecs.open(markups_file, "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)
        
        markups.pop("__SCHEMA__", None)
        
        #Before we learn the stripes let's make sure we can open the output file
        pageManager.learnStripes(markups)
        rule_set = pageManager.learnRulesFromMarkup(markups)
        
        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())
        
        #testing
        flatten = False
        extraction_list = rule_set.extract(page_str_array[0])
        
        if rule_set.validate(extraction_list):
            if flatten:
                print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': '))
            else:
                print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
Example #9
def visible_token_viewer():
    if request.method == 'POST':
        data = request.get_json(force=True)
        test_string = data['test_string']
        test_string = ' '.join(test_string.split())
        pageManager = PageManager()
        page_file_dir = os.path.join(app.static_folder, 'visible_tokens_test')
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.'):
                continue
            
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            
            pageManager.addPage(the_file, page_str)
        triples = []
        for triple in pageManager.getVisibleTokenStructure():
            if triple['invisible_token_buffer_before'].endswith(test_string):
                triples.append(triple)
        return jsonify(triples=triples)
Example #10
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            
            write_debug_files = False
            
            for opt in opts:
                if opt in [('-d', ''), ('--debug', '')]:
                    write_debug_files = True
                if opt in [('-h', ''), ('--help', '')]:
                    raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] \n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)
        
        logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0])
        
        #read the directory location from arg0
        page_file_dir = args[0]
        
        pageManager = PageManager(write_debug_files)
        
        if os.path.isfile(page_file_dir):
            with open(page_file_dir) as f:
                urls = f.readlines()
            for url in urls:
                page_url = url.strip()
                req = urllib2.urlopen(page_url)
                page_contents = req.read()
                charset = chardet.detect(page_contents)
                page_encoding = charset['encoding']
                page_str = page_contents.decode(page_encoding).encode('utf-8')
                pageManager.addPage(page_url, page_str)
        else:
            files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                 
                with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                     
                pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()
        rule_set = pageManager.learnAllRules()
          
        print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
Example #11
def run(page_file_dir, ignore_files=(), write_debug_files=False):
    test_pages = []
    pageManager = PageManager(write_debug_files)
    
    if os.path.isfile(page_file_dir):
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            pageManager.addPage(page_url, page_str)
            test_pages.append(page_url)
    else:
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if (the_file.startswith('.') or the_file in ('markup.json', 'rules.json')
                    or the_file in ignore_files):
                continue
             
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                 
            pageManager.addPage(the_file, page_str)
            test_pages.append(the_file)

    pageManager.learnStripes()
    
    ##table, ul, etc. list learning
#         (list_markup, list_names) = pageManager.learnListMarkups()
#         list_rules = pageManager.learnRulesFromMarkup(list_markup)
    
    ##div learning
#         train_pages = {}
#         for page_id in pageManager._pages:
#             train_pages[page_id] = pageManager.getPage(page_id).getString()
#         d = DivListLearner()
#         div_rules, div_markup = d.run(train_pages)
    
    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)
    
#         for rule in list_rules.rules:
#             rule_set.add_rule(rule)
#         
#         for rule in div_rules.rules:
#             rule_set.add_rule(rule)
    return rule_set
Example #13
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            
            write_debug_files = False
            
            for opt in opts:
                if opt in [('-d', ''), ('--debug', '')]:
                    write_debug_files = True
                if opt in [('-h', ''), ('--help', '')]:
                    raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)
        
        logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1])
        
        #read the directory location from arg0
        page_file_dir = args[0]
        
        pageManager = PageManager(write_debug_files)
        
        start_time = time.time()
        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                
                with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    
                pageManager.addPage(the_file, page_str)
        logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time))
        
        #Read the markups from a file...
        start_time = time.time()
        markups_file = args[1]
        with codecs.open(markups_file, "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)
        
        markups.pop("__SCHEMA__", None)
        markups.pop("__URLS__", None)
        
        logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time))

        pageManager.learnStripes(markups)
        start_time = time.time()
        rule_set = pageManager.learnRulesFromMarkup(markups)
        logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time))
         
        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())
        else:
            print rule_set.toJson()
Example #14
    def learn_list_extractors(self, pages):
        page_mgr = PageManager()  # optionally PageManager(write_debug_files=True)

        markup = {}
        for page in pages:
            page_content = pages[page]
            page_mgr.addPage(page, page_content)
            content_list_markup = self.lists_on_single_page(page_content)
            markup[page] = content_list_markup

#         print '--- MARKUP ---'
#         print json.dumps(markup)
        page_mgr.learnStripes(markups=markup)
        rules = page_mgr.learnRulesFromMarkup(markup)

        # now, for each markup rule, learn a little page manager
        sublist_page_managers = {}
        for page in markup:
            for rule_name in markup[page]:
                if rule_name not in sublist_page_managers:
                    sublist_page_managers[rule_name] = PageManager()
                for rid in range(len(markup[page][rule_name]['sequence'])):
                    row = markup[page][rule_name]['sequence'][rid]
                    sublist_page_managers[rule_name].addPage(
                        page + "html%d" % rid, row['extract'])

        sublist_sub_rules = {}
        for sublist in sublist_page_managers:
            sublist_page_managers[sublist].learnStripes()
            sub_rules = sublist_page_managers[sublist].learnAllRules(
                in_list=True)
            sublist_sub_rules[sublist] = sub_rules  # This should match a rule name in the rules...

        count = 1
        for rule in rules.rules:
            #             print "== RULE INFO =="
            #             print str(rule.name)
            rule.set_sub_rules(sublist_sub_rules[rule.name])
            list_name = '_div_list' + format(count, '04')
            for page_id in markup:
                if rule.name in markup[page_id]:
                    markup[page_id][list_name] = markup[page_id].pop(rule.name)
            rule.name = list_name
            count += 1
#             print str(json.dumps(rule.toJson()))
#             print "==============="
#
#         print rules.toJson()

        return rules, markup
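A short driver sketch for learn_list_extractors; `learner` stands in for an instance of the enclosing class, and the directory is illustrative:

pages = {}
for f in os.listdir('/path/to/pages'):
    with codecs.open(os.path.join('/path/to/pages', f), 'r', 'utf-8') as fh:
        pages[f] = fh.read()
rules, markup = learner.learn_list_extractors(pages)
print rules.toJson()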
Example #15
    def __init__(self, page_file_dir='/path/to/dir/'):
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()

        files = [f for f in os.listdir(self.__page_file_dir) if os.path.isfile(os.path.join(self.__page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.'):
                continue

            with codecs.open(os.path.join(self.__page_file_dir, the_file), "rU", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')

            self.__page_manager.addPage(the_file, page_str)
Example #16
    def learn_list_extractors(self, pages):
        page_mgr = PageManager()  # optionally PageManager(write_debug_files=True)

        markup = {}
        for page in pages:
            page_content = pages[page]
            page_mgr.addPage(page, page_content)
            content_list_markup = self.lists_on_single_page(page_content)
            markup[page] = content_list_markup

#         print '--- MARKUP ---'
#         print json.dumps(markup)
        page_mgr.learnStripes(markups=markup)
        rules = page_mgr.learnRulesFromMarkup(markup)


        # now, for each markup rule, learn a little page manager
        sublist_page_managers = {}
        for page in markup:
            for rule_name in markup[page]:
                if rule_name not in sublist_page_managers:
                    sublist_page_managers[rule_name] = PageManager()
                for rid in range(len(markup[page][rule_name]['sequence'])):
                    row = markup[page][rule_name]['sequence'][rid]
                    sublist_page_managers[rule_name].addPage(page+"html%d" % rid, row['extract'])

        sublist_sub_rules = {}
        for sublist in sublist_page_managers:
            sublist_page_managers[sublist].learnStripes()
            sub_rules = sublist_page_managers[sublist].learnAllRules(in_list = True)
            sublist_sub_rules[sublist] = sub_rules  # This should match a rule name in the rules...

        count = 1 
        for rule in rules.rules:
#             print "== RULE INFO =="
#             print str(rule.name)
            rule.set_sub_rules(sublist_sub_rules[rule.name])
            list_name = '_div_list'+format(count, '04')
            for page_id in markup:
                if rule.name in markup[page_id]:
                    markup[page_id][list_name] = markup[page_id].pop(rule.name)
            rule.name = list_name
            count += 1
#             print str(json.dumps(rule.toJson()))
#             print "==============="
#             
#         print rules.toJson()
        
        
        return rules, markup
Example #17
def autolearn_grid():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        page_urls = data['urls']
        
        page_manager = PageManager()
        results = {}
        for page_url in page_urls:
            page_contents = urllib2.urlopen(page_url).read()
            page_manager.addPage(page_url, page_contents)
            
        page_manager.learnStripes()
        rule_set = page_manager.learnAllRules()
        results['rules'] = json.loads(rule_set.toJson())
        
        return jsonify(results)
    
    abort(404)
Example #19
    def lists_on_single_page(self, content):
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {
                'rows': []
            }

            # if as_path in paths_to_vis_text:
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                        html_between_row = ''
                        if (idx+1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx+1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text': vis_texts[idx],
                            'starting_token_location': invis_toks[idx].token_location,
                            'html_between_row': html_between_row
                        })
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: a big drop down the path should be considered... not just whether the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            print json.dumps(markup)
        return markup
Example #20
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
              "id": "j1_2",
              "text": "slot",
              "icon": "glyphicon glyphicon-stop",
              "li_attr": {
                "id": "j1_2"
              },
              "a_attr": {
                "href": "#",
                "id": "j1_2_anchor"
              },
              "state": {
                "loaded": True,
                "opened": False,
                "selected": False,
                "disabled": False
              },
              "data": {},
              "children": [],
              "type": "item"
            }
            
            list_slot = {
             "a_attr": {
                "href": "#",
                "id": "j1_3_anchor"
              },
              "children": [],
              "data": {},
              "icon": "glyphicon glyphicon-th-list",
              "id": "j1_3",
              "li_attr": {
                "id": "j1_3"
              },
              "state": {
                "disabled": False,
                "loaded": True,
                "opened": False,
                "selected": False
              },
              "text": "category",
              "type": "list"
            }

            pageManager = PageManager()
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            (list_markup, list_names) = pageManager.learnListMarkups()
            rule_set = pageManager.learnAllRules()
            (markup, names) = pageManager.rulesToMarkup(rule_set)

            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                schema[0]['children'].append(auto_markup_slot)
            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
Example #21
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])

            write_debug_files = False

            for opt in opts:
                if opt in [('-d', ''), ('--debug', '')]:
                    write_debug_files = True
                if opt in [('-h', ''), ('--help', '')]:
                    raise Usage(
                        'python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files'
                    )
        except getopt.error, msg:
            raise Usage(msg)

        logger.info('Running RuleLearner with file at %s for rules %s',
                    args[0], args[1])

        #read the directory location from arg0
        page_file_dir = args[0]

        pageManager = PageManager(write_debug_files)

        start_time = time.time()
        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue

                with codecs.open(os.path.join(subdir, the_file), "r",
                                 "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')

                pageManager.addPage(the_file, page_str)
        logger.info("--- LOAD PAGES: %s seconds ---" %
                    (time.time() - start_time))

        #Read the markups from a file...
        start_time = time.time()
        markups_file = args[1]
        with codecs.open(markups_file, "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)

        markups.pop("__SCHEMA__", None)
        markups.pop("__URLS__", None)

        logger.info("--- LOAD MARKUPS: %s seconds ---" %
                    (time.time() - start_time))

        pageManager.learnStripes(markups)
        start_time = time.time()
        rule_set = pageManager.learnRulesFromMarkup(markups)
        logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" %
                    (time.time() - start_time))

        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())
        else:
            print rule_set.toJson()
Example #22
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
              "id": "j1_2",
              "text": "slot",
              "icon": "glyphicon glyphicon-stop",
              "li_attr": {
                "id": "j1_2"
              },
              "a_attr": {
                "href": "#",
                "id": "j1_2_anchor"
              },
              "state": {
                "loaded": True,
                "opened": False,
                "selected": False,
                "disabled": False
              },
              "data": {},
              "children": [],
              "type": "item"
            }
            
            list_slot = {
             "a_attr": {
                "href": "#",
                "id": "j1_3_anchor"
              },
              "children": [],
              "data": {},
              "icon": "glyphicon glyphicon-th-list",
              "id": "j1_3",
              "li_attr": {
                "id": "j1_3"
              },
              "state": {
                "disabled": False,
                "loaded": True,
                "opened": False,
                "selected": False
              },
              "text": "category",
              "type": "list"
            }

            pageManager = PageManager()
            test_pages = []
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)
                test_pages.append(page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            list_markup = {}
            list_names = {}
            if LEARN_LISTS:
                (list_markup, list_names) = pageManager.learnListMarkups()
                
                #This is the div learning
                train_pages = {}
                for page_id in pageManager._pages:
                    train_pages[page_id] = pageManager.getPage(page_id).getString()
                d = DivListLearner()
                div_rules, div_markup = d.run(train_pages)
                 
                (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
                
                for page_id in div_markup:
                    for item in div_markup[page_id]:
                        if item in div_list_markup[page_id]:
                            if 'starting_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['starting_token_location'] = div_markup[page_id][item]['starting_token_location']
                            if 'ending_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['ending_token_location'] = div_markup[page_id][item]['ending_token_location']
                            if div_markup[page_id][item]['sequence']:
                                for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                    if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                        div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                    else:
                                        div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                        div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = val['ending_token_location']
                
                #Now add these to the list_markup and list_names
                if len(div_rules.rules) > 0:
                    for page_id in div_list_markup:
                        if page_id not in list_markup:
                            list_markup[page_id] = {}
                        list_markup[page_id].update(div_list_markup[page_id])
                    list_names.update(div_list_names)
            
            rule_set = pageManager.learnAllRules()
            rule_set.removeBadRules(test_pages)
            
            (markup, names) = pageManager.rulesToMarkup(rule_set)

            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                schema[0]['children'].append(auto_markup_slot)
            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
Example #23
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.error, msg:
            raise Usage(msg)

        #read the directory location from arg0
        page_file_dir = args[0]

        pageManager = PageManager()
        page_str_array = []
        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue

                with codecs.open(os.path.join(subdir, the_file), "r",
                                 "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    page_str_array.append(page_str)

                pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()

        #Read the markups from a file...
        markups_file = args[1]
        with codecs.open(markups_file, "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)

        markups.pop("__SCHEMA__", None)

        #Before we learn the stripes let's make sure we can open the output file
        pageManager.learnStripes(markups)
        rule_set = pageManager.learnRulesFromMarkup(markups)

        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())

        #testing
        flatten = False
        extraction_list = rule_set.extract(page_str_array[0])

        if rule_set.validate(extraction_list):
            if flatten:
                print json.dumps(Landmark.flattenResult(extraction_list),
                                 sort_keys=True,
                                 indent=2,
                                 separators=(',', ': '))
            else:
                print json.dumps(extraction_list,
                                 sort_keys=True,
                                 indent=2,
                                 separators=(',', ': '))
Example #24
    def lists_on_single_page(self, content):
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {'rows': []}

            # if as_path in paths_to_vis_text:
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                        html_between_row = ''
                        if (idx + 1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx + 1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text': vis_texts[idx],
                            'starting_token_location': invis_toks[idx].token_location,
                            'html_between_row': html_between_row
                        })
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: a big drop down the path should be considered... not just whether the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            print json.dumps(markup)
        return markup
Example #25
class TruffleShuffle(object):

    #json lines file is of the CDR format
    def __init__(self, page_file_dir='/path/to/dir/', json_lines_file=None):
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()
        
        if json_lines_file:
            count = 0
            with codecs.open(json_lines_file, "r", "utf-8") as myfile:
                for line in myfile:
                    count += 1
                    try:
                        json_object = json.loads(line)
                        the_file = json_object['doc_id']
                        page_str = json_object['raw_content']
                        self.__page_manager.addPage(the_file, page_str)
                    except Exception:
                        print 'Unable to process line %d' % count
        else:
            files = [f for f in os.listdir(self.__page_file_dir) if os.path.isfile(os.path.join(self.__page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
    
                with codecs.open(os.path.join(self.__page_file_dir, the_file), "rU", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
    
                self.__page_manager.addPage(the_file, page_str)

    def get_chunk_separator(self):
        return self.__chunkBreakSeparator

    def get_page_manager(self):
        return self.__page_manager

    # so maybe you table randome samples of 3 pages, and induce a template
    # if you find a template that is similar (or matches) most, then that is the template for this cluster?
    # or you could do a greedy build or something (e.g., add another page and if it doesn't change, you are good)
    def sample_and_learn_template(self, cluster_members, sub_sample_size=5, iterations=10):
        stripes = []
        for itr in range(iterations):
            shuffle(cluster_members) # randomly orders them
            random_members = cluster_members[0:sub_sample_size] # get the sub-sample
            template = self.induce_template(random_members)

            stripe_texts = []
            for stripe in template:
                stripe_text = stripe['stripe']
                stripe_texts.append(stripe_text)

            # now, only keep the top X longest stripes and see what it does...
            top_x = 10  # for now...
            stripes_by_size = {}
            for stpe in stripe_texts:
                stsz = len(stpe)
                if stsz not in stripes_by_size:
                    stripes_by_size[stsz] = []
                stripes_by_size[stsz].append(stpe)

            top_sizes = sorted(stripes_by_size.keys(), reverse=True)
            kept_big_stripes = []
            for tsz in top_sizes:
                kept_big_stripes.extend(stripes_by_size[tsz])
                if len(kept_big_stripes) > top_x:
                    break
            # stripes_string = self.__chunkBreakSeparator.join(stripe_texts)
            stripes_string = self.__chunkBreakSeparator.join(kept_big_stripes[:top_x])
            stripes.append(stripes_string)

        template_occurrences = {}
        for tstr in stripes:
            template_occurrences[tstr] = stripes.count(tstr)

        for sstring in template_occurrences:
            if template_occurrences[sstring] > 1:
                print "Template: %s" % sstring[:250]  # just a little bit
                print "Induced template occurs %d out of %d" % (template_occurrences[sstring], iterations)

    def induce_template(self, cluster_members):
        sub_page_mgr = PageManager()
        for id in cluster_members:
            curr_page = self.__page_manager.getPage(id)
            sub_page_mgr.addPage(id, curr_page.string)
        sub_page_mgr.learnStripes()
        return sub_page_mgr.getStripes()

    def prep_truffles_to_shuffle(self):
        all_chunks = set()
        page_chunks_map = {}
        for page_id in self.__page_manager.getPageIds():
            page_chunks = self.__page_manager.getPageChunks(page_id)
            all_chunks.update(page_chunks)
            page_chunks_map[page_id] = page_chunks

        chunks_to_remove = set()
        all_pages_sz = len(self.__page_manager.getPageIds())
        for chunk in all_chunks:
            num_pages_with_chunk = 0
            for page_id in self.__page_manager.getPageIds():
                if chunk in page_chunks_map[page_id]:
                    num_pages_with_chunk += 1
            if num_pages_with_chunk < 10 or num_pages_with_chunk == all_pages_sz:
                chunks_to_remove.add(chunk)

#         print str(len(all_chunks)) + " chunks before filtering"
        all_chunks.difference_update(chunks_to_remove)
        for page_id in self.__page_manager.getPageIds():
            page_chunks_map[page_id].difference_update(chunks_to_remove)

#         print str(len(all_chunks)) + " chunks left after filtering"
#         print str(all_pages_sz) + " pages total"
        return all_chunks, page_chunks_map

    ##############################
    #
    # Clusters pages according to "rules". A "rule" is a list of chunks, and a "chunk" is a section of a Web page
    # that is visible to a user.
    #
    # Inputs:
    #   algorithm: 'rule_size': cluster by the size of rule from long rules to short rules
    #               'coverage' : cluster by the number of pages covered by a rule, small to big (more specific to less)
    #
    # Outputs:
    #   dict[rule] = {
    #       'MEMBERS': list of page ids (Pids from the PageManager),
    #       'ANCHOR': the anchoring chunk for this cluster
    #    }
    #   That is, each entry is a rule and its value is a dict. Note that an anchor is unique
    #   Each rule is a string of chunk_1<BRK>chunk_2<BRK>...<BRK>chunk_N
    #   it's a string to make it an index, but to use it you could break on <BRK>
    #  which you can get from the method get_chunk_separator()
    #
    ##############################
    def do_truffle_shuffle(self, algorithm='coverage'):
        all_chunks, page_chunks_map = self.prep_truffles_to_shuffle()
        chunk_counts = {}
        seen_rules = []
        rule_anchors = {}
        for chunk in all_chunks:
            pages_with_chunk = []
            for page_id in self.__page_manager.getPageIds():
                if chunk in page_chunks_map[page_id]:
                    pages_with_chunk.append(page_id)
            other_chunks = set()
            other_chunks.update(page_chunks_map[pages_with_chunk[0]])
            for page_id in pages_with_chunk:
                other_chunks.intersection_update(page_chunks_map[page_id])

            # now, find all the guys that have all of those chunks...
            if len(other_chunks) > 1: # one token is not enough, enforce that there are at least 2...
                rule = self.__chunkBreakSeparator.join(other_chunks)
                if rule not in seen_rules:
                    chunk_counts[rule] = pages_with_chunk
                    rule_anchors[rule] = chunk

        if algorithm == 'coverage':
            counts = dict([(rule, len(chunk_counts[rule])) for rule in chunk_counts])
        else:
            # count by the size of the rule, but prefer longer rules: store the
            # size as a negative number so the sorted() call below (small to large)
            # yields longer rules (more negative) before shorter ones (less negative)
            counts = dict([(rule, -len(rule.split(self.__chunkBreakSeparator))) for rule in chunk_counts])

        inverted = {}
        for rl in counts:
            sz = counts[rl]
            if sz not in inverted:
                inverted[sz] = []
            inverted[sz].append(rl)
        final_clusters = {}
        already_clustered = []
        for size in sorted(inverted.keys()):
            rules = inverted[size]
            for rule in rules:
                pids = [p for p in chunk_counts[rule] if p not in already_clustered]
                already_clustered.extend(pids)
                if len(pids) > 1:
                    final_clusters[rule] = {
                        'MEMBERS': pids,
                        'ANCHOR': rule_anchors[rule]
                    }

        return final_clusters
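A short usage sketch tying the pieces together (paths illustrative); the cluster dict format is documented in the comment block above do_truffle_shuffle:

ts = TruffleShuffle(page_file_dir='/path/to/dir/')
clusters = ts.do_truffle_shuffle(algorithm='coverage')
sep = ts.get_chunk_separator()
for rule, info in clusters.items():
    print 'anchor chunk: %s' % info['ANCHOR']
    print '%d chunks in rule, %d member pages' % (len(rule.split(sep)), len(info['MEMBERS']))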