def do_learning():
    """Flask endpoint: learn extraction rules for a project's saved markup.

    Expects a POST whose JSON body contains 'project_folder'. Loads the
    project's markup.json and the pages it lists, learns stripes and rules,
    writes the rules to learning/rules.json, and returns them as JSON.
    Any non-POST request gets a 404.
    """
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)

        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as myfile:
            json_str = myfile.read().encode('utf-8')
        markup = json.loads(json_str)

        # Load every page referenced by the markup's __URLS__ bookkeeping key.
        pageManager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)

        # Drop bookkeeping entries so only real slots remain in the markup.
        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)

        pageManager.learnStripes(markup)
        rule_set = pageManager.learnRulesFromMarkup(markup)

        # FIX: parse the rule set's JSON once and reuse it for both the file
        # and the HTTP response (it was previously parsed twice); also removed
        # the redundant myfile.close() -- the with-statement closes the file.
        rules_json = json.loads(rule_set.toJson())
        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as myfile:
            myfile.write(json.dumps(rules_json, sort_keys=True, indent=2, separators=(',', ': ')))
        return jsonify(rules=rules_json)
    abort(404)
def main(): page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test' pageManager = PageManager() files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() (list_markup, list_names) = pageManager.learnListMarkups() rule_set = pageManager.learnAllRules() (markup, names) = pageManager.rulesToMarkup(rule_set) for key in markup.keys(): if key in list_markup: markup[key].update(list_markup[key]) # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')) rule_set = pageManager.learnRulesFromMarkup(list_markup) print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def learn_list_extractors(self, pages):
    """Learn list extraction rules across *pages*.

    pages: dict mapping page name -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup re-keyed by those new rule names.
    """
    page_mgr = PageManager()  # write_debug_files=True
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each markup rule, build a small PageManager over the rows of its
    # sequence so sub-rules can be learned within the list items.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # should match a rule name in the rules

    # Rename each list rule to a sequential '_div_listNNNN' and move the
    # markup entries to the new key.
    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # BUG FIX: count was never incremented, so every rule got the name
        # '_div_list0001' and the markup entries collapsed onto one key.
        count += 1

    return rules, markup
def learn_list_extractors(self, pages):
    """Learn list extraction rules across *pages*.

    pages: dict mapping page name -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup re-keyed by those new rule names.
    """
    page_mgr = PageManager()  # write_debug_files=True
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        content_list_markup = self.lists_on_single_page(page_content)
        markup[page] = content_list_markup

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each markup rule, learn a little page manager over the rows of its
    # sequence so sub-rules can be learned within the list items.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # should match a rule name in the rules

    # Rename each list rule to a sequential '_div_listNNNN' and move the
    # markup entries to the new key.
    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # BUG FIX: count was never incremented, so every rule got the name
        # '_div_list0001' and the markup entries collapsed onto one key.
        count += 1

    return rules, markup
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage( 'python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files' ) except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()