def do_learning():
    """Flask endpoint: learn extraction rules for a project's saved markup.

    Expects a POST whose JSON body contains 'project_folder'. Loads the
    project's markup.json and the pages it lists, learns stripes and rules,
    writes the rules to learning/rules.json, and returns them as JSON.
    Any non-POST request gets a 404.
    """
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)

        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as myfile:
            json_str = myfile.read().encode('utf-8')
        markup = json.loads(json_str)

        # Load every page referenced by the markup's __URLS__ bookkeeping key.
        pageManager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)

        # Drop bookkeeping entries so only real slots remain in the markup.
        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)

        pageManager.learnStripes(markup)
        rule_set = pageManager.learnRulesFromMarkup(markup)

        # FIX: parse the rule set's JSON once and reuse it for both the file
        # and the HTTP response (it was previously parsed twice); also removed
        # the redundant myfile.close() -- the with-statement closes the file.
        rules_json = json.loads(rule_set.toJson())
        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as myfile:
            myfile.write(json.dumps(rules_json, sort_keys=True, indent=2, separators=(',', ': ')))
        return jsonify(rules=rules_json)
    abort(404)
def main(): page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test' pageManager = PageManager() files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() (list_markup, list_names) = pageManager.learnListMarkups() rule_set = pageManager.learnAllRules() (markup, names) = pageManager.rulesToMarkup(rule_set) for key in markup.keys(): if key in list_markup: markup[key].update(list_markup[key]) # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')) rule_set = pageManager.learnRulesFromMarkup(list_markup) print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def learn_list_extractors(self, pages):
    """Learn list extraction rules across *pages*.

    pages: dict mapping page name -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup re-keyed by those new rule names.
    """
    page_mgr = PageManager()  # write_debug_files=True
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each markup rule, build a small PageManager over the rows of its
    # sequence so sub-rules can be learned within the list items.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # should match a rule name in the rules

    # Rename each list rule to a sequential '_div_listNNNN' and move the
    # markup entries to the new key.
    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # BUG FIX: count was never incremented, so every rule got the name
        # '_div_list0001' and the markup entries collapsed onto one key.
        count += 1

    return rules, markup
def learn_list_extractors(self, pages):
    """Learn list extraction rules across *pages*.

    pages: dict mapping page name -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup re-keyed by those new rule names.
    """
    page_mgr = PageManager()  # write_debug_files=True
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        content_list_markup = self.lists_on_single_page(page_content)
        markup[page] = content_list_markup

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each markup rule, learn a little page manager over the rows of its
    # sequence so sub-rules can be learned within the list items.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # should match a rule name in the rules

    # Rename each list rule to a sequential '_div_listNNNN' and move the
    # markup entries to the new key.
    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # BUG FIX: count was never incremented, so every rule got the name
        # '_div_list0001' and the markup entries collapsed onto one key.
        count += 1

    return rules, markup
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage( 'python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files' ) except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()