def save_markup():
    """Persist the markup posted for a project, learning it first if needed.

    Reads a JSON body (parsed with ``force=True``) containing:

    * ``project_folder`` -- folder name under
      ``<static_folder>/project_folders``;
    * ``markup`` -- dict holding at least ``__SCHEMA__`` and, when learning
      is required, ``__URLS__``.

    When the first schema entry has no children, the markup and schema are
    learned from the project's pages (see ``_learn_markup``); otherwise the
    posted markup is stored unchanged.  Either way the result is written to
    ``<project dir>/learning/markup.json`` and returned as a JSON response.

    Aborts with 404 for non-POST requests and with 400 when the requested
    project folder (or one of the page keys) resolves outside its base
    directory.

    NOTE: the private helpers are defined *after* this function on purpose,
    so that a decorator placed directly above this ``def`` (not visible in
    this part of the file) keeps binding to ``save_markup``.
    """
    if request.method != 'POST':
        abort(404)

    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']

    projects_root = os.path.join(app.static_folder, 'project_folders')
    directory = os.path.join(projects_root, project_folder)
    # project_folder comes straight from the client: refuse '..' segments or
    # absolute paths that would let the request write outside the projects.
    if not _is_inside(projects_root, directory):
        abort(400)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        markup = _learn_markup(directory, markup)

    # The context manager closes the file; no explicit close() is needed.
    with codecs.open(markup_file, "w", "utf-8") as myfile:
        myfile.write(json.dumps(markup, sort_keys=True, indent=2,
                                separators=(',', ': ')))

    return jsonify(markup)


# Template for a leaf ("item") node of the schema tree.  Always deep-copied
# before use, never handed out directly.
_ITEM_SLOT_TEMPLATE = {
    "id": "j1_2",
    "text": "slot",
    "icon": "glyphicon glyphicon-stop",
    "li_attr": {
        "id": "j1_2"
    },
    "a_attr": {
        "href": "#",
        "id": "j1_2_anchor"
    },
    "state": {
        "loaded": True,
        "opened": False,
        "selected": False,
        "disabled": False
    },
    "data": {},
    "children": [],
    "type": "item"
}

# Template for a "list" node of the schema tree (its children are item nodes).
_LIST_SLOT_TEMPLATE = {
    "a_attr": {
        "href": "#",
        "id": "j1_3_anchor"
    },
    "children": [],
    "data": {},
    "icon": "glyphicon glyphicon-th-list",
    "id": "j1_3",
    "li_attr": {
        "id": "j1_3"
    },
    "state": {
        "disabled": False,
        "loaded": True,
        "opened": False,
        "selected": False
    },
    "text": "category",
    "type": "list"
}


def _is_inside(base_dir, candidate):
    """Return True when ``candidate`` is ``base_dir`` itself or lies below it.

    Both paths are normalised with ``os.path.abspath`` (symlinks are not
    resolved), so ``..`` segments and absolute overrides are caught.
    """
    base = os.path.abspath(base_dir)
    target = os.path.abspath(candidate)
    # os.path.join(base, '') appends exactly one trailing separator.
    return target == base or target.startswith(os.path.join(base, ''))


def _new_schema_node(template, text, count):
    """Deep-copy ``template`` and stamp it with ``text`` and the id ``j1_<count>``."""
    node = copy.deepcopy(template)
    node_id = 'j1_' + str(count)
    node['text'] = text
    node['id'] = node_id
    node['li_attr']['id'] = node_id
    node['a_attr']['id'] = node_id + '_anchor'
    return node


def _overlay_div_locations(div_markup, div_list_markup):
    """Copy token locations from ``div_markup`` onto ``div_list_markup`` in place.

    Only items present in both structures are touched.  Sequence entries
    missing from the target are appended; existing ones only get their
    start/end token locations overwritten.
    """
    for page_id in div_markup:
        for item in div_markup[page_id]:
            if item not in div_list_markup[page_id]:
                continue
            source = div_markup[page_id][item]
            target = div_list_markup[page_id][item]
            for field in ('starting_token_location', 'ending_token_location'):
                if field in source:
                    target[field] = source[field]
            if source['sequence']:
                for idx, val in enumerate(source['sequence']):
                    if len(target['sequence']) <= idx:
                        target['sequence'].append(val)
                    else:
                        target['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                        target['sequence'][idx]['ending_token_location'] = val['ending_token_location']


def _learn_markup(directory, markup):
    """Learn markup and an auto-generated schema from a project's pages.

    ``markup['__URLS__']`` supplies the page file names (relative to
    ``directory``).  ``__SCHEMA__`` and ``__URLS__`` are popped from the
    posted ``markup`` (which is therefore mutated) and re-attached to the
    returned, freshly learned markup dict.
    """
    page_manager = PageManager()
    test_pages = []
    for key in markup['__URLS__']:
        page_file = os.path.join(directory, key)
        # Page keys are client-supplied too; keep reads inside the project.
        if not _is_inside(directory, page_file):
            abort(400)
        with codecs.open(page_file, "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
        page_manager.addPage(key, page_str)
        test_pages.append(page_str)

    schema = markup.pop("__SCHEMA__", None)
    urls = markup.pop("__URLS__", None)

    page_manager.learnStripes()
    list_markup = {}
    list_names = {}
    if LEARN_LISTS:
        # NOTE(review): the div-based learning below is assumed to be gated
        # by LEARN_LISTS together with learnListMarkups() -- confirm against
        # the original layout of this function.
        (list_markup, list_names) = page_manager.learnListMarkups()

        # This is the div learning
        train_pages = {}
        for page_id in page_manager._pages:
            train_pages[page_id] = page_manager.getPage(page_id).getString()
        div_rules, div_markup = DivListLearner().run(train_pages)
        (div_list_markup, div_list_names) = page_manager.listRulesToMarkup(div_rules)
        _overlay_div_locations(div_markup, div_list_markup)

        # Now add these to the list_markup and list_names
        if len(div_rules.rules) > 0:
            for page_id in div_list_markup:
                if page_id not in list_markup:
                    list_markup[page_id] = {}
                list_markup[page_id].update(div_list_markup[page_id])
            list_names.update(div_list_names)

    rule_set = page_manager.learnAllRules()
    rule_set.removeBadRules(test_pages)

    (learned, names) = page_manager.rulesToMarkup(rule_set)
    for key in learned:
        if key in list_markup:
            learned[key].update(list_markup[key])

    # Node ids are j1_<count>; every generated node consumes the next number,
    # starting at 2.
    count = 1

    # Generate the schema from the list slots
    for list_name, child_names in list_names.items():
        count += 1
        list_node = _new_schema_node(_LIST_SLOT_TEMPLATE, list_name, count)
        # now add the children to the auto learned list slot
        children = []
        for name in child_names:
            count += 1
            children.append(_new_schema_node(_ITEM_SLOT_TEMPLATE, name, count))
        list_node['children'] = children
        schema[0]['children'].append(list_node)

    # Generate the schema from the item slots
    for name in names:
        count += 1
        schema[0]['children'].append(
            _new_schema_node(_ITEM_SLOT_TEMPLATE, name, count))

    learned['__SCHEMA__'] = schema
    learned['__URLS__'] = urls
    return learned
def lists_on_single_page(self, content):
    """Detect repeated ``div`` paths on one page and build row markup for them.

    The page is loaded into a throw-away PageManager under the id ``"zzz"``.
    For every candidate path returned by ``prefix_tree_to_paths`` a
    ``{'rows': [...]}`` entry is collected; each row records

    * ``visible_text`` -- the visible text found under a matching path,
    * ``starting_token_location`` -- token location of the row's token,
    * ``html_between_row`` -- the page tokens from this row's location up to
      (but excluding) the next row's location, or ``''`` for the last row.

    :param content: page content handed to ``PageManager.addPage``.
    :return: whatever ``self.creat_row_markup(lists, tokens, page_manager)``
        produces for the collected rows.
    """
    pg = PageManager()
    pg.addPage("zzz", content)

    triples = pg.getVisibleTokenStructure()
    (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

    potential_lists = self.prefix_tree_to_paths(ptree)

    if self.__DEBUG:
        print('.... POTENTIAL LISTS ARE ....')
        print('\n'.join([''.join(p) for p in potential_lists]))
        print('.... OK!....')

    all_tokens_list = pg.getPage("zzz").tokens

    # Now, let's get our lists
    lists = {}

    for pot_list in potential_lists:
        as_path = ''.join(pot_list)
        if self.__DEBUG:
            print("PATH: %s" % as_path)
        lists[as_path] = {'rows': []}
        rows = lists[as_path]['rows']

        # A candidate matches every visible-text path that contains it, not
        # only the identical one.
        for path_to_vis in paths_to_vis_text:
            if as_path not in path_to_vis:
                continue

            vis_texts = list(paths_to_vis_text[path_to_vis])
            invis_toks = list(path_to_invis_toks[path_to_vis])

            for idx, vis_text in enumerate(vis_texts):
                start = invis_toks[idx].token_location
                if self.__DEBUG:
                    print("%s ==> %s" % (vis_text, str(start)))

                html_between_row = ''
                if (idx + 1) < len(vis_texts):
                    end = invis_toks[idx + 1].token_location - 1
                    html_between_row = all_tokens_list.getTokensAsString(start, end, whitespace=True)

                rows.append({
                    'visible_text': vis_text,
                    'starting_token_location': start,
                    'html_between_row': html_between_row
                })

    if self.__DEBUG:
        # Serialise only when the dump is actually going to be printed.
        print("--------")
        print(json.dumps(lists))
        print("--------")

    # NOTE: an earlier, disabled experiment extracted each path with an
    # IterationRule built from the path's tags; it assumed well-formed HTML.
    # TODO: do this here????
    # TODO: big drop down the path should be considered... not just if the path occurs twice
    # TODO: fix bugs
    markup = self.creat_row_markup(lists, all_tokens_list, pg)

    if self.__DEBUG:
        print("list markup")
        # The dump used to be computed and thrown away; print it.
        print(json.dumps(markup))

    return markup
class TruffleShuffle(object):
    """Cluster the pages of a PageManager by the chunks they have in common.

    Pages are loaded either from a JSON-lines file (described by the original
    author as "CDR format": one JSON object per line with ``doc_id`` and
    ``raw_content`` keys) or, when no such file is given, from every
    non-hidden regular file in ``page_file_dir``.
    """

    def __init__(self, page_file_dir='/path/to/dir/', json_lines_file=None):
        """Load all pages into a fresh PageManager.

        :param page_file_dir: directory scanned when ``json_lines_file`` is
            not given; file names become page ids.
        :param json_lines_file: optional path of a JSON-lines file; lines that
            cannot be processed are reported on stdout and skipped.
        """
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()

        if json_lines_file:
            # 'with' makes sure the handle is closed (it used to leak).
            with codecs.open(json_lines_file, "r", "utf-8") as myfile:
                for count, line in enumerate(myfile, 1):
                    try:
                        json_object = json.loads(line)
                        the_file = json_object['doc_id']
                        page_str = json_object['raw_content']
                        self.__page_manager.addPage(the_file, page_str)
                    except Exception:
                        # Best effort by design: skip the line, keep loading.
                        # (Exception, not a bare except, so Ctrl-C still works.)
                        print('Unable to process line %d' % count)
        else:
            for the_file in os.listdir(self.__page_file_dir):
                full_path = os.path.join(self.__page_file_dir, the_file)
                if not os.path.isfile(full_path) or the_file.startswith('.'):
                    continue

                with codecs.open(full_path, "rU", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                self.__page_manager.addPage(the_file, page_str)

    def get_chunk_separator(self):
        """Return the separator used to join chunks into a rule string."""
        return self.__chunkBreakSeparator

    def get_page_manager(self):
        """Return the PageManager holding every loaded page."""
        return self.__page_manager

    def sample_and_learn_template(self, cluster_members, sub_sample_size=5, iterations=10):
        """Induce templates from random sub-samples and report recurring ones.

        On each iteration the members are shuffled, the first
        ``sub_sample_size`` are used to induce a template, and the template is
        summarised by its 10 longest stripes joined with the chunk separator.
        Summaries that occur more than once across iterations are printed.

        Design notes kept from the original author: sample a few pages and
        induce a template; if one template matches most samples it is the
        template of the cluster.  A greedy build (add a page, stop when the
        template no longer changes) would be an alternative.

        :param cluster_members: page ids of one cluster; not modified.
        :param sub_sample_size: number of pages per sample.
        :param iterations: number of samples drawn.
        :return: None (results are printed).
        """
        top_x = 10  # for now...
        # Shuffle a private copy so the caller's list keeps its order.
        members = list(cluster_members)
        stripes = []

        for _ in range(iterations):
            shuffle(members)  # randomly orders them
            random_members = members[0:sub_sample_size]  # get the sub-sample
            template = self.induce_template(random_members)

            stripe_texts = [stripe['stripe'] for stripe in template]

            # Keep only the top_x longest stripes.  The sort is stable, so
            # stripes of equal length stay in template order.
            kept_big_stripes = sorted(stripe_texts, key=len, reverse=True)[:top_x]
            stripes.append(self.__chunkBreakSeparator.join(kept_big_stripes))

        template_occurrences = {}
        for tstr in stripes:
            template_occurrences[tstr] = template_occurrences.get(tstr, 0) + 1

        for sstring in template_occurrences:
            if template_occurrences[sstring] > 1:
                print("Template: %s" % sstring[:250])  # just a little bit
                print("Induced template occurs %d out of %d" % (template_occurrences[sstring], iterations))

    def induce_template(self, cluster_members):
        """Learn and return the stripes shared by the given pages.

        :param cluster_members: iterable of page ids known to the page manager.
        :return: result of ``getStripes()`` on a PageManager holding only
            those pages.
        """
        sub_page_mgr = PageManager()
        for page_id in cluster_members:
            curr_page = self.__page_manager.getPage(page_id)
            sub_page_mgr.addPage(page_id, curr_page.string)
        sub_page_mgr.learnStripes()
        return sub_page_mgr.getStripes()

    def prep_truffles_to_shuffle(self, min_pages_with_chunk=10):
        """Collect the chunks that are useful for clustering.

        A chunk is discarded when it appears on fewer than
        ``min_pages_with_chunk`` pages or on every page.

        :param min_pages_with_chunk: minimum number of pages a chunk must
            appear on to be kept (default 10, the former hard-coded value).
        :return: ``(all_chunks, page_chunks_map)`` -- the set of kept chunks
            and a dict mapping each page id to its own set of kept chunks.
        """
        page_ids = list(self.__page_manager.getPageIds())
        page_chunks_map = {}
        pages_per_chunk = {}

        for page_id in page_ids:
            # Copy: the filtering below must not alter the set object the
            # page manager handed out.
            page_chunks = set(self.__page_manager.getPageChunks(page_id))
            page_chunks_map[page_id] = page_chunks
            # One pass over each page's chunks instead of re-scanning every
            # page for every chunk.
            for chunk in page_chunks:
                pages_per_chunk[chunk] = pages_per_chunk.get(chunk, 0) + 1

        all_pages_sz = len(page_ids)
        chunks_to_remove = set(
            chunk for chunk, num_pages in pages_per_chunk.items()
            if num_pages < min_pages_with_chunk or num_pages == all_pages_sz)

        all_chunks = set(pages_per_chunk)
        all_chunks.difference_update(chunks_to_remove)
        for page_id in page_ids:
            page_chunks_map[page_id].difference_update(chunks_to_remove)

        return all_chunks, page_chunks_map

    def do_truffle_shuffle(self, algorithm='coverage', min_pages_with_chunk=10):
        """Cluster pages according to "rules".

        A "rule" is a list of chunks, and a "chunk" is a section of a Web
        page that is visible to a user.  For every kept chunk, the rule is
        the set of chunks shared by all pages containing that chunk; rules
        with fewer than two chunks are ignored.

        :param algorithm:
            ``'coverage'`` -- process rules by the number of pages they cover,
            small to big (more specific to less specific);
            any other value (e.g. ``'rule_size'``) -- process rules by their
            number of chunks, long rules before short ones.
        :param min_pages_with_chunk: forwarded to
            ``prep_truffles_to_shuffle``.
        :return: ``dict[rule] = {'MEMBERS': [page ids], 'ANCHOR': chunk}``.
            Each page is assigned to the first rule (in processing order)
            that covers it, and only clusters with at least two pages are
            reported.  A rule is the string
            ``chunk_1<BRK>chunk_2<BRK>...<BRK>chunk_N`` with the chunks in
            sorted order; split it on ``get_chunk_separator()`` to recover
            the chunks.  The anchor is the first chunk, in sorted order, that
            produced the rule.
        """
        separator = self.__chunkBreakSeparator
        all_chunks, page_chunks_map = self.prep_truffles_to_shuffle(min_pages_with_chunk)
        page_ids = list(self.__page_manager.getPageIds())

        chunk_counts = {}
        rule_anchors = {}
        # Sorted iteration (and sorted joins below) make rule strings and
        # anchors independent of set/hash ordering.
        for chunk in sorted(all_chunks):
            pages_with_chunk = [page_id for page_id in page_ids
                                if chunk in page_chunks_map[page_id]]

            other_chunks = set(page_chunks_map[pages_with_chunk[0]])
            for page_id in pages_with_chunk:
                other_chunks.intersection_update(page_chunks_map[page_id])

            # one token is not enough, enforce that there are at least 2...
            if len(other_chunks) > 1:
                rule = separator.join(sorted(other_chunks))
                # Equal rules always cover the same pages, so the first
                # anchor is kept and later duplicates are skipped.
                if rule not in chunk_counts:
                    chunk_counts[rule] = pages_with_chunk
                    rule_anchors[rule] = chunk

        if algorithm == 'coverage':
            counts = dict((rule, len(chunk_counts[rule])) for rule in chunk_counts)
        else:
            # Prefer longer rules: negate the size so that the ascending
            # sort below visits long rules first.
            counts = dict((rule, -len(rule.split(separator))) for rule in chunk_counts)

        final_clusters = {}
        already_clustered = set()
        # Ascending by size; ties are broken by the rule text so the outcome
        # is reproducible.
        for rule in sorted(counts, key=lambda r: (counts[r], r)):
            pids = [p for p in chunk_counts[rule] if p not in already_clustered]
            already_clustered.update(pids)
            if len(pids) > 1:
                final_clusters[rule] = {
                    'MEMBERS': pids,
                    'ANCHOR': rule_anchors[rule]
                }

        return final_clusters
def lists_on_single_page(self, content):
    """Build row markup for the repeated ``div`` structures of a single page.

    ``content`` is added to a temporary PageManager as page ``"zzz"``; its
    visible-token structure is turned into a prefix tree restricted to
    ``div`` tags, and every path reported by ``prefix_tree_to_paths`` is
    treated as a potential list.  Each potential list gets a ``rows`` list
    whose entries hold ``visible_text``, ``starting_token_location`` and
    ``html_between_row`` (the tokens between this row's location and the
    token just before the next row; empty for the final row).

    :param content: page content passed to ``PageManager.addPage``.
    :return: the value returned by
        ``self.creat_row_markup(lists, all_tokens_list, pg)``.
    """
    pg = PageManager()
    pg.addPage("zzz", content)

    triples = pg.getVisibleTokenStructure()
    (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(
        triples, only_consider_tag='div')

    potential_lists = self.prefix_tree_to_paths(ptree)

    if self.__DEBUG:
        print('.... POTENTIAL LISTS ARE ....')
        print('\n'.join([''.join(p) for p in potential_lists]))
        print('.... OK!....')

    all_tokens_list = pg.getPage("zzz").tokens

    # Now, let's get our lists
    lists = {}

    for pot_list in potential_lists:
        as_path = ''.join(pot_list)
        if self.__DEBUG:
            print("PATH: %s" % as_path)

        collected_rows = []
        lists[as_path] = {'rows': collected_rows}

        for path_to_vis in paths_to_vis_text:
            # Substring match: longer paths containing the candidate count too.
            if as_path in path_to_vis:
                vis_texts = list(paths_to_vis_text[path_to_vis])
                invis_toks = list(path_to_invis_toks[path_to_vis])
                last_idx = len(vis_texts) - 1

                for idx, vis_text in enumerate(vis_texts):
                    begin = invis_toks[idx].token_location
                    if self.__DEBUG:
                        print("%s ==> %s" % (vis_text, str(begin)))

                    if idx < last_idx:
                        # Stop one token short of where the next row starts.
                        end = invis_toks[idx + 1].token_location - 1
                        html_between_row = all_tokens_list.getTokensAsString(
                            begin, end, whitespace=True)
                    else:
                        html_between_row = ''

                    collected_rows.append({
                        'visible_text': vis_text,
                        'starting_token_location': begin,
                        'html_between_row': html_between_row
                    })

    if self.__DEBUG:
        # The JSON dump is debug output only, so it is built only here.
        as_json_str = json.dumps(lists)
        print("--------")
        print(as_json_str)
        print("--------")

    # NOTE: a disabled alternative applied an IterationRule built from the
    # tags of each path (assuming well-formed HTML) instead of walking tokens.
    # TODO: do this here????
    # TODO: big drop down the path should be considered... not just if the path occurs twice
    # TODO: fix bugs
    markup = self.creat_row_markup(lists, all_tokens_list, pg)

    if self.__DEBUG:
        print("list markup")
        # Previously json.dumps(markup) was evaluated without being printed.
        print(json.dumps(markup))

    return markup