def visible_token_viewer():
    if request.method == 'POST':
        data = request.get_json(force=True)
        # Normalize whitespace in the test string before matching
        test_string = data['test_string']
        test_string = ' '.join(test_string.split())

        # Load every non-hidden test page from the static directory into a PageManager
        pageManager = PageManager()
        page_file_dir = os.path.join(app.static_folder, 'visible_tokens_test')
        files = [f for f in os.listdir(page_file_dir)
                 if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.'):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)

        # Keep only the token triples whose preceding invisible buffer ends with the test string
        triples = []
        for triple in pageManager.getVisibleTokenStructure():
            if triple['invisible_token_buffer_before'].endswith(test_string):
                triples.append(triple)

        return jsonify(triples=triples)
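# Illustrative usage sketch (not part of the original module): exercising the view
# above through Flask's test client. The '/visible_token_viewer' route path is an
# assumption about how the view is registered on `app`, and the sample payload
# value is made up.
#
#   with app.test_client() as client:
#       resp = client.post('/visible_token_viewer',
#                          data=json.dumps({'test_string': 'Price:'}),
#                          content_type='application/json')
#       print resp.data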
def lists_on_single_page(self, content):
    pg = PageManager()
    pg.addPage("zzz", content)
    triples = pg.getVisibleTokenStructure()

    # Build a prefix tree over the <div> paths and pull out candidate list paths
    (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')
    potential_lists = self.prefix_tree_to_paths(ptree)
    if self.__DEBUG:
        print '.... POTENTIAL LISTS ARE ....'
        print '\n'.join([''.join(p) for p in potential_lists])
        print '.... OK!....'

    all_tokens_list = pg.getPage("zzz").tokens

    # Now, let's get our lists
    lists = {}
    for i in range(len(potential_lists)):
        pot_list = potential_lists[i]
        as_path = ''.join(pot_list)
        if self.__DEBUG:
            print "PATH: %s" % as_path
        lists[as_path] = {
            'rows': []
        }
        # if as_path in paths_to_vis_text:
        for path_to_vis in paths_to_vis_text:
            if path_to_vis.find(as_path) > -1:
                vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                invis_toks = [t for t in path_to_invis_toks[path_to_vis]]
                for idx in range(len(vis_texts)):
                    if self.__DEBUG:
                        print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                    # Capture the raw HTML between this row's starting token and the next row's
                    html_between_row = ''
                    if (idx + 1) < len(vis_texts):
                        begin = invis_toks[idx].token_location
                        end = invis_toks[idx + 1].token_location - 1
                        html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                    lists[as_path]['rows'].append({
                        'visible_text': vis_texts[idx],
                        'starting_token_location': invis_toks[idx].token_location,
                        'html_between_row': html_between_row
                    })

    as_json_str = json.dumps(lists)
    if self.__DEBUG:
        print "--------"
        print as_json_str
        print "--------"

    # # do it as an extraction instead?
    # item_rule_begin = Landmark.escape_regex_string('<html')
    # item_rule_end = Landmark.escape_regex_string('/html>')
    #
    # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
    #
    # # figure out: for each tag in the rule, add its end tag (keep track of tag type)
    # # NOTE: for now, this assumes that the HTML is well formed
    # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
    #
    # end_iter_rule = end_it
    #
    # # include end-regex: included in the stuff that's extracted.
    # # Solve for the case where you only see part of the stuff
    # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
    #                      begin_iter_rule, end_iter_rule, removehtml=True)
    # extraction = rule.apply(content)
    #
    # print "**PATH: " + ''.join(pot_list)
    # as_json_str = json.dumps(extraction)
    #
    # for seq in extraction['sequence']:
    #     print "\t" + seq['extract']

    # TODO: do this here????
    # TODO: a big drop down the path should be considered... not just if the path occurs twice
    # TODO: fix bugs
    markup = self.creat_row_markup(lists, all_tokens_list, pg)
    if self.__DEBUG:
        print "list markup"
        print json.dumps(markup)
    return markup
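# Illustrative usage sketch (not from the original module): how lists_on_single_page
# might be invoked. "ListExtractor" is a placeholder for whatever class actually
# defines this method, and the sample HTML is made up.
#
#   sample_html = ('<html><body>'
#                  '<div class="row">Item one</div>'
#                  '<div class="row">Item two</div>'
#                  '</body></html>')
#   markup = ListExtractor().lists_on_single_page(sample_html)
#   print json.dumps(markup, indent=2)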