Example #1
0
def visible_token_viewer():
    """Flask endpoint: return visible-token triples whose preceding
    invisible-token buffer ends with the POSTed ``test_string``.

    Expects a JSON body ``{"test_string": ...}``; the string is
    whitespace-normalized before matching. Pages are loaded from the
    ``visible_tokens_test`` directory under the app's static folder.
    """
    if request.method == 'POST':
        payload = request.get_json(force=True)
        # Collapse every run of whitespace down to a single space.
        query = ' '.join(payload['test_string'].split())

        manager = PageManager()
        pages_dir = os.path.join(app.static_folder, 'visible_tokens_test')
        for entry in os.listdir(pages_dir):
            full_path = os.path.join(pages_dir, entry)
            # Skip directories and hidden files (e.g. .DS_Store).
            if entry.startswith('.') or not os.path.isfile(full_path):
                continue
            with codecs.open(full_path, "r", "utf-8") as handle:
                manager.addPage(entry, handle.read().encode('utf-8'))

        matches = [t for t in manager.getVisibleTokenStructure()
                   if t['invisible_token_buffer_before'].endswith(query)]
        return jsonify(triples=matches)
    def lists_on_single_page(self, content):
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {
                'rows': []
            }

            # if as_path in paths_to_vis_text:
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                        html_between_row = ''
                        if (idx+1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx+1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text': vis_texts[idx],
                            'starting_token_location': invis_toks[idx].token_location,
                            'html_between_row': html_between_row
                        })
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if hte path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            json.dumps(markup)
        return markup
Example #3
0
    def lists_on_single_page(self, content):
        """Detect repeated <div> path structures ("lists") in one HTML page.

        The page *content* is registered under the fixed name "zzz", a
        prefix tree is built over its visible-token structure, and every
        candidate repeated path is expanded into rows of (visible text,
        starting token location, HTML between consecutive rows).

        :param content: raw HTML string of the page to analyze
        :return: markup built by self.creat_row_markup() for the found lists
        """
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        # Only <div> tags are considered when building list candidates.
        (ptree, paths_to_vis_text,
         path_to_invis_toks) = self.prefix_tree(triples,
                                                only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists: map each candidate path to its rows.
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {'rows': []}

            # if as_path in paths_to_vis_text:
            # Substring match: the candidate path may be embedded in a
            # deeper path that actually carries the visible text.
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (
                                vis_texts[idx],
                                str(invis_toks[idx].token_location))
                        # HTML separating this row from the next one
                        # (empty for the final row).
                        html_between_row = ''
                        if (idx + 1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx + 1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(
                                begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text':
                            vis_texts[idx],
                            'starting_token_location':
                            invis_toks[idx].token_location,
                            'html_between_row':
                            html_between_row
                        })
            # NOTE(review): this serialization runs on every loop iteration
            # even when __DEBUG is off, though the result is only printed
            # below -- looks like debug-only work; consider guarding it.
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            # NOTE(review): the dumped JSON is discarded here -- a debug
            # print of json.dumps(markup) was presumably intended.
            json.dumps(markup)
        return markup