示例#1
0
def save_markup():
    """Save the markup posted for a project, auto-learning it when the schema is empty.

    Expects a POST request whose JSON body carries ``project_folder`` and
    ``markup``; any other HTTP method aborts with 404.

    If the first ``__SCHEMA__`` entry of the posted markup has no children, the
    pages named by the ``__URLS__`` keys are read from the project folder,
    rules are learned with ``PageManager`` (plus ``DivListLearner`` when
    ``LEARN_LISTS`` is truthy), and both the markup and the schema children
    are regenerated from the learned rules.  Otherwise the posted markup is
    kept unchanged.  In both cases the resulting markup is written to
    ``<project>/learning/markup.json`` and returned as a JSON response.

    ``project_folder`` and the page names come from the request body, so every
    path built from them is verified to stay inside the project tree; a path
    that escapes it aborts with 400.
    """
    if request.method != 'POST':
        abort(404)

    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']

    # Resolve the project directory and refuse anything ('..', absolute paths)
    # that would land outside <static>/project_folders/.
    projects_root = os.path.abspath(os.path.join(app.static_folder, 'project_folders'))
    directory = os.path.abspath(os.path.join(projects_root, project_folder))
    if not directory.startswith(projects_root + os.sep):
        abort(400)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        # Templates for the tree nodes added to the schema; they are deep-copied
        # and given a fresh id/text for every learned slot.
        markup_slot = {
            "id": "j1_2",
            "text": "slot",
            "icon": "glyphicon glyphicon-stop",
            "li_attr": {
                "id": "j1_2"
            },
            "a_attr": {
                "href": "#",
                "id": "j1_2_anchor"
            },
            "state": {
                "loaded": True,
                "opened": False,
                "selected": False,
                "disabled": False
            },
            "data": {},
            "children": [],
            "type": "item"
        }

        list_slot = {
            "a_attr": {
                "href": "#",
                "id": "j1_3_anchor"
            },
            "children": [],
            "data": {},
            "icon": "glyphicon glyphicon-th-list",
            "id": "j1_3",
            "li_attr": {
                "id": "j1_3"
            },
            "state": {
                "disabled": False,
                "loaded": True,
                "opened": False,
                "selected": False
            },
            "text": "category",
            "type": "list"
        }

        def make_slot(template, number, text):
            # Copy a template node and stamp it with the j1_<number> ids.
            slot = copy.deepcopy(template)
            node_id = 'j1_' + str(number)
            slot['text'] = text
            slot['id'] = node_id
            slot['li_attr']['id'] = node_id
            slot['a_attr']['id'] = node_id + '_anchor'
            return slot

        pageManager = PageManager()
        test_pages = []
        for key in markup['__URLS__']:
            page_file = os.path.abspath(os.path.join(directory, key))
            # Page names are request data too: keep them inside the project.
            if not page_file.startswith(directory + os.sep):
                abort(400)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)
            test_pages.append(page_str)

        # Set aside the bookkeeping entries; they are re-attached to the
        # regenerated markup below.
        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        pageManager.learnStripes()
        list_markup = {}
        list_names = {}
        if LEARN_LISTS:
            (list_markup, list_names) = pageManager.learnListMarkups()

            # This is the div learning
            train_pages = dict(
                (page_id, pageManager.getPage(page_id).getString())
                for page_id in pageManager._pages
            )
            div_rules, div_markup = DivListLearner().run(train_pages)

            (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)

            # Carry the token locations found by the div learner over to the
            # markup generated from its rules.
            for page_id in div_markup:
                for item in div_markup[page_id]:
                    if item not in div_list_markup[page_id]:
                        continue
                    learned = div_markup[page_id][item]
                    target = div_list_markup[page_id][item]
                    if 'starting_token_location' in learned:
                        target['starting_token_location'] = learned['starting_token_location']
                    if 'ending_token_location' in learned:
                        target['ending_token_location'] = learned['ending_token_location']
                    if learned['sequence']:
                        for idx, val in enumerate(learned['sequence']):
                            if len(target['sequence']) <= idx:
                                target['sequence'].insert(idx, val)
                            else:
                                target['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                target['sequence'][idx]['ending_token_location'] = val['ending_token_location']

            # Now add these to the list_markup and list_names
            if len(div_rules.rules) > 0:
                for page_id in div_list_markup:
                    if page_id not in list_markup:
                        list_markup[page_id] = {}
                    list_markup[page_id].update(div_list_markup[page_id])
                list_names.update(div_list_names)

        rule_set = pageManager.learnAllRules()
        rule_set.removeBadRules(test_pages)

        (markup, names) = pageManager.rulesToMarkup(rule_set)

        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        # Node numbering continues across list slots, their children and the
        # item slots; the first generated node is j1_2.
        count = 1
        # Generate the schema from the list slots
        for list_name in list_names.keys():
            count += 1
            list_node = make_slot(list_slot, count, list_name)
            # now add the children to the auto learned list slot
            children = []
            for name in list_names[list_name]:
                count += 1
                children.append(make_slot(markup_slot, count, name))
            list_node['children'] = children
            schema[0]['children'].append(list_node)

        # Generate the schema from the item slots
        for name in names:
            count += 1
            schema[0]['children'].append(make_slot(markup_slot, count, name))

        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls

    # Posted or regenerated, the markup is persisted the same way; the
    # ``with`` block closes the file.
    with codecs.open(markup_file, "w", "utf-8") as myfile:
        myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

    return jsonify(markup)
    def lists_on_single_page(self, content):
        """Collect candidate lists found on a single page and build their row markup.

        The page is loaded into a fresh ``PageManager`` under the id ``"zzz"``.
        Its visible-token structure is turned into a prefix tree restricted to
        ``div`` tags, and every path produced by ``prefix_tree_to_paths`` is
        treated as a potential list.  For each potential list, every
        visible-text path containing the list's path string contributes one row
        per visible text: the text itself, the location of the matching
        invisible token, and the HTML between this row and the next one
        (empty for the last row of a path).

        :param content: the page content handed to ``PageManager.addPage``.
        :return: the result of ``self.creat_row_markup`` for the collected lists.
        """
        debug = self.__DEBUG

        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if debug:
            print('.... POTENTIAL LISTS ARE ....')
            print('\n'.join([''.join(p) for p in potential_lists]))
            print('.... OK!....')

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for pot_list in potential_lists:
            as_path = ''.join(pot_list)
            if debug:
                print("PATH: %s" % as_path)
            rows = []
            lists[as_path] = {'rows': rows}

            # Substring match: rows are gathered from every visible-text path
            # that contains this list's path, not only from an exact match.
            for path_to_vis in paths_to_vis_text:
                if as_path not in path_to_vis:
                    continue

                vis_texts = list(paths_to_vis_text[path_to_vis])
                invis_toks = list(path_to_invis_toks[path_to_vis])
                last_idx = len(vis_texts) - 1

                for idx, vis_text in enumerate(vis_texts):
                    start = invis_toks[idx].token_location
                    if debug:
                        print("%s ==> %s" % (vis_text, str(start)))

                    # The last row of a path has no following row, hence no
                    # HTML in between.
                    html_between_row = ''
                    if idx < last_idx:
                        end = invis_toks[idx + 1].token_location - 1
                        html_between_row = all_tokens_list.getTokensAsString(start, end, whitespace=True)

                    rows.append({
                        'visible_text': vis_text,
                        'starting_token_location': start,
                        'html_between_row': html_between_row
                    })

            # Serializing everything collected so far is only useful as debug
            # output, so it is only done when debugging.
            if debug:
                print("--------")
                print(json.dumps(lists))
                print("--------")

            # NOTE: an alternative that was sketched here but never enabled is
            # to extract each path with an IterationRule built from the path's
            # tags (assuming well-formed HTML) instead of walking the tokens.

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if debug:
            print("list markup")
            # Previously the dump was computed and thrown away; print it.
            print(json.dumps(markup))
        return markup
示例#3
0
class TruffleShuffle(object):
    """Cluster pages held in a ``PageManager`` by the chunks they share.

    Pages are loaded either from a JSON-lines file or from the files of a
    directory.  ``do_truffle_shuffle`` groups them into clusters keyed by a
    "rule": the chunks common to a group of pages, joined with the chunk
    separator returned by ``get_chunk_separator``.
    """

    # json lines file is of the CDR format
    def __init__(self, page_file_dir='/path/to/dir/', json_lines_file=None):
        """Load the pages to cluster.

        :param page_file_dir: directory whose regular, non-hidden files are
            loaded as pages (file name = page id) when ``json_lines_file`` is
            not given.
        :param json_lines_file: optional path to a file with one JSON object
            per line; ``doc_id`` is used as the page id and ``raw_content`` as
            the page string.  Lines that cannot be processed are reported and
            skipped.
        """
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()

        if json_lines_file:
            # The context manager guarantees the handle is released even if
            # reading fails half-way through.
            with codecs.open(json_lines_file, "r", "utf-8") as myfile:
                for count, line in enumerate(myfile, 1):
                    try:
                        json_object = json.loads(line)
                        the_file = json_object['doc_id']
                        page_str = json_object['raw_content']
                        self.__page_manager.addPage(the_file, page_str)
                    except Exception:
                        # Best effort: report the bad line and carry on, but
                        # let KeyboardInterrupt / SystemExit through.
                        print('Unable to process line %d' % count)
        else:
            for the_file in os.listdir(self.__page_file_dir):
                full_path = os.path.join(self.__page_file_dir, the_file)
                if not os.path.isfile(full_path) or the_file.startswith('.'):
                    continue

                with codecs.open(full_path, "rU", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')

                self.__page_manager.addPage(the_file, page_str)

    def get_chunk_separator(self):
        """Return the separator used to join the chunks of a rule string."""
        return self.__chunkBreakSeparator

    def get_page_manager(self):
        """Return the ``PageManager`` holding the loaded pages."""
        return self.__page_manager

    # so maybe you take random samples of 3 pages, and induce a template
    # if you find a template that is similar (or matches) most, then that is the template for this cluster?
    # or you could do a greedy build or something (e.g., add another page and if it doesn't change, you are good)
    def sample_and_learn_template(self, cluster_members, sub_sample_size=5, iterations=10, top_x=10):
        """Induce templates from random sub-samples and report the recurring ones.

        On every iteration the members are shuffled, a template is induced
        from the first ``sub_sample_size`` of them, and its ``top_x`` longest
        stripes are joined into one string.  Every such string that came up in
        more than one iteration is printed with its number of occurrences.

        :param cluster_members: page ids to sample from; not modified.
        :param sub_sample_size: number of pages per sample.
        :param iterations: number of samples to draw.
        :param top_x: number of longest stripes kept per template.
        :return: None; results are printed.
        """
        # Shuffle a private copy so the caller's list keeps its order.
        members = list(cluster_members)
        stripes = []
        for _ in range(iterations):
            shuffle(members)  # randomly orders them
            template = self.induce_template(members[:sub_sample_size])

            stripe_texts = [stripe['stripe'] for stripe in template]

            # now, only keep the top X longest stripes and see what it does...
            # sorted() is stable, so stripes of equal length keep their
            # template order.
            kept_big_stripes = sorted(stripe_texts, key=len, reverse=True)[:top_x]
            stripes.append(self.__chunkBreakSeparator.join(kept_big_stripes))

        # Single counting pass instead of list.count() per entry.
        template_occurrences = {}
        for tstr in stripes:
            template_occurrences[tstr] = template_occurrences.get(tstr, 0) + 1

        for sstring, occurrences in template_occurrences.items():
            if occurrences > 1:
                print("Template: %s" % sstring[:250])  # just a little bit
                print("Induced template occurs %d out of %d" % (occurrences, iterations))

    def induce_template(self, cluster_members):
        """Learn and return the stripes shared by the given pages.

        :param cluster_members: ids of pages already loaded in this object's
            page manager.
        :return: the result of ``getStripes()`` on a page manager holding
            only those pages.
        """
        sub_page_mgr = PageManager()
        for page_id in cluster_members:
            curr_page = self.__page_manager.getPage(page_id)
            sub_page_mgr.addPage(page_id, curr_page.string)
        sub_page_mgr.learnStripes()
        return sub_page_mgr.getStripes()

    def prep_truffles_to_shuffle(self):
        """Collect the chunks of every page and drop the uninformative ones.

        A chunk is dropped when it occurs on fewer than 10 pages or on every
        page.  The dropped chunks are removed in place from each page's chunk
        collection as returned by the page manager.

        :return: ``(all_chunks, page_chunks_map)`` - the set of kept chunks
            and a dict mapping page id to that page's kept chunks.
        """
        page_ids = list(self.__page_manager.getPageIds())
        all_pages_sz = len(page_ids)

        # One pass over the pages yields both the per-page chunks and the
        # number of pages each chunk appears on.
        page_chunks_map = {}
        pages_per_chunk = {}
        for page_id in page_ids:
            page_chunks = self.__page_manager.getPageChunks(page_id)
            page_chunks_map[page_id] = page_chunks
            for chunk in page_chunks:
                pages_per_chunk[chunk] = pages_per_chunk.get(chunk, 0) + 1

        chunks_to_remove = set()
        all_chunks = set()
        for chunk, num_pages_with_chunk in pages_per_chunk.items():
            if num_pages_with_chunk < 10 or num_pages_with_chunk == all_pages_sz:
                chunks_to_remove.add(chunk)
            else:
                all_chunks.add(chunk)

        for page_id in page_ids:
            page_chunks_map[page_id].difference_update(chunks_to_remove)

        return all_chunks, page_chunks_map

    def do_truffle_shuffle(self, algorithm='coverage'):
        """Cluster pages according to "rules".

        A "rule" is a list of chunks, and a "chunk" is a section of a Web page
        that is visible to a user.  For every kept chunk (the anchor), the rule
        is the set of chunks shared by all pages containing that anchor; rules
        with fewer than two chunks are ignored.

        :param algorithm:
            ``'coverage'``: cluster by the number of pages covered by a rule,
            small to big (more specific to less);
            anything else (e.g. ``'rule_size'``): cluster by the size of the
            rule, from long rules to short rules.
        :return: ``dict[rule] = {'MEMBERS': list of page ids, 'ANCHOR': the
            anchoring chunk for this cluster}``.  Each rule is a string of
            ``chunk_1<BRK>chunk_2<BRK>...<BRK>chunk_N``; split it on the value
            of ``get_chunk_separator()`` to get the chunks back.  A page
            belongs to at most one cluster, and clusters left with fewer than
            two pages are not reported.
        """
        separator = self.__chunkBreakSeparator
        all_chunks, page_chunks_map = self.prep_truffles_to_shuffle()
        page_ids = list(self.__page_manager.getPageIds())

        chunk_counts = {}
        rule_anchors = {}
        for chunk in all_chunks:
            pages_with_chunk = [p for p in page_ids if chunk in page_chunks_map[p]]
            if not pages_with_chunk:
                continue

            other_chunks = set()
            other_chunks.update(page_chunks_map[pages_with_chunk[0]])
            for page_id in pages_with_chunk:
                other_chunks.intersection_update(page_chunks_map[page_id])

            # one token is not enough, enforce that there are at least 2...
            if len(other_chunks) > 1:
                rule = separator.join(other_chunks)
                # When several anchors produce the same rule string, the last
                # one visited is the one recorded.
                chunk_counts[rule] = pages_with_chunk
                rule_anchors[rule] = chunk

        if algorithm == 'coverage':
            counts = dict((rule, len(chunk_counts[rule])) for rule in chunk_counts)
        else:
            # count by the size of the rule, but prefer longer: sizes are
            # negated so the ascending sort below visits longer rules first.
            counts = dict((rule, -len(rule.split(separator))) for rule in chunk_counts)

        inverted = {}
        for rl in counts:
            inverted.setdefault(counts[rl], []).append(rl)

        final_clusters = {}
        already_clustered = set()  # set membership instead of a list scan
        for size in sorted(inverted):
            for rule in inverted[size]:
                pids = [p for p in chunk_counts[rule] if p not in already_clustered]
                already_clustered.update(pids)
                if len(pids) > 1:
                    final_clusters[rule] = {
                        'MEMBERS': pids,
                        'ANCHOR': rule_anchors[rule]
                    }

        return final_clusters
示例#4
0
    def lists_on_single_page(self, content):
        """Collect candidate lists found on a single page and build their row markup.

        The page is loaded into a fresh ``PageManager`` under the id ``"zzz"``.
        Its visible-token structure is turned into a prefix tree restricted to
        ``div`` tags, and every path produced by ``prefix_tree_to_paths`` is
        treated as a potential list.  For each potential list, every
        visible-text path containing the list's path string contributes one row
        per visible text: the text itself, the location of the matching
        invisible token, and the HTML between this row and the next one
        (empty for the last row of a path).

        :param content: the page content handed to ``PageManager.addPage``.
        :return: the result of ``self.creat_row_markup`` for the collected lists.
        """
        debug = self.__DEBUG

        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if debug:
            print('.... POTENTIAL LISTS ARE ....')
            print('\n'.join([''.join(p) for p in potential_lists]))
            print('.... OK!....')

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for pot_list in potential_lists:
            as_path = ''.join(pot_list)
            if debug:
                print("PATH: %s" % as_path)
            rows = []
            lists[as_path] = {'rows': rows}

            # Substring match: rows are gathered from every visible-text path
            # that contains this list's path, not only from an exact match.
            for path_to_vis in paths_to_vis_text:
                if as_path not in path_to_vis:
                    continue

                vis_texts = list(paths_to_vis_text[path_to_vis])
                invis_toks = list(path_to_invis_toks[path_to_vis])
                last_idx = len(vis_texts) - 1

                for idx, vis_text in enumerate(vis_texts):
                    start = invis_toks[idx].token_location
                    if debug:
                        print("%s ==> %s" % (vis_text, str(start)))

                    # The last row of a path has no following row, hence no
                    # HTML in between.
                    html_between_row = ''
                    if idx < last_idx:
                        end = invis_toks[idx + 1].token_location - 1
                        html_between_row = all_tokens_list.getTokensAsString(start, end, whitespace=True)

                    rows.append({
                        'visible_text': vis_text,
                        'starting_token_location': start,
                        'html_between_row': html_between_row
                    })

            # Serializing everything collected so far is only useful as debug
            # output, so it is only done when debugging.
            if debug:
                print("--------")
                print(json.dumps(lists))
                print("--------")

            # NOTE: an alternative that was sketched here but never enabled is
            # to extract each path with an IterationRule built from the path's
            # tags (assuming well-formed HTML) instead of walking the tokens.

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if debug:
            print("list markup")
            # Previously the dump was computed and thrown away; print it.
            print(json.dumps(markup))
        return markup