def run(self, page_dict):
    """Extract XPath candidates for the main article, the comments and
    multi-post regions of an analyzed page.

    Parameters
    ----------
    page_dict : dict
        Must contain "content" -- the raw page markup fed to SDAlgorithm.

    Returns
    -------
    tuple
        (article_xpath, comment_xpaths, post_xpaths); each slot is None
        when the corresponding region type was not detected.
    """
    article_xpath = comment_xpaths = post_xpaths = None

    sd = SDAlgorithm()
    sd.content = page_dict.get("content")
    # `page_type` instead of `type` to avoid shadowing the builtin.
    page_type, article, comments, multiple = sd.analyze_page()

    if page_type in ('article', 'comment'):
        article_xpath = element2path(sd.tree, article.root_node)
        # Create a few variants of the xpath.
        # NOTE(review): the standardized variants are computed but never
        # returned -- only the raw article_xpath is. Confirm whether
        # callers expect `article_xpaths` instead.
        article_xpaths = standardizexpath(sd.tree, article_xpath)

    if page_type == 'comment':
        comment_root_paths = [element2path(sd.tree, c.root_node)
                              for c in comments]
        # Look for the regularity in the comments' paths.
        comment_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(comment_root_paths)
        if diff_middle.isdigit():
            # Regularity found: build a template path and expand it into
            # a few xpath variants.
            template = common_beg + xpathops.STANDARD_REPLACEMENT_STRING + common_end
            comment_xpaths = findcommonstandardxpath(
                sd.tree, template, comment_root_paths)

    if page_type == 'multiple':
        posts_root_paths = [element2path(sd.tree, p.root_node)
                            for p in multiple]
        # Look for the regularity in the posts' paths.
        posts_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(posts_root_paths)
        if diff_middle.isdigit():
            template = common_beg + xpathops.STANDARD_REPLACEMENT_STRING + common_end
            post_xpaths = findcommonstandardxpath(
                sd.tree, template, posts_root_paths)

    return article_xpath, comment_xpaths, post_xpaths
    def find(self, params):
        """Locate the top-scoring candidate element(s) in the page tree.

        Scans input, anchor, button and image nodes, ranks them by score,
        and records the xpath(s) of every element tied for the best score.

        Parameters
        ----------
        params : unused; kept for interface compatibility.

        Returns
        -------
        list or None
            XPaths of the best-scoring elements, or None when no
            candidates exist.  Also sets self.nodepath, self.features and
            self.certainty as side effects.
        """
        tree = self.page_dict.get("tree")

        candidates = []
        candidates.extend(self.__searchInputNodes())
        candidates.extend(self.__searchAnchorNodes())
        candidates.extend(self.__searchButtonNodes())
        candidates.extend(self.__searchImageNodes())

        if not candidates:
            self.features = [0]
            self.certainty = 0
            self.nodepath = None
            return self.nodepath

        # Each candidate is (element, score); sort by score, descending.
        sorted_candidates = sorted(candidates, key=lambda c: -c[1])

        best = sorted_candidates[0]
        elements_list = [best]
        # BUG FIX: the original looped over the *unsorted* `candidates`
        # list with an early break, so elements tied with the top score
        # could be missed (the loop stopped at the first non-matching
        # unsorted entry).  Walking the sorted list collects all ties.
        for cand in sorted_candidates[1:]:
            if cand[1] == best[1]:
                elements_list.append(cand)
            else:
                break

        self.nodepath = [xpathops.element2path(tree, elem[0])
                         for elem in elements_list]
        self.features = [best[1]]
        self.certainty = 1 if best[1] >= 1 else 0
        return self.nodepath
 def run(self, page_dict):
     """Derive xpath candidates for the article, comment and post regions
     of the page held in ``page_dict["content"]``.

     Returns a ``(article_xpath, comment_xpaths, post_xpaths)`` triple;
     slots are ``None`` for region types that were not detected.
     """
     article_xpath = None
     comment_xpaths = None
     post_xpaths = None

     algo = SDAlgorithm()
     algo.content = page_dict.get("content")
     page_kind, article, comments, multiple = algo.analyze_page()

     if page_kind in ('article', 'comment'):
         article_xpath = element2path(algo.tree, article.root_node)
         # Build a few standardized variants of the xpath (currently not
         # part of the return value).
         article_xpaths = standardizexpath(algo.tree, article_xpath)

     if page_kind == 'comment':
         roots = [element2path(algo.tree, c.root_node) for c in comments]
         # Detect numeric regularity across the comment root paths.
         roots, beg, middle, end = stringops.find_difference_inside(roots)
         if middle.isdigit():
             template = beg + xpathops.STANDARD_REPLACEMENT_STRING + end
             comment_xpaths = findcommonstandardxpath(algo.tree, template,
                                                      roots)

     if page_kind == 'multiple':
         roots = [element2path(algo.tree, p.root_node) for p in multiple]
         # Detect numeric regularity across the post root paths.
         roots, beg, middle, end = stringops.find_difference_inside(roots)
         if middle.isdigit():
             template = beg + xpathops.STANDARD_REPLACEMENT_STRING + end
             post_xpaths = findcommonstandardxpath(algo.tree, template,
                                                   roots)

     return article_xpath, comment_xpaths, post_xpaths
    def classify_page(self):
        """
        Characterize the page according to i) has main article (has_article()),
        ii) has main article with comments (is_full_article()), iii) has multiple 
        opinions like a forum (is_discussion()).
        """
        validated = False

        [biggest_regions, grouped_comments] = self.group_regions()
        [article_exists, article] = self.has_article(biggest_regions)

        if article_exists:
            max_group = self.get_candidate_article(article, grouped_comments)

            if grouped_comments.has_key(max_group):
                if grouped_comments != {}:
                    validated = self.candidate_group_level_validated(
                        max_group, article, grouped_comments)

                context_validated = self.candidate_context_validated(
                    article, grouped_comments, max_group)
                if self.big_areas_in_same_level(article, grouped_comments,
                                                max_group) and not validated:
                    print Tcolors.INFO + " Multiple similar regions detected!"
                    print "Class: "
                    print Tcolors.RES + " " + grouped_comments[max_group][
                        0].class_name
                    print "Texts: "
                    for reg in grouped_comments[max_group]:
                        print element2path(reg.tree,
                                           reg.root_node), reg.full_text
                    return None, None, grouped_comments[max_group]
                elif not context_validated:
                    print
                    self.print_article(article)
                    print
                    print Tcolors.INFO + " No comments found."
                    return article, None, None
                elif context_validated:
                    print
                    print Tcolors.INFO + " Article with comments detected!"
                    self.print_article(article)
                    print
                    print "Comment class:"
                    print Tcolors.RES + " " + max_group
                    print "Comments:"
                    for com in grouped_comments[max_group]:
                        print element2path(com.tree,
                                           com.root_node), com.full_text
                    return article, grouped_comments[max_group], None
            else:
                self.print_article(article)
                return article, None, None
        else:
            print Tcolors.INFO + " Multiple similar regions detected!"
            print Tcolors.RES
            print "Texts: "
            for reg in biggest_regions:
                print element2path(reg.tree, reg.root_node), reg.full_text
            return None, None, biggest_regions
    def process_multiple_details(self, multiple):
        """Collect per-article details for a multi-article page.

        Returns three parallel lists: xpaths of the article root nodes,
        their densities, and their distances from the tree root.
        """
        root_paths = []
        densities = []
        distances = []

        for art in multiple:
            root_paths.append(element2path(self.sd.tree, art.root_node))
            densities.append(art.density)
            distances.append(art.distance_from_root)

        return root_paths, densities, distances
    def process_comment_details(self, comments):
        """Collect per-comment details.

        Returns three parallel lists: xpaths of the comment root nodes,
        their densities, and their distances from the root.
        """
        root_paths = []
        densities = []
        distances = []

        for com in comments:
            root_paths.append(element2path(self.sd.tree, com.root_node))
            densities.append(com.density)
            # NOTE(review): `disance_from_root` looks like a typo for
            # `distance_from_root` (cf. process_multiple_details) --
            # confirm the attribute name on the comment objects.
            distances.append(com.disance_from_root)

        return root_paths, densities, distances
# --- 示例#7 / Example #7 (scrape-artifact separator; stray "0" line) ---
    def process_multiple_details(self, multiple):
        """Gather xpath, density and root-distance for every article region."""
        paths, density_list, distance_list = [], [], []
        tree = self.sd.tree
        for article in multiple:
            paths.append(element2path(tree, article.root_node))
            density_list.append(article.density)
            distance_list.append(article.distance_from_root)
        return paths, density_list, distance_list
# --- 示例#8 / Example #8 (scrape-artifact separator; stray "0" line) ---
    def process_comment_details(self, comments):
        """Gather xpath, density and root-distance for every comment region."""
        paths, density_list, distance_list = [], [], []
        tree = self.sd.tree
        for comment in comments:
            paths.append(element2path(tree, comment.root_node))
            density_list.append(comment.density)
            # NOTE(review): `disance_from_root` is probably a typo for
            # `distance_from_root` -- verify against the comment class.
            distance_list.append(comment.disance_from_root)
        return paths, density_list, distance_list
    def verify_comments(self, comments, tree):
        comment_root_paths = []

        for com in comments:
            comment_root_paths.append(element2path(tree, com.root_node))

        comment_root_paths, common_beg, diff_middle, common_end = stringops.find_difference_inside(comment_root_paths)

        if diff_middle.isdigit():
            print 'Comments: regularity found'
            return True
        else:
            return False
# --- 示例#10 / Example #10 (scrape-artifact separator; stray "0" line) ---
    def verify_comments(self, comments, tree):
        comment_root_paths = []

        for com in comments:
            comment_root_paths.append(element2path(tree, com.root_node))

        comment_root_paths, common_beg, diff_middle, common_end = stringops.find_difference_inside(
            comment_root_paths)

        if diff_middle.isdigit():
            print 'Comments: regularity found'
            return True
        else:
            return False
    def verify_multiple_articles(self, mulart, url, tree, content):
        """Return True when the candidate regions really form a list of
        multiple articles (structure, hrefs and text similarity all agree).

        A False result can also mean "this is actually a single article"
        when only one root path survives deduplication.
        """
        root_paths = []
        texts = []
        for region in mulart:
            texts.append(region.full_text.encode('utf-8', 'ignore'))
            root_paths.append(element2path(tree, region.root_node))

        root_paths, texts = self.leave_roots_only(root_paths, texts)

        if len(root_paths) <= 1:
            # A single surviving root actually means: that's an article.
            return False

        # All three checks are evaluated up front (no short-circuit),
        # matching the original call pattern in case the helpers have
        # side effects.
        struct_ok = self.verify_multiple_articles_pagetags_structure(root_paths)
        href_ok = self.verify_multiple_article_hrefs(url, content)
        text_ok = self.verify_similar_text(texts, url)

        if struct_ok and href_ok and text_ok:
            return True
        return False
# --- 示例#12 / Example #12 (scrape-artifact separator; stray "0" line) ---
    def verify_multiple_articles(self, mulart, url, tree, content):
        """Validate that *mulart* regions make up a genuine multi-article
        page; False also covers the "really a single article" case."""
        texts = [m.full_text.encode('utf-8', 'ignore') for m in mulart]
        roots = [element2path(tree, m.root_node) for m in mulart]

        roots, texts = self.leave_roots_only(roots, texts)

        if len(roots) < 2:
            return False  # but actually it means: that's an article

        # Build the list eagerly so every check runs (no short-circuit),
        # preserving the original evaluation order and side effects.
        checks = [
            self.verify_multiple_articles_pagetags_structure(roots),
            self.verify_multiple_article_hrefs(url, content),
            self.verify_similar_text(texts, url),
        ]
        return True if all(checks) else False