def __nodes_containing_attrs(self,
                                 root="",
                                 dict_entries=None,
                                 elem="",
                                 attr=None,
                                 lang='en'):
        '''
        Find elements having an attribute value that matches dictionary words

        An XPath of the form <root>//<elem>[@*[contains(.,w1) or ...]] is
        built from the lower-case and title-case variants of every word
        listed for dict_entries in the lang dictionary (plus the English
        dictionary when lang is not 'en') and evaluated on the page tree.
        The hits are then narrowed down to the nodes with at least one
        attribute value matching the regex returned by regexofdictentries.

        root         -- XPath prefix the search is anchored to ("" searches
                        from the top of the tree)
        dict_entries -- keys looked up in the language dictionaries
        elem         -- element name to match; empty or None means "*"
        attr         -- accepted for call compatibility only; every attribute
                        ("@*") is always inspected
        lang         -- language code selecting the dictionary in lang_dicts

        Returns None when dict_entries is empty, otherwise a list of nodes
        (empty when no word is known for the entries or nothing matches).
        '''
        if not dict_entries:
            return None

        tree = self.page_dict.get("tree")

        # An empty element name would yield the invalid expression "//[",
        # so the default "" is treated like None and replaced by a wildcard.
        if not elem:
            elem = "*"

        # Unknown languages / entries simply contribute no words.
        lang_dict = lang_dicts.get(lang) or {}
        en_dict = lang_dicts.get('en') or {}

        words_set = set()
        for entry in dict_entries:
            words_set.update(lang_dict.get(entry) or ())
            if lang != 'en':
                words_set.update(en_dict.get(entry) or ())

        # Without words the predicate would be empty and the XPath malformed.
        if not words_set:
            return []

        def xpath_literal(text):
            # XPath 1.0 string literals have no escape character: choose the
            # quote that does not occur in the text, or glue pieces together
            # with concat() when both kinds occur.
            if "'" not in text:
                return "'" + text + "'"
            if '"' not in text:
                return '"' + text + '"'
            return "concat('" + "',\"'\",'".join(text.split("'")) + "')"

        tests = []
        for word in words_set:
            tests.append("contains(.," + xpath_literal(word.lower()) + ")")
            tests.append("contains(.," + xpath_literal(word.title()) + ")")

        xpath = root + "//" + elem + "[@*[" + " or ".join(tests) + "]]"

        nodes = tree.xpath(xpath, namespaces=namespaces)
        if not nodes:
            return nodes

        # contains() is only a coarse substring pre-filter; keep the nodes
        # that also have an attribute value accepted by the dictionary regex.
        regexdictentries = regexofdictentries(dict_entries, lang=lang)
        return [
            node for node in nodes
            if any(re.search(regexdictentries, value)
                   for value in node.values())
        ]
 def find(self,params):
     '''
     Count the "PRICE" dictionary matches in text nodes and attribute values

     The text strings and attribute values of the page are taken from
     self.page_dict when already cached there, otherwise they are extracted
     from the tree and cached. self.features is set to
     [matches in text nodes, matches in attribute values]. The certainty
     (1 when both counts add up to more than 10, else 0) is stored in
     self.certainty and returned. params is not used.
     '''
     page = self.page_dict
     tree = page.get("tree")
     lang = page.get("lang")

     # text strings of the page (shared through page_dict)
     if "text_nodes" not in page:
         texts, paths = tree2textnodeslist(tree)
         page["text_nodes"] = texts
         page["text_nodes_paths"] = paths
     else:
         texts = page["text_nodes"]
         # the paths are fetched alongside the texts but not used below
         paths = page["text_nodes_paths"]

     # attribute values of the page (shared through page_dict)
     if "attr_vals" not in page:
         values = tree2attributevalslist(tree)
         page["attr_vals"] = values
     else:
         values = page["attr_vals"]

     pattern = regexofdictentries(["PRICE"], lang)

     text_hits = sum(1 for text in texts if re.search(pattern, text))
     attr_hits = sum(1 for val in values if re.search(pattern, val))

     self.features = [text_hits, attr_hits]

     verdict = 1 if text_hits + attr_hits > 10 else 0
     self.certainty = verdict
     return verdict
예제 #3
0
    def find(self, params):
        '''
        Decide whether the page mentions prices often enough

        Text strings and attribute values come from self.page_dict (they are
        extracted from the tree and cached there on first use). Every string
        matching the regex built for the "PRICE" dictionary entry is counted;
        self.features becomes [text matches, attribute matches]. Returns 1
        when the two counts sum to more than 10, otherwise 0; the same value
        is stored in self.certainty. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")

        # all text strings of the page
        has_cached_text = "text_nodes" in self.page_dict
        if has_cached_text:
            strings = self.page_dict["text_nodes"]
            # read together with the strings, not needed afterwards
            string_paths = self.page_dict["text_nodes_paths"]
        else:
            strings, string_paths = tree2textnodeslist(tree)
            self.page_dict["text_nodes"] = strings
            self.page_dict["text_nodes_paths"] = string_paths

        # all attribute values of the page
        has_cached_attrs = "attr_vals" in self.page_dict
        if has_cached_attrs:
            attribute_values = self.page_dict["attr_vals"]
        else:
            attribute_values = tree2attributevalslist(tree)
            self.page_dict["attr_vals"] = attribute_values

        price_pattern = regexofdictentries(["PRICE"], lang)

        def count_hits(candidates):
            # number of strings in which the price regex is found
            return len([s for s in candidates if re.search(price_pattern, s)])

        in_text = count_hits(strings)
        in_attrs = count_hits(attribute_values)

        self.features = [in_text, in_attrs]

        if in_text + in_attrs > 10:
            self.certainty = 1
            return 1

        self.certainty = 0
        return 0
 def __nodes_containing_attrs(self,root="",dict_entries=None,elem="",attr=None,lang='en'):
     '''
     Find elements having an attribute value that matches dictionary words

     An XPath of the form <root>//<elem>[@*[contains(.,w1) or ...]] is
     built from the lower-case and title-case variants of every word
     listed for dict_entries in the lang dictionary (plus the English
     dictionary when lang is not 'en') and evaluated on the page tree.
     The hits are then narrowed down to the nodes with at least one
     attribute value matching the regex returned by regexofdictentries.

     root         -- XPath prefix the search is anchored to ("" searches
                     from the top of the tree)
     dict_entries -- keys looked up in the language dictionaries
     elem         -- element name to match; empty or None means "*"
     attr         -- accepted for call compatibility only; every attribute
                     ("@*") is always inspected
     lang         -- language code selecting the dictionary in lang_dicts

     Returns None when dict_entries is empty, otherwise a list of nodes
     (empty when no word is known for the entries or nothing matches).
     '''
     if not dict_entries:
         return None

     tree = self.page_dict.get("tree")

     # An empty element name would yield the invalid expression "//[",
     # so the default "" is treated like None and replaced by a wildcard.
     if not elem:
         elem = "*"

     # Unknown languages / entries simply contribute no words.
     lang_dict = lang_dicts.get(lang) or {}
     en_dict = lang_dicts.get('en') or {}

     words_set = set()
     for entry in dict_entries:
         words_set.update(lang_dict.get(entry) or ())
         if lang != 'en':
             words_set.update(en_dict.get(entry) or ())

     # Without words the predicate would be empty and the XPath malformed.
     if not words_set:
         return []

     def xpath_literal(text):
         # XPath 1.0 string literals have no escape character: choose the
         # quote that does not occur in the text, or glue pieces together
         # with concat() when both kinds occur.
         if "'" not in text:
             return "'" + text + "'"
         if '"' not in text:
             return '"' + text + '"'
         return "concat('" + "',\"'\",'".join(text.split("'")) + "')"

     tests = []
     for word in words_set:
         tests.append("contains(.," + xpath_literal(word.lower()) + ")")
         tests.append("contains(.," + xpath_literal(word.title()) + ")")

     xpath = root + "//" + elem + "[@*[" + " or ".join(tests) + "]]"

     nodes = tree.xpath(xpath,namespaces=namespaces)
     if not nodes:
         return nodes

     # contains() is only a coarse substring pre-filter; keep the nodes
     # that also have an attribute value accepted by the dictionary regex.
     regexdictentries = regexofdictentries(dict_entries, lang=lang)
     return [node for node in nodes
             if any(re.search(regexdictentries,value) for value in node.values())]
예제 #5
0
    def find(self, params):
        '''
        Count the anchor texts that mention the "WISHLIST" dictionary entry

        The anchor text strings are read from self.page_dict when cached
        there, otherwise extracted with tree2textnodeslist(tree, element="a")
        and cached. self.features is set to [number of matching strings].
        The certainty - 1 for more than one match, 0.5 for exactly one match,
        0 for none - is stored in self.certainty and returned.
        params is not used.
        '''
        page = self.page_dict
        tree = page.get("tree")
        lang = page.get("lang")

        # text strings found inside anchors (shared through page_dict)
        if "a_text_nodes" not in page:
            anchor_texts, anchor_paths = tree2textnodeslist(tree, element="a")
            page["a_text_nodes"] = anchor_texts
            page["a_text_nodes_paths"] = anchor_paths
        else:
            anchor_texts = page["a_text_nodes"]
            # fetched alongside the texts but not used below
            anchor_paths = page["a_text_nodes_paths"]

        pattern = regexofdictentries(["WISHLIST"], lang)

        hits = sum(1 for text in anchor_texts if re.search(pattern, text))

        self.features = [hits]

        if hits < 1:
            verdict = 0
        elif hits == 1:
            verdict = 0.5
        else:
            verdict = 1

        self.certainty = verdict
        return verdict
예제 #6
0
    def find(self, params):
        '''
        Look for the shipping and returns information in the anchor texts

        Counts the text strings obtained with
        tree2textnodeslist(tree, element="a") that match the regex built for
        the "SHIPPING" and "RETURNS" dictionary entries of the page language.
        self.features is set to [count]. The certainty - 1 for more than 3
        matches, 0.5 for 2 or 3 matches, 0 otherwise - is stored in
        self.certainty and returned. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")

        # The strings are restricted to anchors (element="a"), so they are
        # cached under the anchor-specific keys. Storing them under the
        # generic "text_nodes" key would hand anchor-only text to readers
        # expecting the unfiltered text nodes, and would make this method
        # count unfiltered text whenever that cache was filled first.
        if "a_text_nodes" in self.page_dict:
            a_text_nodes = self.page_dict["a_text_nodes"]
        else:
            a_text_nodes, a_text_nodes_paths = tree2textnodeslist(
                tree, element="a")
            self.page_dict["a_text_nodes"] = a_text_nodes
            self.page_dict["a_text_nodes_paths"] = a_text_nodes_paths

        dict_entries = ["SHIPPING", "RETURNS"]
        shipregex = regexofdictentries(dict_entries, lang)

        counter1 = sum(1 for text in a_text_nodes
                       if re.search(shipregex, text))

        self.features = [counter1]

        if counter1 > 3:
            result = 1
        elif counter1 > 1:
            result = 0.5
        else:
            result = 0

        self.certainty = result
        return result
 def find(self,params):
     '''
     Rate the page by how many anchor texts mention the "WISHLIST" entry

     Anchor text strings come from self.page_dict; on first use they are
     extracted with tree2textnodeslist(tree,element="a") and cached there.
     The number of strings matching the dictionary regex is stored as
     self.features = [count]. Returns (and stores in self.certainty) 1 for
     more than one match, 0.5 for exactly one match and 0 for no match.
     params is not used.
     '''
     tree = self.page_dict.get("tree")
     lang = self.page_dict.get("lang")

     # text strings found inside anchors
     already_cached = "a_text_nodes" in self.page_dict
     if already_cached:
         link_texts = self.page_dict["a_text_nodes"]
         # read together with the texts, not needed afterwards
         link_paths = self.page_dict["a_text_nodes_paths"]
     else:
         link_texts, link_paths = tree2textnodeslist(tree,element="a")
         self.page_dict["a_text_nodes"] = link_texts
         self.page_dict["a_text_nodes_paths"] = link_paths

     wish_pattern = regexofdictentries(["WISHLIST"], lang)

     matching = [text for text in link_texts if re.search(wish_pattern,text)]
     found = len(matching)

     self.features = [found]

     if found > 1:
         self.certainty = 1
         return 1
     if found == 1:
         self.certainty = 0.5
         return 0.5

     self.certainty = 0
     return 0
 def find(self,params):
     '''
     Look for the shipping and returns information in the anchor texts

     Counts the text strings obtained with
     tree2textnodeslist(tree,element="a") that match the regex built for
     the "SHIPPING" and "RETURNS" dictionary entries of the page language.
     self.features is set to [count]. The certainty - 1 for more than 3
     matches, 0.5 for 2 or 3 matches, 0 otherwise - is stored in
     self.certainty and returned. params is not used.
     '''
     tree = self.page_dict.get("tree")
     lang = self.page_dict.get("lang")

     # The strings are restricted to anchors (element="a"), so they are
     # cached under the anchor-specific keys. Storing them under the
     # generic "text_nodes" key would hand anchor-only text to readers
     # expecting the unfiltered text nodes, and would make this method
     # count unfiltered text whenever that cache was filled first.
     if "a_text_nodes" in self.page_dict:
         a_text_nodes = self.page_dict["a_text_nodes"]
     else:
         a_text_nodes, a_text_nodes_paths = tree2textnodeslist(tree,element="a")
         self.page_dict["a_text_nodes"] = a_text_nodes
         self.page_dict["a_text_nodes_paths"] = a_text_nodes_paths

     dict_entries = ["SHIPPING","RETURNS"]
     shipregex = regexofdictentries(dict_entries, lang)

     counter1 = sum(1 for text in a_text_nodes if re.search(shipregex,text))

     self.features = [counter1]

     if counter1 > 3:
         result = 1
     elif counter1 > 1:
         result = 0.5
     else:
         result = 0

     self.certainty = result
     return result
    def find(self,params):
        '''
        Locate the node that most likely is the shopping bag / cart link

        Candidates are the elements with an attribute value matching the
        "ECOM_CART" / "ECOM_CHECKOUT" dictionary entries. Oversized
        candidates, candidates classified as add-to-cart buttons and
        candidates rejected by __isbagroot are discarded; the remaining ones
        are scored and the first best-scoring candidate is selected.

        Sets self.features to [points of the selected candidate] ([0] when no
        candidate is left), self.nodepath to its path when it scored at
        least 2 points (else None) and self.certainty to points/5.0, capped
        at 1.0 above 5 points (0.0 when no candidate is left).

        Returns self.nodepath. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")
        cart_entries = ["ECOM_CART","ECOM_CHECKOUT"]

        # 1. Find all elements with matching attribute values
        nodes = self.__nodes_containing_attrs(elem='*', lang=lang, dict_entries=cart_entries) or []

        baglink_candidates_list = []
        for node in nodes:
            cand = BagLinkCandidate()
            cand.node = node
            cand.nodepath = tree.getpath(node)
            cand.points = 0
            baglink_candidates_list.append(cand)

        # The filters run one after the other over the whole list so that
        # each check only sees the survivors of the previous one.
        # 2a. Remove ones with element descendants five levels down
        baglink_candidates_list = [cand for cand in baglink_candidates_list
                                   if not tree.xpath(cand.nodepath+"/*/*/*/*/*")]
        # 2b. Remove ones that have 20 or more descendant elements
        baglink_candidates_list = [cand for cand in baglink_candidates_list
                                   if tree.xpath("count("+cand.nodepath+"//*)")<20]
        # 2c. Remove the ones that look like add-to-cart buttons
        addclassif = AddToBasketButtonClassifier()
        baglink_candidates_list = [cand for cand in baglink_candidates_list
                                   if not addclassif.classifygivennode(self.page_dict, cand.nodepath)]
        # 2d. Remove the ones that have 10 or more words in text nodes inside
        baglink_candidates_list = [cand for cand in baglink_candidates_list
                                   if sum(len(text.split()) for text in tree.xpath(cand.nodepath+"//text()"))<10]
        # 2e. Remove ones that are children of others found
        baglink_candidates_list = [cand for cand in baglink_candidates_list
                                   if self.__isbagroot(cand,baglink_candidates_list)]

        if not baglink_candidates_list:
            # Reset every output so that neither a stale result of an earlier
            # run nor a missing attribute leaks out of this call.
            self.features = [0]
            self.nodepath = None
            self.certainty = 0.0
            return self.nodepath

        # The text patterns do not depend on the candidate: build them once.
        regex1 = regexofdictentries(entries=cart_entries,lang=lang)
        regex2 = regexofdictentries(entries=["ECOM_ITEM"],lang=lang)
        regex3 = r"\d+"

        for cand in baglink_candidates_list:

            # 3. If element is anchor, add points
            if cand.node.tag == 'a':
                cand.points += points[0]
            # 4. If element has anchor descendants, add points
            if tree.xpath(cand.nodepath+"//a",namespaces=namespaces):
                cand.points += points[1]
            # 5. If element has image descendants, add points
            if tree.xpath(cand.nodepath+"//img",namespaces=namespaces):
                cand.points += points[2]
            # 6. If element has descendants with attrs, add points
            if self.__nodes_containing_attrs(root=cand.nodepath, elem="*", lang=lang, dict_entries=cart_entries):
                cand.points += points[3]
            # 7. If element has anchor descendants with attrs, add points
            if self.__nodes_containing_attrs(root=cand.nodepath, elem="a", lang=lang, dict_entries=cart_entries):
                cand.points += points[4]

            text_nodes = tree.xpath(cand.nodepath+"//text()")
            text_nodes = [text.strip() for text in text_nodes if text.strip()]

            # 8.-10. One point for each kind of text found among the
            # descendants: cart/checkout words, item words, digits
            for regex in (regex1, regex2, regex3):
                if any(re.search(regex,text) for text in text_nodes):
                    cand.points += 1

        # max() returns the first candidate holding the highest score, which
        # is the element a stable descending sort would put first.
        cand = max(baglink_candidates_list, key=lambda x: x.points)

        self.features = [cand.points]
        self.nodepath = cand.nodepath if cand.points>=2 else None
        self.certainty = 1.0 if cand.points>5 else cand.points/5.0

        return self.nodepath
    def find(self, params):
        '''
        Locate the node that most likely is the shopping bag / cart link

        Candidates are the elements with an attribute value matching the
        "ECOM_CART" / "ECOM_CHECKOUT" dictionary entries. Oversized
        candidates, candidates classified as add-to-cart buttons and
        candidates rejected by __isbagroot are discarded; the remaining ones
        are scored and the first best-scoring candidate is selected.

        Sets self.features to [points of the selected candidate] ([0] when no
        candidate is left), self.nodepath to its path when it scored at
        least 2 points (else None) and self.certainty to points / 5.0, capped
        at 1.0 above 5 points (0.0 when no candidate is left).

        Returns self.nodepath. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")
        cart_entries = ["ECOM_CART", "ECOM_CHECKOUT"]

        # 1. Find all elements with matching attribute values
        nodes = self.__nodes_containing_attrs(
            elem='*', lang=lang, dict_entries=cart_entries) or []

        baglink_candidates_list = []
        for node in nodes:
            cand = BagLinkCandidate()
            cand.node = node
            cand.nodepath = tree.getpath(node)
            cand.points = 0
            baglink_candidates_list.append(cand)

        # The filters run one after the other over the whole list so that
        # each check only sees the survivors of the previous one.
        # 2a. Remove ones with element descendants five levels down
        baglink_candidates_list = [
            cand for cand in baglink_candidates_list
            if not tree.xpath(cand.nodepath + "/*/*/*/*/*")
        ]
        # 2b. Remove ones that have 20 or more descendant elements
        baglink_candidates_list = [
            cand for cand in baglink_candidates_list
            if tree.xpath("count(" + cand.nodepath + "//*)") < 20
        ]
        # 2c. Remove the ones that look like add-to-cart buttons
        addclassif = AddToBasketButtonClassifier()
        baglink_candidates_list = [
            cand for cand in baglink_candidates_list
            if not addclassif.classifygivennode(self.page_dict, cand.nodepath)
        ]
        # 2d. Remove the ones that have 10 or more words in text nodes inside
        baglink_candidates_list = [
            cand for cand in baglink_candidates_list if sum(
                len(text.split())
                for text in tree.xpath(cand.nodepath + "//text()")) < 10
        ]
        # 2e. Remove ones that are children of others found
        baglink_candidates_list = [
            cand for cand in baglink_candidates_list
            if self.__isbagroot(cand, baglink_candidates_list)
        ]

        if not baglink_candidates_list:
            # Reset every output so that neither a stale result of an earlier
            # run nor a missing attribute leaks out of this call.
            self.features = [0]
            self.nodepath = None
            self.certainty = 0.0
            return self.nodepath

        # The text patterns do not depend on the candidate: build them once.
        regex1 = regexofdictentries(entries=cart_entries, lang=lang)
        regex2 = regexofdictentries(entries=["ECOM_ITEM"], lang=lang)
        regex3 = r"\d+"

        for cand in baglink_candidates_list:

            # 3. If element is anchor, add points
            if cand.node.tag == 'a':
                cand.points += points[0]
            # 4. If element has anchor descendants, add points
            if tree.xpath(cand.nodepath + "//a", namespaces=namespaces):
                cand.points += points[1]
            # 5. If element has image descendants, add points
            if tree.xpath(cand.nodepath + "//img", namespaces=namespaces):
                cand.points += points[2]
            # 6. If element has descendants with attrs, add points
            if self.__nodes_containing_attrs(
                    root=cand.nodepath,
                    elem="*",
                    lang=lang,
                    dict_entries=cart_entries):
                cand.points += points[3]
            # 7. If element has anchor descendants with attrs, add points
            if self.__nodes_containing_attrs(
                    root=cand.nodepath,
                    elem="a",
                    lang=lang,
                    dict_entries=cart_entries):
                cand.points += points[4]

            text_nodes = tree.xpath(cand.nodepath + "//text()")
            text_nodes = [text.strip() for text in text_nodes if text.strip()]

            # 8.-10. One point for each kind of text found among the
            # descendants: cart/checkout words, item words, digits
            for regex in (regex1, regex2, regex3):
                if any(re.search(regex, text) for text in text_nodes):
                    cand.points += 1

        # max() returns the first candidate holding the highest score, which
        # is the element a stable descending sort would put first.
        cand = max(baglink_candidates_list, key=lambda x: x.points)

        self.features = [cand.points]
        self.nodepath = cand.nodepath if cand.points >= 2 else None
        self.certainty = 1.0 if cand.points > 5 else cand.points / 5.0

        return self.nodepath