示例#1
0
    def __init__(self,
                 contents_list,
                 namenode_list,
                 buttonnode_list=None,
                 basketnode_list=None):
        '''
        Parse every page and collect its text nodes.

        contents_list   - content of n pages of the site offering the product;
                          each entry is converted with xpathops.content2tree
        namenode_list   - stored on the instance as given
        buttonnode_list - stored on the instance as given (optional)
        basketnode_list - stored on the instance as given (optional)

        The first parsed tree is indexed directly, so contents_list needs at
        least one entry.
        '''
        self.__contents_list = contents_list
        parsed_trees = [xpathops.content2tree(page) for page in contents_list]
        self.__trees_list = parsed_trees

        self.__namenode_list = namenode_list
        self.__buttonnode_list = buttonnode_list
        self.__basketnode_list = basketnode_list

        # text nodes and their paths for the first page only
        primary = xpathops.tree2textnodeslist(parsed_trees[0])
        self.__prim_text_nodes, self.__prim_text_nodes_paths = primary

        # text nodes and their paths for every page, index-aligned with the trees
        per_page = [xpathops.tree2textnodeslist(page_tree)
                    for page_tree in parsed_trees]
        self.__text_nodes_list = [nodes for nodes, _paths in per_page]
        self.__text_nodes_paths_list = [paths for _nodes, paths in per_page]
 def find(self,params):
     '''
     Pick the text node(s) most likely to hold the product name.

     params is iterated as the list of title words (ideally the unique words
     from the non-repeating part of the title). Text nodes whose path is
     rejected by __is_anchor_descendant are skipped; the remaining nodes that
     share at least one word with the title are scored, and the best-scoring
     candidate is passed to __check_solver_surroundings together with all
     candidates.

     Sets self.nodepath to the sorted result and returns True when at least
     one candidate exists; otherwise sets self.nodepath to None and returns
     False.
     '''
     # title words with everything but letters, digits and spaces removed
     self.__title_words = [
         ''.join(ch for ch in raw if ch.isalnum() or ch == ' ')
         for raw in params
     ]

     cache = self.page_dict
     tree = self.tree = cache.get("tree")

     # reuse the cached text nodes when there are any, otherwise build and cache them
     if not cache.get("text_nodes"):
         cache["text_nodes"], cache["text_nodes_paths"] = tree2textnodeslist(tree)
     text_nodes = cache["text_nodes"]
     text_nodes_paths = cache["text_nodes_paths"]

     # second pass over params: this copy also drops spaces
     title_words = [''.join(ch for ch in raw if ch.isalnum()) for raw in params]

     candidates = []
     for idx, raw_text in enumerate(text_nodes):
         node_path = text_nodes_paths[idx]
         if self.__is_anchor_descendant(node_path):
             continue

         lowered = raw_text.lower()
         node = ''.join(ch for ch in lowered if ch.isalnum() or ch == ' ')
         node_words = node.split()
         if not any(word in node_words for word in title_words):
             continue

         # coefficient for the title words present in the node
         coef3 = self.__count_title_words_coef(node_words)
         # coefficient for an <h1> ancestor
         coef1 = self.__get_h1_ancestor_coef(node_path)
         # coefficient for a <div> ancestor with defined attribute values
         coef2 = self.__get_div_ancestor_attrs_coef(node_path)
         candidates.append(
             NameClassifCandTuple(coef1, coef2, coef3, node_path, node))

     if not candidates:
         self.nodepath = None
         return False

     def negated_score(cand):
         # negated so that an ascending sort puts the highest total first
         return -(cand.h1_child_coef + cand.div_child_coef + cand.title_words_coef)

     candidates.sort(key=negated_score)
     print(candidates)

     solver = candidates[0]
     self.nodepath = self.__check_solver_surroundings(solver, candidates, title_words)
     self.nodepath = sorted(self.nodepath)
     return True
 def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
     '''
     Parse every page and collect its text nodes.

     contents_list   - content of n pages of the site offering the product;
                       each entry is converted with xpathops.content2tree
     namenode_list   - stored on the instance as given
     buttonnode_list - stored on the instance as given (optional)
     basketnode_list - stored on the instance as given (optional)

     The first parsed tree is indexed directly, so contents_list needs at
     least one entry.
     '''
     self.__contents_list = contents_list
     parsed_trees = [xpathops.content2tree(page) for page in contents_list]
     self.__trees_list = parsed_trees

     self.__namenode_list = namenode_list
     self.__buttonnode_list = buttonnode_list
     self.__basketnode_list = basketnode_list

     # text nodes and their paths for the first page only
     primary = xpathops.tree2textnodeslist(parsed_trees[0])
     self.__prim_text_nodes, self.__prim_text_nodes_paths = primary

     # text nodes and their paths for every page, index-aligned with the trees
     per_page = [xpathops.tree2textnodeslist(page_tree) for page_tree in parsed_trees]
     self.__text_nodes_list = [nodes for nodes, _paths in per_page]
     self.__text_nodes_paths_list = [paths for _nodes, paths in per_page]
 def find(self,params):
     '''
     Count matches of the "PRICE" dictionary regex in text nodes and attribute values.

     The regex comes from regexofdictentries(["PRICE"], lang), where lang is
     read from self.page_dict. self.features becomes
     [matching text nodes, matching attribute values]. self.certainty, which
     is also returned, is 1 when the two counts sum to more than 10 and 0
     otherwise. params is not used.
     '''
     tree = self.page_dict.get("tree")
     lang = self.page_dict.get("lang")

     # text nodes: build and cache them on first use, otherwise reuse the cache
     if "text_nodes" not in self.page_dict:
         text_nodes, text_nodes_paths = tree2textnodeslist(tree)
         self.page_dict["text_nodes"] = text_nodes
         self.page_dict["text_nodes_paths"] = text_nodes_paths
     else:
         text_nodes = self.page_dict["text_nodes"]
         text_nodes_paths = self.page_dict["text_nodes_paths"]  # not used below

     # attribute values: same build-or-reuse scheme
     if "attr_vals" not in self.page_dict:
         attr_vals = tree2attributevalslist(tree)
         self.page_dict["attr_vals"] = attr_vals
     else:
         attr_vals = self.page_dict["attr_vals"]

     priceregex = regexofdictentries(["PRICE"], lang)

     # one hit per text node / attribute value in which the regex is found
     text_hits = sum(1 for text in text_nodes if re.search(priceregex, text))
     attr_hits = sum(1 for val in attr_vals if re.search(priceregex, val))

     self.features = [text_hits, attr_hits]

     result = 1 if text_hits + attr_hits > 10 else 0
     self.certainty = result
     return result
示例#5
0
    def find(self, params):
        '''
        Count matches of the "PRICE" dictionary regex in text nodes and attribute values.

        The regex comes from regexofdictentries(["PRICE"], lang), where lang is
        read from self.page_dict. self.features becomes
        [matching text nodes, matching attribute values]. self.certainty, which
        is also returned, is 1 when the two counts sum to more than 10 and 0
        otherwise. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")

        # text nodes: build and cache them on first use, otherwise reuse the cache
        if "text_nodes" not in self.page_dict:
            text_nodes, text_nodes_paths = tree2textnodeslist(tree)
            self.page_dict["text_nodes"] = text_nodes
            self.page_dict["text_nodes_paths"] = text_nodes_paths
        else:
            text_nodes = self.page_dict["text_nodes"]
            text_nodes_paths = self.page_dict["text_nodes_paths"]  # not used below

        # attribute values: same build-or-reuse scheme
        if "attr_vals" not in self.page_dict:
            attr_vals = tree2attributevalslist(tree)
            self.page_dict["attr_vals"] = attr_vals
        else:
            attr_vals = self.page_dict["attr_vals"]

        priceregex = regexofdictentries(["PRICE"], lang)

        # one hit per text node / attribute value in which the regex is found
        text_hits = sum(1 for text in text_nodes if re.search(priceregex, text))
        attr_hits = sum(1 for val in attr_vals if re.search(priceregex, val))

        self.features = [text_hits, attr_hits]

        result = 1 if text_hits + attr_hits > 10 else 0
        self.certainty = result
        return result
示例#6
0
    def find(self, params):
        '''
        Count anchor texts that match the "WISHLIST" dictionary regex.

        The regex comes from regexofdictentries(["WISHLIST"], lang), where lang
        is read from self.page_dict. The text nodes are requested from
        tree2textnodeslist with element="a" and cached under "a_text_nodes" /
        "a_text_nodes_paths". self.features becomes [number of matching nodes].
        self.certainty, which is also returned, is 1 for more than one match,
        0.5 for exactly one and 0 for none. params is not used.
        '''
        tree = self.page_dict.get("tree")
        lang = self.page_dict.get("lang")

        # anchor text nodes: build and cache on first use, otherwise reuse
        if "a_text_nodes" not in self.page_dict:
            a_text_nodes, text_nodes_paths = tree2textnodeslist(tree,
                                                                element="a")
            self.page_dict["a_text_nodes"] = a_text_nodes
            self.page_dict["a_text_nodes_paths"] = text_nodes_paths
        else:
            a_text_nodes = self.page_dict["a_text_nodes"]
            text_nodes_paths = self.page_dict["a_text_nodes_paths"]  # not used below

        wishregex = regexofdictentries(["WISHLIST"], lang)

        hits = sum(1 for text in a_text_nodes if re.search(wishregex, text))
        self.features = [hits]

        if hits > 1:
            result = 1
        elif hits == 1:
            result = 0.5
        else:
            result = 0
        self.certainty = result
        return result
示例#7
0
    def find(self, params):
        '''
        Count anchor texts that mention shipping or returns.

        The regex comes from regexofdictentries(["SHIPPING", "RETURNS"], lang),
        where lang is read from self.page_dict. It is searched in every text
        node obtained from tree2textnodeslist with element="a".
        self.features becomes [number of matching nodes]. self.certainty,
        which is also returned, is 1 for more than three matches, 0.5 for two
        or three and 0 otherwise. params is not used.
        '''
        page = self.page_dict
        lang = page.get("lang")

        # The element="a" nodes are cached under the "a_text_nodes" keys, the
        # same keys the wishlist finder uses. Storing them under the generic
        # "text_nodes" keys would hand an anchor-only list to every lookup
        # that expects the full set of text nodes, and would make this method
        # search the full set whenever another finder filled the cache first.
        if "a_text_nodes" in page:
            a_text_nodes = page["a_text_nodes"]
        else:
            a_text_nodes, a_text_nodes_paths = tree2textnodeslist(
                page.get("tree"), element="a")
            page["a_text_nodes"] = a_text_nodes
            page["a_text_nodes_paths"] = a_text_nodes_paths

        shipregex = regexofdictentries(["SHIPPING", "RETURNS"], lang)

        # one hit per anchor text in which the regex is found
        counter1 = sum(1 for text in a_text_nodes if re.search(shipregex, text))

        self.features = [counter1]

        if counter1 > 3:
            result = 1
        elif counter1 > 1:
            result = 0.5
        else:
            result = 0
        self.certainty = result

        return result
 def find(self,params):
     '''
     Count anchor texts that match the "WISHLIST" dictionary regex.

     The regex comes from regexofdictentries(["WISHLIST"], lang), where lang
     is read from self.page_dict. The text nodes are requested from
     tree2textnodeslist with element="a" and cached under "a_text_nodes" /
     "a_text_nodes_paths". self.features becomes [number of matching nodes].
     self.certainty, which is also returned, is 1 for more than one match,
     0.5 for exactly one and 0 for none. params is not used.
     '''
     tree = self.page_dict.get("tree")
     lang = self.page_dict.get("lang")

     # anchor text nodes: build and cache on first use, otherwise reuse
     if "a_text_nodes" not in self.page_dict:
         a_text_nodes, text_nodes_paths = tree2textnodeslist(tree,element="a")
         self.page_dict["a_text_nodes"] = a_text_nodes
         self.page_dict["a_text_nodes_paths"] = text_nodes_paths
     else:
         a_text_nodes = self.page_dict["a_text_nodes"]
         text_nodes_paths = self.page_dict["a_text_nodes_paths"]  # not used below

     wishregex = regexofdictentries(["WISHLIST"], lang)

     hits = sum(1 for text in a_text_nodes if re.search(wishregex, text))
     self.features = [hits]

     if hits > 1:
         result = 1
     elif hits == 1:
         result = 0.5
     else:
         result = 0
     self.certainty = result
     return result
 def find(self,params):
     '''
     Count anchor texts that mention shipping or returns.

     The regex comes from regexofdictentries(["SHIPPING","RETURNS"], lang),
     where lang is read from self.page_dict. It is searched in every text
     node obtained from tree2textnodeslist with element="a".
     self.features becomes [number of matching nodes]. self.certainty,
     which is also returned, is 1 for more than three matches, 0.5 for two
     or three and 0 otherwise. params is not used.
     '''
     page = self.page_dict
     lang = page.get("lang")

     # The element="a" nodes are cached under the "a_text_nodes" keys, the
     # same keys the wishlist finder uses. Storing them under the generic
     # "text_nodes" keys would hand an anchor-only list to every lookup
     # that expects the full set of text nodes, and would make this method
     # search the full set whenever another finder filled the cache first.
     if "a_text_nodes" in page:
         a_text_nodes = page["a_text_nodes"]
     else:
         a_text_nodes, a_text_nodes_paths = tree2textnodeslist(page.get("tree"),element="a")
         page["a_text_nodes"] = a_text_nodes
         page["a_text_nodes_paths"] = a_text_nodes_paths

     shipregex = regexofdictentries(["SHIPPING","RETURNS"], lang)

     # one hit per anchor text in which the regex is found
     counter1 = sum(1 for text in a_text_nodes if re.search(shipregex,text))

     self.features = [counter1]

     if counter1 > 3:
         result = 1
     elif counter1 > 1:
         result = 0.5
     else:
         result = 0
     self.certainty = result

     return result
示例#10
0
    def find(self, params):
        '''
        Count price-pattern matches in the text of the page.

        All text nodes are joined with single spaces and scanned with
        countnonoverlappingmatches against pricecurr_any_regex,
        currprice_any_regex and price_any_regex (defined outside this
        method; presumably price+currency, currency+price and bare price
        patterns - confirm against their definitions).

        Sets self.features to, and returns,
        [pricecurr matches + currprice matches, price matches].
        params is not used.

        NOTE(review): this method was previously described as looking for
        information on taxes, but the code only counts price patterns.
        '''
        page = self.page_dict

        # all text nodes (no element filter): reuse the cache or build and fill it
        if "text_nodes" in page:
            text_nodes = page["text_nodes"]
        else:
            text_nodes, text_nodes_paths = tree2textnodeslist(page.get("tree"))
            page["text_nodes"] = text_nodes
            page["text_nodes_paths"] = text_nodes_paths

        # scan the page as one string so a match is counted wherever it occurs
        page_text = ' '.join(text_nodes)
        pricecurrcount = countnonoverlappingmatches(pricecurr_any_regex,
                                                    page_text)
        currpricecount = countnonoverlappingmatches(currprice_any_regex,
                                                    page_text)
        pricecount = countnonoverlappingmatches(price_any_regex, page_text)

        result = self.features = [pricecurrcount + currpricecount, pricecount]

        return result
 def find(self,params):
     '''
     Count price-pattern matches in the text of the page.

     All text nodes are joined with single spaces and scanned with
     countnonoverlappingmatches against pricecurr_any_regex,
     currprice_any_regex and price_any_regex (defined outside this
     method; presumably price+currency, currency+price and bare price
     patterns - confirm against their definitions).

     Sets self.features to, and returns,
     [pricecurr matches + currprice matches, price matches].
     params is not used.

     NOTE(review): this method was previously described as looking for
     information on taxes, but the code only counts price patterns.
     '''
     page = self.page_dict

     # all text nodes (no element filter): reuse the cache or build and fill it
     if "text_nodes" in page:
         text_nodes = page["text_nodes"]
     else:
         text_nodes, text_nodes_paths = tree2textnodeslist(page.get("tree"))
         page["text_nodes"] = text_nodes
         page["text_nodes_paths"] = text_nodes_paths

     # scan the page as one string so a match is counted wherever it occurs
     page_text = ' '.join(text_nodes)
     pricecurrcount = countnonoverlappingmatches(pricecurr_any_regex, page_text)
     currpricecount = countnonoverlappingmatches(currprice_any_regex, page_text)
     pricecount     = countnonoverlappingmatches(price_any_regex, page_text)

     result = self.features = [pricecurrcount+currpricecount,pricecount]

     return result
    def find(self, params):
        '''
        Pick the text node(s) most likely to hold the product name.

        params is iterated as the list of title words (ideally the unique words
        from the non-repeating part of the title). Text nodes whose path is
        rejected by __is_anchor_descendant are skipped; the remaining nodes that
        share at least one word with the title are scored, and the best-scoring
        candidate is passed to __check_solver_surroundings together with all
        candidates.

        Sets self.nodepath to the sorted result and returns True when at least
        one candidate exists; otherwise sets self.nodepath to None and returns
        False.
        '''
        # title words with everything but letters, digits and spaces removed
        self.__title_words = [
            ''.join(ch for ch in raw if ch.isalnum() or ch == ' ')
            for raw in params
        ]

        cache = self.page_dict
        tree = self.tree = cache.get("tree")

        # reuse the cached text nodes when there are any, otherwise build and cache them
        if not cache.get("text_nodes"):
            cache["text_nodes"], cache["text_nodes_paths"] = tree2textnodeslist(tree)
        text_nodes = cache["text_nodes"]
        text_nodes_paths = cache["text_nodes_paths"]

        # second pass over params: this copy also drops spaces
        title_words = [
            ''.join(ch for ch in raw if ch.isalnum()) for raw in params
        ]

        candidates = []
        for idx, raw_text in enumerate(text_nodes):
            node_path = text_nodes_paths[idx]
            if self.__is_anchor_descendant(node_path):
                continue

            lowered = raw_text.lower()
            node = ''.join(ch for ch in lowered if ch.isalnum() or ch == ' ')
            node_words = node.split()
            if not any(word in node_words for word in title_words):
                continue

            # coefficient for the title words present in the node
            coef3 = self.__count_title_words_coef(node_words)
            # coefficient for an <h1> ancestor
            coef1 = self.__get_h1_ancestor_coef(node_path)
            # coefficient for a <div> ancestor with defined attribute values
            coef2 = self.__get_div_ancestor_attrs_coef(node_path)
            candidates.append(
                NameClassifCandTuple(coef1, coef2, coef3, node_path, node))

        if not candidates:
            self.nodepath = None
            return False

        def negated_score(cand):
            # negated so that an ascending sort puts the highest total first
            return -(cand.h1_child_coef + cand.div_child_coef +
                     cand.title_words_coef)

        candidates.sort(key=negated_score)
        print(candidates)

        solver = candidates[0]
        self.nodepath = self.__check_solver_surroundings(
            solver, candidates, title_words)
        self.nodepath = sorted(self.nodepath)
        return True
    def run(self):
        '''
        Locate the nodes that hold the product name and the product price.

        Steps, in order:
          1. read the page title from /html/head/title and split it into
             words (an original-case and a lower-cased copy are kept);
          2. collect the text nodes with tree2textnodeslist and flatten them
             into a word list, remembering for every word the path of its
             node and the is_link() flag of that path;
          3. ask __find_name_price_sequence for the [first, last) word range
             that most probably spans the name and the price;
          4. label every distinct node path in that range as a name node (it
             shares a word with the original-case title) or as a price node
             (currency symbol, number, or price/currency regex match);
          5. if exactly one price was counted, ask __find_prices for more;
             if more than one, let __find_cheapest_price choose; if no price
             node is left, call __find_price_further with the first name node.

        Side effects: prints debugging output, stores the text nodes and
        their paths on the instance, and sets self.namenodes and
        self.pricenodes.

        Returns [namenodes, pricenodes].
        '''
        tree = self.__tree
        
        # take the title
        # NOTE(review): [0] fails on an empty XPath result (page without a title) - confirm callers guarantee one
        title = tree.xpath('/html/head/title/text()')[0].strip()
        print(title)
        
        # split the title into words
        title_words_orig = title.split()
        title_words = []
        title_words.extend(x.lower() for x in title_words_orig[:])
        print(title_words)
        
        # TODO: remove the common part of a title for the domain
        pass
        
        # TODO maybe: boilerplate recognition ;)
        pass
                
        # get a list of all nonempty strings from body (except scripts)
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        
        # word_nodes, word_nodes_paths and word_nodes_parent_type are built in
        # lockstep: one entry per whitespace-separated word
        word_nodes = []
        word_nodes_parent_type = []
        word_nodes_paths = []
        # NOTE(review): the upper bound len(text_nodes)-1 leaves the last text node unprocessed - confirm this is intended
        for i in range(0,len(text_nodes)-1):
            x = text_nodes[i].strip()
            #y = nltk.word_tokenize(x)
            y = x.split()
            word_nodes.extend(y)
            # the repaired path is written back into text_nodes_paths
            text_nodes_paths[i] = fix_xpath_errors(text_nodes_paths[i])
            isparentlink = is_link(text_nodes_paths[i])
            for j in range(0,len(y)):
                word_nodes_paths.append(text_nodes_paths[i])
                word_nodes_parent_type.append(isparentlink)
                
        self.__text_nodes = text_nodes
        self.__text_nodes_paths = text_nodes_paths
          
        # find the sequence that at most probably is the name and the price
        # firstlastword[0] / firstlastword[1] are used below as a half-open range of word indices
        firstlastword = self.__find_name_price_sequence(word_nodes, title_words, word_nodes_parent_type)        
        
        # find nodes that contain the found sequence
        namepricenodes = []
        namepricenodesstrings = []
        for i in range(firstlastword[0],firstlastword[1]):
            namepricenodes.append(word_nodes_paths[i])
            #print(word_nodes_paths[i], word_nodes[i])
            
#         for i in range(firstlastword[0]-10,firstlastword[1]+10):
#             print(word_nodes[i],word_values[i])
            
        # drop duplicate paths; going through set() does not keep the original order
        namepricenodes = list(set(namepricenodes))
        for x in namepricenodes:
            print(x)
            # first text() child of the node; index-aligned with namepricenodes
            namepricenodesstrings.append(tree.xpath(x+'/text()')[0].strip())
            print(x, namepricenodesstrings[-1])
        
        namenodes = []
        pricenodes = []
        
        pricesfound = 0
        # i is initialised here because it is read again after the loop below
        i = 0
        # pair the information with the node
        for i in range(0,len(namepricenodes)):
            node = namepricenodesstrings[i]
            nodewords = node.split()
            # compared against the original-case title words, not the lower-cased copy
            if (set(nodewords) & set(title_words_orig)):
                namenodes.append(namepricenodes[i])
                print('product name (part):',node)
            else:
                # NOTE(review): a double-underscore name inside a class body is name-mangled by Python;
                # verify that __currency_regex_symbols resolves to the intended symbol
                if (set(nodewords) & set(__currency_regex_symbols)):
                    print('currency:',node)
                    # appended without incrementing pricesfound; the loop below may append the same node again
                    pricenodes.append(namepricenodes[i])
                for x in nodewords:
                    if(is_number(x)):
                        print('price:',node)
                        pricenodes.append(namepricenodes[i])
                        pricesfound += 1
                        break
                    # no break in this branch, so later words of the same node are still examined
                    if re.match(pricecurr_regex,x) or re.match(currprice_regex,x):
                        print('currency & price:',node)
                        pricenodes.append(namepricenodes[i]) 
                        pricesfound += 1
                        
        # If the one price is found, take a look if it's not followed by some other prices
        if pricesfound==1:  
            # NOTE(review): i is the last index of the loop above (0 if it never ran), not
            # necessarily the index of the node where the price was found - confirm
            other_pricecurrnodes = self.__find_prices(2, i, MAX_PRICE_PRICE_DISTANCE)
            # element [0] is used as a collection of extra price nodes, element [1] as their count
            if other_pricecurrnodes[0]:
                pricenodes.extend(other_pricecurrnodes[0])
            pricesfound += other_pricecurrnodes[1]
                     
        # If there are a few price nodes, choose the one with the lowest price as the correct one!      
        if pricesfound>1:
            pricenodes = self.__find_cheapest_price(pricenodes)
            
        print('product name:' + str(namenodes))
        print('product price:' + str(pricenodes))
                    
        # if pricenode list empty - look for it somewhere further
        # NOTE(review): namenodes[0] fails when no name node was found either - confirm this cannot happen
        if not pricenodes: # = is empty?
            pricenodes = self.__find_price_further(namenodes[0])
        
        self.namenodes = namenodes
        self.pricenodes = pricenodes
        
        return [namenodes,pricenodes]
示例#14
0
    def run(self):
        '''
        Locate the nodes that hold the product name and the product price.

        Steps, in order:
          1. read the page title from /html/head/title and split it into
             words (an original-case and a lower-cased copy are kept);
          2. collect the text nodes with tree2textnodeslist and flatten them
             into a word list, remembering for every word the path of its
             node and the is_link() flag of that path;
          3. ask __find_name_price_sequence for the [first, last) word range
             that most probably spans the name and the price;
          4. label every distinct node path in that range as a name node (it
             shares a word with the original-case title) or as a price node
             (currency symbol, number, or price/currency regex match);
          5. if exactly one price was counted, ask __find_prices for more;
             if more than one, let __find_cheapest_price choose; if no price
             node is left, call __find_price_further with the first name node.

        Side effects: prints debugging output, stores the text nodes and
        their paths on the instance, and sets self.namenodes and
        self.pricenodes.

        Returns [namenodes, pricenodes].
        '''
        tree = self.__tree

        # take the title
        # NOTE(review): [0] fails on an empty XPath result (page without a title) - confirm callers guarantee one
        title = tree.xpath('/html/head/title/text()')[0].strip()
        print(title)

        # split the title into words
        title_words_orig = title.split()
        title_words = []
        title_words.extend(x.lower() for x in title_words_orig[:])
        print(title_words)

        # TODO: remove the common part of a title for the domain
        pass

        # TODO maybe: boilerplate recognition ;)
        pass

        # get a list of all nonempty strings from body (except scripts)
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)

        # word_nodes, word_nodes_paths and word_nodes_parent_type are built in
        # lockstep: one entry per whitespace-separated word
        word_nodes = []
        word_nodes_parent_type = []
        word_nodes_paths = []
        # NOTE(review): the upper bound len(text_nodes) - 1 leaves the last text node unprocessed - confirm this is intended
        for i in range(0, len(text_nodes) - 1):
            x = text_nodes[i].strip()
            #y = nltk.word_tokenize(x)
            y = x.split()
            word_nodes.extend(y)
            # the repaired path is written back into text_nodes_paths
            text_nodes_paths[i] = fix_xpath_errors(text_nodes_paths[i])
            isparentlink = is_link(text_nodes_paths[i])
            for j in range(0, len(y)):
                word_nodes_paths.append(text_nodes_paths[i])
                word_nodes_parent_type.append(isparentlink)

        self.__text_nodes = text_nodes
        self.__text_nodes_paths = text_nodes_paths

        # find the sequence that at most probably is the name and the price
        # firstlastword[0] / firstlastword[1] are used below as a half-open range of word indices
        firstlastword = self.__find_name_price_sequence(
            word_nodes, title_words, word_nodes_parent_type)

        # find nodes that contain the found sequence
        namepricenodes = []
        namepricenodesstrings = []
        for i in range(firstlastword[0], firstlastword[1]):
            namepricenodes.append(word_nodes_paths[i])
            #print(word_nodes_paths[i], word_nodes[i])

#         for i in range(firstlastword[0]-10,firstlastword[1]+10):
#             print(word_nodes[i],word_values[i])

        # drop duplicate paths; going through set() does not keep the original order
        namepricenodes = list(set(namepricenodes))
        for x in namepricenodes:
            print(x)
            # first text() child of the node; index-aligned with namepricenodes
            namepricenodesstrings.append(tree.xpath(x + '/text()')[0].strip())
            print(x, namepricenodesstrings[-1])

        namenodes = []
        pricenodes = []

        pricesfound = 0
        # i is initialised here because it is read again after the loop below
        i = 0
        # pair the information with the node
        for i in range(0, len(namepricenodes)):
            node = namepricenodesstrings[i]
            nodewords = node.split()
            # compared against the original-case title words, not the lower-cased copy
            if (set(nodewords) & set(title_words_orig)):
                namenodes.append(namepricenodes[i])
                print('product name (part):', node)
            else:
                # NOTE(review): a double-underscore name inside a class body is name-mangled by Python;
                # verify that __currency_regex_symbols resolves to the intended symbol
                if (set(nodewords) & set(__currency_regex_symbols)):
                    print('currency:', node)
                    # appended without incrementing pricesfound; the loop below may append the same node again
                    pricenodes.append(namepricenodes[i])
                for x in nodewords:
                    if (is_number(x)):
                        print('price:', node)
                        pricenodes.append(namepricenodes[i])
                        pricesfound += 1
                        break
                    # no break in this branch, so later words of the same node are still examined
                    if re.match(pricecurr_regex, x) or re.match(
                            currprice_regex, x):
                        print('currency & price:', node)
                        pricenodes.append(namepricenodes[i])
                        pricesfound += 1

        # If the one price is found, take a look if it's not followed by some other prices
        if pricesfound == 1:
            # NOTE(review): i is the last index of the loop above (0 if it never ran), not
            # necessarily the index of the node where the price was found - confirm
            other_pricecurrnodes = self.__find_prices(
                2, i, MAX_PRICE_PRICE_DISTANCE)
            # element [0] is used as a collection of extra price nodes, element [1] as their count
            if other_pricecurrnodes[0]:
                pricenodes.extend(other_pricecurrnodes[0])
            pricesfound += other_pricecurrnodes[1]

        # If there are a few price nodes, choose the one with the lowest price as the correct one!
        if pricesfound > 1:
            pricenodes = self.__find_cheapest_price(pricenodes)

        print('product name:' + str(namenodes))
        print('product price:' + str(pricenodes))

        # if pricenode list empty - look for it somewhere further
        # NOTE(review): namenodes[0] fails when no name node was found either - confirm this cannot happen
        if not pricenodes:  # = is empty?
            pricenodes = self.__find_price_further(namenodes[0])

        self.namenodes = namenodes
        self.pricenodes = pricenodes

        return [namenodes, pricenodes]