def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
    ''' Parse the sample pages and keep their text nodes on the instance.

    contents_list   - content of n pages of the site offering product
    namenode_list   - stored unchanged on the instance
    buttonnode_list - stored unchanged on the instance (default None)
    basketnode_list - stored unchanged on the instance (default None)

    Every content is converted with xpathops.content2tree; the text nodes
    and their paths are then extracted from each tree with
    xpathops.tree2textnodeslist. The result for the first tree is kept
    separately as the "prim" nodes/paths.

    Raises IndexError when contents_list yields no pages, because the
    first tree is accessed unconditionally.
    '''
    self.__contents_list = contents_list

    parsed_trees = []
    for page_content in contents_list:
        parsed_trees.append(xpathops.content2tree(page_content))
    self.__trees_list = parsed_trees

    self.__namenode_list = namenode_list
    self.__buttonnode_list = buttonnode_list
    self.__basketnode_list = basketnode_list

    # The first page is extracted on its own (and once more in the loop below).
    primary_nodes, primary_paths = xpathops.tree2textnodeslist(parsed_trees[0])
    self.__prim_text_nodes = primary_nodes
    self.__prim_text_nodes_paths = primary_paths

    # Per-page results, index-aligned with the trees list.
    nodes_per_page = self.__text_nodes_list = []
    paths_per_page = self.__text_nodes_paths_list = []
    for page_tree in parsed_trees:
        page_nodes, page_paths = xpathops.tree2textnodeslist(page_tree)
        nodes_per_page.append(page_nodes)
        paths_per_page.append(page_paths)
def find(self, params):
    ''' Pick the text node(s) that most likely hold the product name.

    params - iterable of words; per the original note these should be the
             unique words of the non-repeating part of the page title.

    Text nodes are taken from self.page_dict (and cached there when they
    had to be extracted). Nodes for which __is_anchor_descendant is true
    are skipped. Each remaining node is lower-cased and reduced to
    alphanumerics and spaces; it becomes a candidate when at least one of
    the cleaned title words is among its words.

    Returns True when at least one candidate exists; self.nodepath then
    holds the sorted result of __check_solver_surroundings for the
    best-scoring candidate. Otherwise self.nodepath is None and False is
    returned.
    '''
    # Title words with spaces kept, stored on the instance.
    self.__title_words = [
        ''.join([ch for ch in raw_word if (ch.isalnum() or ch == ' ')])
        for raw_word in params
    ]

    page = self.page_dict
    tree = self.tree = page.get("tree")

    # get a list of all nonempty strings from body (except scripts)
    if page.get("text_nodes"):
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]
    else:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths

    # Title words reduced to alphanumerics only, used for matching below.
    title_words = [
        ''.join([ch for ch in raw_word if ch.isalnum()])
        for raw_word in params
    ]

    candidates = []
    for position, raw_text in enumerate(text_nodes):
        node_path = text_nodes_paths[position]
        if self.__is_anchor_descendant(node_path):
            continue

        node = ''.join(
            [ch for ch in raw_text.lower() if (ch.isalnum() or ch == ' ')])
        node_words = node.split()
        if not any(word in node_words for word in title_words):
            continue

        # The helpers are called in the same order as before:
        # title-word count, then <h1> ancestor, then <div> ancestor attributes.
        words_coef = self.__count_title_words_coef(node_words)
        h1_coef = self.__get_h1_ancestor_coef(node_path)
        div_coef = self.__get_div_ancestor_attrs_coef(node_path)
        candidates.append(
            NameClassifCandTuple(h1_coef, div_coef, words_coef, node_path, node))

    if not candidates:
        self.nodepath = None
        return False

    def negated_score(cand):
        # Negated so that an ascending sort puts the highest total first.
        return -(cand.h1_child_coef + cand.div_child_coef + cand.title_words_coef)

    candidates = sorted(candidates, key=negated_score)
    print(candidates)
    solver = candidates[0]
    self.nodepath = self.__check_solver_surroundings(solver, candidates, title_words)
    self.nodepath = sorted(self.nodepath)
    return True
def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
    ''' Parse the sample pages and keep their text nodes on the instance.

    contents_list   - content of n pages of the site offering product
    namenode_list   - stored unchanged on the instance
    buttonnode_list - stored unchanged on the instance (default None)
    basketnode_list - stored unchanged on the instance (default None)

    Every content is converted with xpathops.content2tree; the text nodes
    and their paths are then extracted from each tree with
    xpathops.tree2textnodeslist. The result for the first tree is kept
    separately as the "prim" nodes/paths.

    Raises IndexError when contents_list yields no pages, because the
    first tree is accessed unconditionally.
    '''
    self.__contents_list = contents_list

    parsed_trees = []
    for page_content in contents_list:
        parsed_trees.append(xpathops.content2tree(page_content))
    self.__trees_list = parsed_trees

    self.__namenode_list = namenode_list
    self.__buttonnode_list = buttonnode_list
    self.__basketnode_list = basketnode_list

    # The first page is extracted on its own (and once more in the loop below).
    primary_nodes, primary_paths = xpathops.tree2textnodeslist(parsed_trees[0])
    self.__prim_text_nodes = primary_nodes
    self.__prim_text_nodes_paths = primary_paths

    # Per-page results, index-aligned with the trees list.
    nodes_per_page = self.__text_nodes_list = []
    paths_per_page = self.__text_nodes_paths_list = []
    for page_tree in parsed_trees:
        page_nodes, page_paths = xpathops.tree2textnodeslist(page_tree)
        nodes_per_page.append(page_nodes)
        paths_per_page.append(page_paths)
def find(self, params):
    ''' Count occurrences of the "PRICE" dictionary word on the page.

    Matches are counted separately over the text nodes and over the
    attribute values of the page; both lists are taken from
    self.page_dict and cached there when they had to be extracted.
    params is not used.

    Stores [text matches, attribute matches] in self.features.
    Returns the certainty, also stored in self.certainty: 1 when the two
    counts add up to more than 10, otherwise 0.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (except scripts)
    if "text_nodes" not in page:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths
    else:
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]

    # get a list of all attribute values from the body (except scripts)
    if "attr_vals" not in page:
        attr_vals = tree2attributevalslist(tree)
        page["attr_vals"] = attr_vals
    else:
        attr_vals = page["attr_vals"]

    priceregex = regexofdictentries(["PRICE"], lang)

    text_hits = sum(1 for text in text_nodes if re.search(priceregex, text))
    attr_hits = sum(1 for val in attr_vals if re.search(priceregex, val))

    self.features = [text_hits, attr_hits]
    certainty = 1 if text_hits + attr_hits > 10 else 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Count occurrences of the "PRICE" dictionary word on the page.

    Matches are counted separately over the text nodes and over the
    attribute values of the page; both lists are taken from
    self.page_dict and cached there when they had to be extracted.
    params is not used.

    Stores [text matches, attribute matches] in self.features.
    Returns the certainty, also stored in self.certainty: 1 when the two
    counts add up to more than 10, otherwise 0.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (except scripts)
    if "text_nodes" not in page:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths
    else:
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]

    # get a list of all attribute values from the body (except scripts)
    if "attr_vals" not in page:
        attr_vals = tree2attributevalslist(tree)
        page["attr_vals"] = attr_vals
    else:
        attr_vals = page["attr_vals"]

    priceregex = regexofdictentries(["PRICE"], lang)

    text_hits = sum(1 for text in text_nodes if re.search(priceregex, text))
    attr_hits = sum(1 for val in attr_vals if re.search(priceregex, val))

    self.features = [text_hits, attr_hits]
    certainty = 1 if text_hits + attr_hits > 10 else 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Count anchor texts that match the "WISHLIST" dictionary word.

    The anchor text nodes are taken from self.page_dict under
    "a_text_nodes" / "a_text_nodes_paths"; when absent they are extracted
    with tree2textnodeslist(tree, element="a") and cached there.
    params is not used.

    Stores [match count] in self.features.
    Returns the certainty, also stored in self.certainty: 1 for more than
    one match, 0.5 for exactly one match, 0 for none.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (only children of anchors)
    if "a_text_nodes" not in page:
        a_text_nodes, anchor_paths = tree2textnodeslist(tree, element="a")
        page["a_text_nodes"] = a_text_nodes
        page["a_text_nodes_paths"] = anchor_paths
    else:
        a_text_nodes = page["a_text_nodes"]
        anchor_paths = page["a_text_nodes_paths"]

    wishregex = regexofdictentries(["WISHLIST"], lang)

    hits = sum(1 for text in a_text_nodes if re.search(wishregex, text))
    self.features = [hits]

    if hits > 1:
        certainty = 1
    elif hits == 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Look for the shipping and returns information.

    Counts the page text nodes that match the "SHIPPING" / "RETURNS"
    dictionary entries for the page language. params is not used.

    The text nodes are read from the shared "text_nodes" entry of
    self.page_dict. When that entry is missing it is filled with the
    full-page extraction, tree2textnodeslist(tree), exactly as the other
    finders that use the same key do. Filling it with an anchor-only
    extraction would make the result depend on which finder ran first and
    would hand anchor-only text to every later reader of "text_nodes".

    Stores [match count] in self.features.
    Returns the certainty, also stored in self.certainty: 1 for more than
    3 matching nodes, 0.5 for 2 or 3, 0 otherwise.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (shared full-page cache)
    if "text_nodes" in page:
        text_nodes = page["text_nodes"]
    else:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths

    shipregex = regexofdictentries(["SHIPPING", "RETURNS"], lang)

    hits = sum(1 for text in text_nodes if re.search(shipregex, text))
    self.features = [hits]

    if hits > 3:
        certainty = 1
    elif hits > 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Count anchor texts that match the "WISHLIST" dictionary word.

    The anchor text nodes are taken from self.page_dict under
    "a_text_nodes" / "a_text_nodes_paths"; when absent they are extracted
    with tree2textnodeslist(tree, element="a") and cached there.
    params is not used.

    Stores [match count] in self.features.
    Returns the certainty, also stored in self.certainty: 1 for more than
    one match, 0.5 for exactly one match, 0 for none.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (only children of anchors)
    if "a_text_nodes" not in page:
        a_text_nodes, anchor_paths = tree2textnodeslist(tree, element="a")
        page["a_text_nodes"] = a_text_nodes
        page["a_text_nodes_paths"] = anchor_paths
    else:
        a_text_nodes = page["a_text_nodes"]
        anchor_paths = page["a_text_nodes_paths"]

    wishregex = regexofdictentries(["WISHLIST"], lang)

    hits = sum(1 for text in a_text_nodes if re.search(wishregex, text))
    self.features = [hits]

    if hits > 1:
        certainty = 1
    elif hits == 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Look for the shipping and returns information.

    Counts the page text nodes that match the "SHIPPING" / "RETURNS"
    dictionary entries for the page language. params is not used.

    The text nodes are read from the shared "text_nodes" entry of
    self.page_dict. When that entry is missing it is filled with the
    full-page extraction, tree2textnodeslist(tree), exactly as the other
    finders that use the same key do. Filling it with an anchor-only
    extraction would make the result depend on which finder ran first and
    would hand anchor-only text to every later reader of "text_nodes".

    Stores [match count] in self.features.
    Returns the certainty, also stored in self.certainty: 1 for more than
    3 matching nodes, 0.5 for 2 or 3, 0 otherwise.
    '''
    page = self.page_dict
    tree = page.get("tree")
    lang = page.get("lang")

    # get a list of all nonempty strings from body (shared full-page cache)
    if "text_nodes" in page:
        text_nodes = page["text_nodes"]
    else:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths

    shipregex = regexofdictentries(["SHIPPING", "RETURNS"], lang)

    hits = sum(1 for text in text_nodes if re.search(shipregex, text))
    self.features = [hits]

    if hits > 3:
        certainty = 1
    elif hits > 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Count price-like patterns in the text of the page.

    The text nodes of the page (taken from self.page_dict, extracted and
    cached there when missing) are joined with single spaces, and the
    non-overlapping matches of pricecurr_any_regex, currprice_any_regex
    and price_any_regex are counted in that string. params is not used.

    Returns, and stores in self.features, a two-element list:
    [pricecurr matches + currprice matches, price matches].
    '''
    page = self.page_dict
    tree = page.get("tree")

    # get a list of all nonempty strings from body
    if "text_nodes" not in page:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths
    else:
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]

    # Bare references: they do nothing except raise NameError right here
    # when one of the patterns is not defined.
    price_any_regex
    pricecurr_any_regex
    currprice_any_regex

    page_text = ' '.join(text_nodes)
    match_counts = [
        countnonoverlappingmatches(pattern, page_text)
        for pattern in (pricecurr_any_regex, currprice_any_regex, price_any_regex)
    ]

    features = [match_counts[0] + match_counts[1], match_counts[2]]
    self.features = features
    return features
def find(self, params):
    ''' Count price-like patterns in the text of the page.

    The text nodes of the page (taken from self.page_dict, extracted and
    cached there when missing) are joined with single spaces, and the
    non-overlapping matches of pricecurr_any_regex, currprice_any_regex
    and price_any_regex are counted in that string. params is not used.

    Returns, and stores in self.features, a two-element list:
    [pricecurr matches + currprice matches, price matches].
    '''
    page = self.page_dict
    tree = page.get("tree")

    # get a list of all nonempty strings from body
    if "text_nodes" not in page:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths
    else:
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]

    # Bare references: they do nothing except raise NameError right here
    # when one of the patterns is not defined.
    price_any_regex
    pricecurr_any_regex
    currprice_any_regex

    page_text = ' '.join(text_nodes)
    match_counts = [
        countnonoverlappingmatches(pattern, page_text)
        for pattern in (pricecurr_any_regex, currprice_any_regex, price_any_regex)
    ]

    features = [match_counts[0] + match_counts[1], match_counts[2]]
    self.features = features
    return features
def find(self, params):
    ''' Pick the text node(s) that most likely hold the product name.

    params - iterable of words; per the original note these should be the
             unique words of the non-repeating part of the page title.

    Text nodes are taken from self.page_dict (and cached there when they
    had to be extracted). Nodes for which __is_anchor_descendant is true
    are skipped. Each remaining node is lower-cased and reduced to
    alphanumerics and spaces; it becomes a candidate when at least one of
    the cleaned title words is among its words.

    Returns True when at least one candidate exists; self.nodepath then
    holds the sorted result of __check_solver_surroundings for the
    best-scoring candidate. Otherwise self.nodepath is None and False is
    returned.
    '''
    # Title words with spaces kept, stored on the instance.
    self.__title_words = [
        ''.join([ch for ch in raw_word if (ch.isalnum() or ch == ' ')])
        for raw_word in params
    ]

    page = self.page_dict
    tree = self.tree = page.get("tree")

    # get a list of all nonempty strings from body (except scripts)
    if page.get("text_nodes"):
        text_nodes = page["text_nodes"]
        text_nodes_paths = page["text_nodes_paths"]
    else:
        text_nodes, text_nodes_paths = tree2textnodeslist(tree)
        page["text_nodes"] = text_nodes
        page["text_nodes_paths"] = text_nodes_paths

    # Title words reduced to alphanumerics only, used for matching below.
    title_words = [
        ''.join([ch for ch in raw_word if ch.isalnum()])
        for raw_word in params
    ]

    candidates = []
    for position, raw_text in enumerate(text_nodes):
        node_path = text_nodes_paths[position]
        if self.__is_anchor_descendant(node_path):
            continue

        node = ''.join(
            [ch for ch in raw_text.lower() if (ch.isalnum() or ch == ' ')])
        node_words = node.split()
        if not any(word in node_words for word in title_words):
            continue

        # The helpers are called in the same order as before:
        # title-word count, then <h1> ancestor, then <div> ancestor attributes.
        words_coef = self.__count_title_words_coef(node_words)
        h1_coef = self.__get_h1_ancestor_coef(node_path)
        div_coef = self.__get_div_ancestor_attrs_coef(node_path)
        candidates.append(
            NameClassifCandTuple(h1_coef, div_coef, words_coef, node_path, node))

    if not candidates:
        self.nodepath = None
        return False

    def negated_score(cand):
        # Negated so that an ascending sort puts the highest total first.
        return -(cand.h1_child_coef + cand.div_child_coef + cand.title_words_coef)

    candidates = sorted(candidates, key=negated_score)
    print(candidates)
    solver = candidates[0]
    self.nodepath = self.__check_solver_surroundings(solver, candidates, title_words)
    self.nodepath = sorted(self.nodepath)
    return True
def run(self):
    ''' Locate the nodes holding the product name and the product price.

    Reads the page title from self.__tree, splits the page text into
    words, asks __find_name_price_sequence for the word range that most
    probably covers name and price, and then sorts the nodes of that
    range into name nodes and price nodes.

    Side effects: prints progress information; stores self.__text_nodes,
    self.__text_nodes_paths, self.namenodes and self.pricenodes.

    Returns [namenodes, pricenodes] (two lists of node paths).

    Raises IndexError when an indexed xpath result is empty (title or
    node text), and when no price node and no name node was found
    (namenodes[0] on an empty list).

    NOTE(review): the block nesting below was re-derived from a
    whitespace-collapsed copy of this method - confirm against the
    canonical source before relying on it.
    '''
    tree = self.__tree
    # take the title
    title = tree.xpath('/html/head/title/text()')[0].strip()
    print(title)
    # split the title into words; title_words is the lower-cased copy
    title_words_orig = title.split()
    title_words = []
    title_words.extend(x.lower() for x in title_words_orig[:])
    print(title_words)
    # TODO: remove the common part of a title for the domain
    pass
    # TODO maybe: boilerplate recognition ;)
    pass
    # get a list of all nonempty strings from body (except scripts)
    text_nodes, text_nodes_paths = tree2textnodeslist(tree)
    # Parallel per-word lists: the word, whether its node is a link
    # (result of is_link), and the path of the node it came from.
    word_nodes = []
    word_nodes_parent_type = []
    word_nodes_paths = []
    # NOTE(review): the range stops at len(text_nodes)-1, so the last text
    # node is never processed - confirm this is intended.
    for i in range(0,len(text_nodes)-1):
        x = text_nodes[i].strip()
        #y = nltk.word_tokenize(x)
        y = x.split()
        word_nodes.extend(y)
        # the stored path is replaced in place by its fixed version
        text_nodes_paths[i] = fix_xpath_errors(text_nodes_paths[i])
        isparentlink = is_link(text_nodes_paths[i])
        # one path / link-flag entry per word of this node
        for j in range(0,len(y)):
            word_nodes_paths.append(text_nodes_paths[i])
            word_nodes_parent_type.append(isparentlink)
    self.__text_nodes = text_nodes
    self.__text_nodes_paths = text_nodes_paths
    # find the sequence that most probably is the name and the price
    firstlastword = self.__find_name_price_sequence(word_nodes, title_words, word_nodes_parent_type)
    # find nodes that contain the found sequence
    namepricenodes = []
    namepricenodesstrings = []
    for i in range(firstlastword[0],firstlastword[1]):
        namepricenodes.append(word_nodes_paths[i])
        #print(word_nodes_paths[i], word_nodes[i])
    # for i in range(firstlastword[0]-10,firstlastword[1]+10):
    #     print(word_nodes[i],word_values[i])
    # De-duplicate the paths; going through a set does not keep their order.
    namepricenodes = list(set(namepricenodes))
    for x in namepricenodes:
        print(x)
        namepricenodesstrings.append(tree.xpath(x+'/text()')[0].strip())
        print(x, namepricenodesstrings[-1])
    namenodes = []
    pricenodes = []
    pricesfound = 0
    # i is read again after the loop below; this default covers an empty loop
    i = 0
    # pair the information with the node
    for i in range(0,len(namepricenodes)):
        node = namepricenodesstrings[i]
        nodewords = node.split()
        # compared against the original-case title words
        if (set(nodewords) & set(title_words_orig)):
            namenodes.append(namepricenodes[i])
            print('product name (part):',node)
        else:
            # NOTE(review): if this method sits inside a class body, the name
            # __currency_regex_symbols is subject to private name mangling -
            # verify that it resolves.
            if (set(nodewords) & 
                    set(__currency_regex_symbols)):
                print('currency:',node)
                pricenodes.append(namepricenodes[i])
            # NOTE(review): a node may be appended to pricenodes more than once
            # (currency hit above plus a hit below; the regex branch has no break).
            for x in nodewords:
                if(is_number(x)):
                    print('price:',node)
                    pricenodes.append(namepricenodes[i])
                    pricesfound += 1
                    break
                if re.match(pricecurr_regex,x) or re.match(currprice_regex,x):
                    print('currency & price:',node)
                    pricenodes.append(namepricenodes[i])
                    pricesfound += 1
    # If the one price is found, take a look if it's not followed by some other prices
    # (i is the last index visited by the loop above, or 0 if it did not run)
    if pricesfound==1:
        other_pricecurrnodes = self.__find_prices(2, i, MAX_PRICE_PRICE_DISTANCE)
        if other_pricecurrnodes[0]:
            pricenodes.extend(other_pricecurrnodes[0])
            pricesfound += other_pricecurrnodes[1]
    # If there are a few price nodes, choose the one with the lowest price as the correct one!
    if pricesfound>1:
        pricenodes = self.__find_cheapest_price(pricenodes)
    print('product name:' + str(namenodes))
    print('product price:' + str(pricenodes))
    # if pricenode list empty - look for it somewhere further
    if not pricenodes: # = is empty?
        pricenodes = self.__find_price_further(namenodes[0])
    self.namenodes = namenodes
    self.pricenodes = pricenodes
    return [namenodes,pricenodes]
def run(self):
    ''' Locate the nodes holding the product name and the product price.

    Reads the page title from self.__tree, splits the page text into
    words, asks __find_name_price_sequence for the word range that most
    probably covers name and price, and then sorts the nodes of that
    range into name nodes and price nodes.

    Side effects: prints progress information; stores self.__text_nodes,
    self.__text_nodes_paths, self.namenodes and self.pricenodes.

    Returns [namenodes, pricenodes] (two lists of node paths).

    Raises IndexError when an indexed xpath result is empty (title or
    node text), and when no price node and no name node was found
    (namenodes[0] on an empty list).

    NOTE(review): the block nesting below was re-derived from a
    whitespace-collapsed copy of this method - confirm against the
    canonical source before relying on it.
    '''
    tree = self.__tree
    # take the title
    title = tree.xpath('/html/head/title/text()')[0].strip()
    print(title)
    # split the title into words; title_words is the lower-cased copy
    title_words_orig = title.split()
    title_words = []
    title_words.extend(x.lower() for x in title_words_orig[:])
    print(title_words)
    # TODO: remove the common part of a title for the domain
    pass
    # TODO maybe: boilerplate recognition ;)
    pass
    # get a list of all nonempty strings from body (except scripts)
    text_nodes, text_nodes_paths = tree2textnodeslist(tree)
    # Parallel per-word lists: the word, whether its node is a link
    # (result of is_link), and the path of the node it came from.
    word_nodes = []
    word_nodes_parent_type = []
    word_nodes_paths = []
    # NOTE(review): the range stops at len(text_nodes) - 1, so the last text
    # node is never processed - confirm this is intended.
    for i in range(0, len(text_nodes) - 1):
        x = text_nodes[i].strip()
        #y = nltk.word_tokenize(x)
        y = x.split()
        word_nodes.extend(y)
        # the stored path is replaced in place by its fixed version
        text_nodes_paths[i] = fix_xpath_errors(text_nodes_paths[i])
        isparentlink = is_link(text_nodes_paths[i])
        # one path / link-flag entry per word of this node
        for j in range(0, len(y)):
            word_nodes_paths.append(text_nodes_paths[i])
            word_nodes_parent_type.append(isparentlink)
    self.__text_nodes = text_nodes
    self.__text_nodes_paths = text_nodes_paths
    # find the sequence that most probably is the name and the price
    firstlastword = self.__find_name_price_sequence(
        word_nodes, title_words, word_nodes_parent_type)
    # find nodes that contain the found sequence
    namepricenodes = []
    namepricenodesstrings = []
    for i in range(firstlastword[0], firstlastword[1]):
        namepricenodes.append(word_nodes_paths[i])
        #print(word_nodes_paths[i], word_nodes[i])
    # for i in range(firstlastword[0]-10,firstlastword[1]+10):
    #     print(word_nodes[i],word_values[i])
    # De-duplicate the paths; going through a set does not keep their order.
    namepricenodes = list(set(namepricenodes))
    for x in namepricenodes:
        print(x)
        namepricenodesstrings.append(tree.xpath(x + '/text()')[0].strip())
        print(x, namepricenodesstrings[-1])
    namenodes = []
    pricenodes = []
    pricesfound = 0
    # i is read again after the loop below; this default covers an empty loop
    i = 0
    # pair the information with the node
    for i in range(0, len(namepricenodes)):
        node = namepricenodesstrings[i]
        nodewords = node.split()
        # compared against the original-case title words
        if (set(nodewords) & set(title_words_orig)):
            namenodes.append(namepricenodes[i])
            print('product name (part):', node)
        else:
            # NOTE(review): if this method sits inside a class body, the name
            # __currency_regex_symbols is subject to private name mangling -
            # verify that it resolves.
            if (set(nodewords) & 
                    set(__currency_regex_symbols)):
                print('currency:', node)
                pricenodes.append(namepricenodes[i])
            # NOTE(review): a node may be appended to pricenodes more than once
            # (currency hit above plus a hit below; the regex branch has no break).
            for x in nodewords:
                if (is_number(x)):
                    print('price:', node)
                    pricenodes.append(namepricenodes[i])
                    pricesfound += 1
                    break
                if re.match(pricecurr_regex, x) or re.match(
                        currprice_regex, x):
                    print('currency & price:', node)
                    pricenodes.append(namepricenodes[i])
                    pricesfound += 1
    # If the one price is found, take a look if it's not followed by some other prices
    # (i is the last index visited by the loop above, or 0 if it did not run)
    if pricesfound == 1:
        other_pricecurrnodes = self.__find_prices(
            2, i, MAX_PRICE_PRICE_DISTANCE)
        if other_pricecurrnodes[0]:
            pricenodes.extend(other_pricecurrnodes[0])
            pricesfound += other_pricecurrnodes[1]
    # If there are a few price nodes, choose the one with the lowest price as the correct one!
    if pricesfound > 1:
        pricenodes = self.__find_cheapest_price(pricenodes)
    print('product name:' + str(namenodes))
    print('product price:' + str(pricenodes))
    # if pricenode list empty - look for it somewhere further
    if not pricenodes: # = is empty?
        pricenodes = self.__find_price_further(namenodes[0])
    self.namenodes = namenodes
    self.pricenodes = pricenodes
    return [namenodes, pricenodes]