def execute(self, params):
    '''
    Extract product-name values from a product page.

    Parameters:
        params[0] - page content (HTML string)
        params[1] - name extraction tuples (Python-literal string)
        params[2] - price extraction tuples (Python-literal string)
    Returns:
        name price_currency is_current_price is_lowest_price
        is_highest_price is_old_price is_vip_price
        (optional) current_price lowest_price highest_price
        old_price vip_price
    '''
    # BUG FIX: the original re-read `params = translateInputParameters(sys.argv)`
    # here, silently discarding the `params` argument the caller passed in.
    page_content = params[0]
    page_tree = xpathops.content2tree(page_content)
    page_dict = {"tree": page_tree}
    # The tuple lists arrive serialized as Python literals; literal_eval is
    # the safe (non-executing) way to parse them.
    name_extraction_tuples_list = ast.literal_eval(params[1])
    # NOTE(review): parsed but not used below - presumably consumed by a later
    # revision of this method; kept for parity with the documented interface.
    price_extraction_tuples_list = ast.literal_eval(params[2])
    values_extractor = ExtractValues(page_dict)
    default_verifier = defaultverifier()
    values_extractor.extract(name_extraction_tuples_list, default_verifier)
def __fix_namenode_basketnode_lists(self,basketnode_list,page_dict_list,contents_list): namenode_list = [] # non-repeating title words picking title_words_perpage = self.__get_nonrepeating_title_words(contents_list) do,basketnode_list,mostcommon0 = self.__eval_basketnodes2(basketnode_list) for i,(basketpath,page_dict) in enumerate(zip(basketnode_list,page_dict_list)): page_dict["tree"] = xpathops.content2tree(page_dict.get("content")) if do == True: if basketpath == mostcommon0: pass else: node = page_dict["tree"].xpath(mostcommon0) if node is not None: # probably basket node, right? :) basketnode_list[i] = mostcommon0 else: basketnode_list[i] = None namenode_classifier = NameNodeClassifier() if namenode_classifier.classify(page_dict, params=title_words_perpage[i]): namenode = namenode_classifier.getNodePath() print i, namenode namenode_list.append(namenode) else: namenode_list.append(None) del page_dict["tree"] return namenode_list,basketnode_list
def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
    '''
    contents_list - content of n pages of the site offering product
    '''
    self.__contents_list = contents_list
    # Parse every page once up front.
    trees_list = []
    for content in contents_list:
        trees_list.append(xpathops.content2tree(content))
    self.__trees_list = trees_list
    self.__namenode_list = namenode_list
    self.__buttonnode_list = buttonnode_list
    self.__basketnode_list = basketnode_list
    # Text nodes of the first (primary) page, kept separately for quick access.
    self.__prim_text_nodes, self.__prim_text_nodes_paths = \
        xpathops.tree2textnodeslist(trees_list[0])
    # Text nodes and their xpaths for all pages, in page order.
    node_path_pairs = [xpathops.tree2textnodeslist(tree) for tree in trees_list]
    self.__text_nodes_list = [pair[0] for pair in node_path_pairs]
    self.__text_nodes_paths_list = [pair[1] for pair in node_path_pairs]
def create_list(urllist): page_dict_list = [] for url in urllist: print url try: content = requests.get(url).text except requests.exceptions.ConnectionError(): continue page_dict = {"url": url, "content": content, "tree": xpathops.content2tree(content), "lang": "en"} classifier = ProductOfferingPageClassifier() classifier.classify(page_dict) ecom_features = classifier.getFeaturesVals() buttonnode, basketnode = classifier.getExtractedNodes() page_dict["buttonnode"] = buttonnode page_dict["basketnode"] = basketnode page_dict["ecom_features"] = ecom_features page_dict_list.append(page_dict) return page_dict_list
def create_list(urllist): page_dict_list = [] for url in urllist: print url try: content = requests.get(url).text except requests.exceptions.ConnectionError(): continue page_dict = { "url": url, "content": content, "tree": xpathops.content2tree(content), "lang": "en" } classifier = ProductOfferingPageClassifier() classifier.classify(page_dict) ecom_features = classifier.getFeaturesVals() buttonnode, basketnode = classifier.getExtractedNodes() page_dict["buttonnode"] = buttonnode page_dict["basketnode"] = basketnode page_dict["ecom_features"] = ecom_features page_dict_list.append(page_dict) return page_dict_list
def execute(self, params): ''' Parameters: 1 pagecontent 2 objectofextraction ("ecom_product_name","ecom_product_price") 3 extractiontuples Returns: successfulextractiontuple [extractedobjname extractedobjvalue] ... ''' #params = translateInputParameters(sys.argv) #processing = processing_on() page_content = params[0] page_tree = xpathops.content2tree(page_content) # page_content = page_content.replace(unichr(163),'$') # print lxml.html.tostring(page_tree) # print page_content # print page_tree.getpath(page_tree.xpath(u"//*[contains(.,'25')]")[-1]) # print page_tree.xpath('/html/body/section[7]/div[1]//text()') page_dict = {"tree": page_tree} extraction_obj_name = params[1] extraction_tuples_list = params[2] if type(extraction_tuples_list) is tuple: extraction_tuples_list = [extraction_tuples_list] returnlist = [] if len(extraction_tuples_list) > 1 and isinstance( extraction_tuples_list[1], (int, long)): # that means tuple was given as a list extraction_tuples_list = [extraction_tuples_list] if len(extraction_tuples_list) > 0: values_extractor = ExtractValues(page_dict) verifier = extracthelpersfactory.verifierFactoryMethod( extraction_obj_name) miner = extracthelpersfactory.minerFactoryMethod( extraction_obj_name) describedvalues_list, successful_extraction_tuple = values_extractor.extract( extraction_tuples_list, verifier, miner) returnlist.append(successful_extraction_tuple) for nameval in describedvalues_list: returnlist.append(nameval[0]) returnlist.append(nameval[1]) # processing_off(processing) # print translateReturnValues(returnlist) # return translateReturnValues(returnlist) return returnlist
def __get_pages_trees(self):
    '''Return the parsed tree of every page, parsing and caching lazily.'''
    trees = []
    for page_dict in self.__page_dict_list:
        if page_dict.get("tree") is None:
            # Parse once and cache on the page dict for later reuse.
            page_dict["tree"] = xpathops.content2tree(page_dict.get("content"))
        trees.append(page_dict["tree"])
    return trees
def __get_title(self, code): #f = io.StringIO(unicode(code, 'utf-8').encode('utf-8')) self.tree = xpathops.content2tree(code) titleElements = self.tree.xpath('/html/head//title/text()') if len(titleElements) > 0: titleText = titleElements[0].strip() titleText = filter(lambda x: x in string.printable, titleText) print titleText else: titleText = None return titleText
def execute(self,params): ''' Parameters: 1 pagecontent 2 objectofextraction ("ecom_product_name","ecom_product_price") 3 extractiontuples Returns: successfulextractiontuple [extractedobjname extractedobjvalue] ... ''' #params = translateInputParameters(sys.argv) #processing = processing_on() page_content = params[0] page_tree = xpathops.content2tree(page_content) # page_content = page_content.replace(unichr(163),'$') # print lxml.html.tostring(page_tree) # print page_content # print page_tree.getpath(page_tree.xpath(u"//*[contains(.,'25')]")[-1]) # print page_tree.xpath('/html/body/section[7]/div[1]//text()') page_dict = {"tree":page_tree} extraction_obj_name = params[1] extraction_tuples_list = params[2] if type(extraction_tuples_list) is tuple: extraction_tuples_list = [extraction_tuples_list] returnlist = [] if len(extraction_tuples_list) > 1 and isinstance(extraction_tuples_list[1],(int,long)): # that means tuple was given as a list extraction_tuples_list = [extraction_tuples_list] if len(extraction_tuples_list) > 0: values_extractor = ExtractValues(page_dict) verifier = extracthelpersfactory.verifierFactoryMethod(extraction_obj_name) miner = extracthelpersfactory.minerFactoryMethod(extraction_obj_name) describedvalues_list, successful_extraction_tuple = values_extractor.extract(extraction_tuples_list, verifier, miner) returnlist.append(successful_extraction_tuple) for nameval in describedvalues_list: returnlist.append(nameval[0]) returnlist.append(nameval[1]) # processing_off(processing) # print translateReturnValues(returnlist) # return translateReturnValues(returnlist) return returnlist
def classify(self, page_dict, params=None):
    '''
    Classify `page_dict`, lazily building its parse tree when absent.

    Parameters:
        page_dict - dict with at least "content"; "tree" is added if missing
        params    - optional hints forwarded to self.find()
    Returns:
        True on a positive classification, None otherwise (callers test
        truthiness, so the True/None contract is preserved).
    '''
    self.certainty = None
    self.nodepath = None
    self.features = None
    # FIX: identity comparison with None (`is None`) instead of `== None`.
    if page_dict.get("tree") is None:
        page_dict["tree"] = content2tree(page_dict.get("content"))
    self.page_dict = page_dict
    if self.find(params):
        return True
    return None
def execute(self,params):
    '''
    Find a publication date in page metadata.

    params[0] - HTML content, params[1] - page URL.
    Returns [year, month, day] when a date is found, else an empty list.
    '''
    html, url = params[0], params[1]
    tree = xpathops.content2tree(html)
    # Candidate strings: <meta content="..."> plus <time datetime="...">.
    candidates = tree.xpath("/html/head//meta/@content")
    candidates.extend(tree.xpath("/html/body//time/@datetime"))
    publishdate = DateSearch().getdate(candidates, url)
    returnlist = []
    if publishdate is not None:
        returnlist = [publishdate.year, publishdate.month, publishdate.day]
    self.logger.debug(returnlist)
    return returnlist
def execute(self, params):
    '''
    Find a publication date in the page's metadata.

    params[0] - HTML content, params[1] - page URL.
    Returns [year, month, day] when a date is found, else an empty list.
    '''
    html = params[0]
    url = params[1]
    tree = xpathops.content2tree(html)
    # Collect date candidates from <meta> content and <time> datetime attrs.
    metacontentvals = (tree.xpath("/html/head//meta/@content") +
                       tree.xpath("/html/body//time/@datetime"))
    search = DateSearch()
    publishdate = search.getdate(metacontentvals, url)
    if publishdate is None:
        returnlist = []
    else:
        returnlist = [publishdate.year, publishdate.month, publishdate.day]
    self.logger.debug(returnlist)
    return returnlist
def categorize(self, domain_url):
    '''
    Scrape similarweb.com for a domain's category and top country.

    Parameters:
        domain_url - bare domain name appended to the similarweb URL
    Returns:
        [category_strings, country_strings] (xpath result lists, possibly
        empty) or None when the page could not be fetched.
    '''
    content = None
    page = requests.get('http://www.similarweb.com/website/'+domain_url)
    if page.status_code == requests.codes.ok:
        content = page.text
    if not content:
        return None
    # FIX: removed an unprofessional no-op string literal that was
    # evaluated and discarded here; replaced with a real comment.
    # Screen-scraping: these absolute xpaths are tied to the current
    # similarweb page layout and will silently return [] if it changes.
    tree = xpathops.content2tree(content)
    category_string = tree.xpath("//*[@id=\"overview\"]/div[2]/div[1]/div/div[1]/div[2]/ul/li[3]/div/h3/span/a/text()")
    country_string = tree.xpath("//*[@id=\"geo-countries-accordion\"]/div[1]/div[1]/span/h3/text()")
    return [category_string, country_string]
def classify(self,page_dict,params=None):
    '''
    Classify `page_dict`, lazily building its parse tree when absent.

    Parameters:
        page_dict - dict with at least "content"; "tree" is added if missing
        params    - optional hints forwarded to self.find()
    Returns:
        True on a positive classification, None otherwise.
    '''
    self.certainty = None
    self.nodepath = None
    self.features = None
    retval = None
    # FIX: identity comparison with None (`is None`) instead of `== None`;
    # also dropped a stale commented-out line that reset the tree.
    if page_dict.get("tree") is None:
        page_dict["tree"] = content2tree(page_dict.get("content"))
    self.page_dict = page_dict
    if self.find(params):
        retval = True
    return retval
def prepareSitePagesContent(self,fqdn):
    '''
    fqdn = fully qualified domain name https://kb.iu.edu/d/aiuv

    Crawl article URLs of the domain and return a list of page dicts
    {"url", "content", "tree"}. Pages that fail to download or parse
    are skipped (best-effort per page).
    '''
    pages_dict_list = []
    url_list = self.crawl_multiple_article_of_domain(fqdn)
    for url in url_list:
        try:
            content = requests.get(url).text
            tree = xpathops.content2tree(content)
            page_dict = {"url":url,"content":content,"tree":tree}
            pages_dict_list.append(page_dict)
        # FIX: narrowed the bare `except:` - it also swallowed
        # KeyboardInterrupt/SystemExit. Still deliberately broad so one
        # bad page cannot abort the whole crawl.
        except Exception:
            pass
    return pages_dict_list
def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
    '''
    contents_list - content of n pages of the site offering product
    '''
    self.__contents_list = contents_list
    # One parsed tree per page, in page order.
    self.__trees_list = trees_list = [
        xpathops.content2tree(page_content) for page_content in contents_list
    ]
    self.__namenode_list = namenode_list
    self.__buttonnode_list = buttonnode_list
    self.__basketnode_list = basketnode_list
    # Primary page (index 0) text nodes are kept separately.
    prim_pair = xpathops.tree2textnodeslist(trees_list[0])
    self.__prim_text_nodes = prim_pair[0]
    self.__prim_text_nodes_paths = prim_pair[1]
    self.__text_nodes_list = []
    self.__text_nodes_paths_list = []
    for page_tree in trees_list:
        nodes, node_paths = xpathops.tree2textnodeslist(page_tree)
        self.__text_nodes_list.append(nodes)
        self.__text_nodes_paths_list.append(node_paths)
def execute(self,params):
    '''
    Extract product-name values from a product page.

    Parameters:
        params[0] - page content (HTML string)
        params[1] - name extraction tuples (Python-literal string)
        params[2] - price extraction tuples (Python-literal string)
    Returns:
        name price_currency is_current_price is_lowest_price
        is_highest_price is_old_price is_vip_price
        (optional) current_price lowest_price highest_price
        old_price vip_price
    '''
    # BUG FIX: removed `params = translateInputParameters(sys.argv)`,
    # which silently discarded the `params` argument the caller passed in.
    page_content = params[0]
    page_tree = xpathops.content2tree(page_content)
    page_dict = {"tree":page_tree}
    # The tuple lists arrive as serialized Python literals.
    name_extraction_tuples_list = ast.literal_eval(params[1])
    # NOTE(review): parsed but unused below - kept for interface parity.
    price_extraction_tuples_list = ast.literal_eval(params[2])
    values_extractor = ExtractValues(page_dict)
    default_verifier = defaultverifier()
    values_extractor.extract(name_extraction_tuples_list, default_verifier)
def categorize(self, domain_url):
    '''
    Look up a domain's category and top country on similarweb.com.

    Parameters:
        domain_url - bare domain name appended to the similarweb URL
    Returns:
        [category_strings, country_strings] (xpath result lists, possibly
        empty) or None when the page could not be fetched.
    '''
    content = None
    page = requests.get('http://www.similarweb.com/website/' + domain_url)
    if page.status_code == requests.codes.ok:
        content = page.text
    if not content:
        return None
    # FIX: removed an unprofessional no-op string literal that was
    # evaluated and discarded here.
    # Screen-scraping: absolute xpaths below are coupled to the current
    # similarweb layout and will silently return [] if it changes.
    tree = xpathops.content2tree(content)
    category_string = tree.xpath(
        "//*[@id=\"overview\"]/div[2]/div[1]/div/div[1]/div[2]/ul/li[3]/div/h3/span/a/text()"
    )
    country_string = tree.xpath(
        "//*[@id=\"geo-countries-accordion\"]/div[1]/div[1]/span/h3/text()"
    )
    return [category_string, country_string]
def execute(self,params): ''' Parameters: [url content lang ecom_features buttonnode basketnode] [url content lang ecom_features buttonnode basketnode] ... Returns: nameextractiontuples priceextractiontuples ''' returnlist = [] #params = translateInputParameters(sys.argv) page_dict_list = [] for i in range(len(params)/6): url_index = i*6 content_index = url_index+1 lang_index = url_index+2 features_index = url_index+3 button_index = url_index+4 basket_index = url_index+5 page_dict = { "url": params[url_index], "content": params[content_index], "lang": params[lang_index], "ecom_features": params[features_index], "buttonnode": params[button_index], "basketnode": params[basket_index] } page_dict["tree"] = xpathops.content2tree(page_dict.get("content")) page_dict_list.append(page_dict) correct_clustering = True clusterPages = ClusterProductCategoryPages() for K in range(5,1,-1): clusterPages.setK(K) print "KMeans: "+str(K) clusterPages.cluster_test(page_dict_list)
def __init__(self, url, code):
    '''
    Constructor: keep the raw page source and its parsed tree.

    NOTE(review): `url` is accepted but not stored here - presumably used
    by subclasses or later revisions; confirm before removing.
    '''
    parsed_tree = xpathops.content2tree(code)
    self.__code = code
    self.__tree = parsed_tree
'''
import ast
import requests

from eu.leads.infext.python.ops import xpathops
from eu.leads.infext.python.__temp.FileIterator import FileIterator

# Ad-hoc test script: fetch each URL listed in a local file and print the
# page title and its parent xpath.
if __name__ == '__main__':
    fi = FileIterator(
        "/home/nonlinear/workspace/leadsdm/eu/leads/infext/python/__temp/files/www.amazon.com"
    )
    for line in fi.getLine():
        # Skip commented-out URL lines.
        if line.startswith("#") or line.startswith("//"):
            continue
        url = line[:-1]  # drop the trailing newline character
        content = requests.get(url).text
        print url
        tree = xpathops.content2tree(content)
        print len(content)
        titleNodes = tree.xpath('//title/text()')
        if len(titleNodes) > 0:
            titleText = titleNodes[0].strip()
            print titleText
            titlePath = xpathops.element2parentpath(tree, titleNodes[0])
            print titlePath
        else:
            print "NO TITLE"
        print "---"
def classify_page(self,page_dict,is_domain_verified):
    '''
    Dispatch a page to article / comment / multiple-article extraction.

    Parameters:
        page_dict          - dict with "url" and "content"
        is_domain_verified - True/False/None; None means the domain's
                             verification state is unknown
    Side effects: runs extraction via self.extr, exports URLs via
    ExportAlt, and stores the (re)classified type on self.pagetype.
    '''
    export = ExportAlt()
    url = page_dict.get("url")
    content = page_dict.get("content")
    tree = xpathops.content2tree(content)
    self.pagetype = ''
    # SDAlgorithm analyzes the tree; analysis_result[0] is the page type,
    # [1]/[2]/[3] carry type-specific payloads (as used below).
    sd = SDAlgorithm()
    sd.tree = tree
    analysis_result = sd.analyze_page()
    for x in analysis_result[3]:
        print "FULL TEXT:"
        print x.full_text
    if analysis_result[0] == 'article':
        isarticle = self.verify.verify_article(url, content, analysis_result[1])
        if isarticle:
            self.extr.extract_article(url)
            export.export_urls(url, 'proper', 'article', analysis_result[1])
        elif not isarticle:
            # Verification failed: reclassify, then extract with the best
            # known schema (after analyzing the domain if not yet verified).
            self.pagetype = self.reclassify()
            export.export_urls(url, 'improper', 'article', analysis_result[1])
            if is_domain_verified is not None:
                if is_domain_verified:
                    self.extr.extract_with_best_schema(url, self.pagetype)
                elif not is_domain_verified:
                    analysis = DomainAnalysis()
                    analysis.process_domain(url)
                    self.extr.extract_with_best_schema(url, self.pagetype)
    elif analysis_result[0] == 'comment':
        # NOTE(review): verify_article is called with one argument here but
        # with three in the article branch - likely a latent bug; confirm
        # the expected signature.
        isarticle = self.verify.verify_article(analysis_result[1])
        iscomments = self.verify.verify_comments(analysis_result[2], tree)
        if isarticle and iscomments:
            self.extr.extract_article(url)
            self.extr.extract_comment(analysis_result[2], url)
            export.export_urls(url, 'proper', 'comment', analysis_result[1], analysis_result[2])
        else:
            self.pagetype = self.reclassify('comment')
            export.export_urls(url, 'improper', 'comment', analysis_result[1], analysis_result[2])
            if is_domain_verified is not None:
                if not is_domain_verified:
                    analysis = DomainAnalysis()
                    analysis.process_domain(url)
                # NOTE(review): reconstructed from flattened source -
                # extraction runs whenever verification state is known,
                # matching the article branch; confirm original indentation.
                self.extr.extract_with_best_schema(url, self.pagetype)
    elif analysis_result[0] == 'multiple':
        ismultiple = self.verify.verify_multiple_articles(analysis_result[3], url, tree, content)
        if ismultiple:
            self.extr.extract_multiple_article(analysis_result[3], url)
            export.export_urls(url, 'proper', 'multiple', None, analysis_result[3])
        else:
            self.pagetype = self.reclassify('multiple')
            export.export_urls(url, 'improper', 'multiple', None, analysis_result[3])
            if is_domain_verified is not None:
                if not is_domain_verified:
                    analysis = DomainAnalysis()
                    analysis.process_domain(url)
                # NOTE(review): same reconstructed indentation as above.
                self.extr.extract_with_best_schema(url, self.pagetype)
@author: nonlinear
'''
import ast
import requests

from eu.leads.infext.python.ops import xpathops
from eu.leads.infext.python.__temp.FileIterator import FileIterator

# Ad-hoc test script: fetch each URL listed in a local file and print the
# page title and its parent xpath.
if __name__ == '__main__':
    fi = FileIterator("/home/nonlinear/workspace/leadsdm/eu/leads/infext/python/__temp/files/www.amazon.com")
    for line in fi.getLine():
        # Skip commented-out URL lines.
        if line.startswith("#") or line.startswith("//"):
            continue
        url = line[:-1]  # drop the trailing newline character
        content = requests.get(url).text
        print url
        tree = xpathops.content2tree(content)
        print len(content)
        titleNodes = tree.xpath('//title/text()')
        if len(titleNodes)>0:
            titleText = titleNodes[0].strip()
            print titleText
            titlePath = xpathops.element2parentpath(tree, titleNodes[0])
            print titlePath
        else:
            print "NO TITLE"
        print "---"