def execute(self, params):
        '''
        Extract product name (and eventually price) values from one page.

        Parameters:
        1 page content
        2 nameextractiontuples
        3 pricextractiontuples
        
        Returns:
        name
        price_currency
        is_current_price is_lowest_price is_highest_price is_old_price is_vip_price
        (optional) current_price
        (optional) lowest_price
        (optional) highest_price
        (optional) old_price
        (optional) vip_price
        '''
        #sys.path.insert(0,'/home/lequocdo/workspace/leadsdm')
        #print sys.path

        # NOTE(review): this overwrites the `params` argument with values
        # parsed from sys.argv; sibling execute() implementations keep the
        # same line commented out -- confirm whether command-line invocation
        # is really intended here.
        params = translateInputParameters(sys.argv)

        page_content = params[0]
        page_tree = xpathops.content2tree(page_content)
        page_dict = {"tree": page_tree}

        # extraction tuples arrive serialized as string literals; parse safely
        name_extraction_tuples_list = ast.literal_eval(params[1])
        price_extraction_tuples_list = ast.literal_eval(params[2])

        values_extractor = ExtractValues(page_dict)
        default_verifier = defaultverifier()
        # NOTE(review): price_extraction_tuples_list is parsed but never used
        # in the visible code.
        values_extractor.extract(name_extraction_tuples_list, default_verifier)
 def __fix_namenode_basketnode_lists(self,basketnode_list,page_dict_list,contents_list):
     namenode_list = []
     
     # non-repeating title words picking
     title_words_perpage = self.__get_nonrepeating_title_words(contents_list)
     
     do,basketnode_list,mostcommon0 = self.__eval_basketnodes2(basketnode_list)
     
     for i,(basketpath,page_dict) in enumerate(zip(basketnode_list,page_dict_list)):
         page_dict["tree"] = xpathops.content2tree(page_dict.get("content"))
         
         if do == True:
             if basketpath == mostcommon0:
                 pass
             else:
                 node = page_dict["tree"].xpath(mostcommon0)
                 if node is not None:
                     # probably basket node, right? :)
                     basketnode_list[i] = mostcommon0
                 else:
                     basketnode_list[i] = None         
         
         namenode_classifier = NameNodeClassifier() 
         if namenode_classifier.classify(page_dict, params=title_words_perpage[i]):
             namenode = namenode_classifier.getNodePath()
             print i, namenode
             namenode_list.append(namenode)
         else:
             namenode_list.append(None)
             
         del page_dict["tree"]
             
     return namenode_list,basketnode_list
示例#3
0
    def __init__(self,
                 contents_list,
                 namenode_list,
                 buttonnode_list=None,
                 basketnode_list=None):
        '''
        Parse every page once and pre-compute per-page text nodes.

        contents_list - content of n pages of the site offering product
        '''
        trees_list = [xpathops.content2tree(content) for content in contents_list]

        self.__contents_list = contents_list
        self.__trees_list = trees_list
        self.__namenode_list = namenode_list
        self.__buttonnode_list = buttonnode_list
        self.__basketnode_list = basketnode_list

        # the first page serves as the primary reference
        self.__prim_text_nodes, self.__prim_text_nodes_paths = \
            xpathops.tree2textnodeslist(trees_list[0])

        self.__text_nodes_list = []
        self.__text_nodes_paths_list = []
        for single_tree in trees_list:
            nodes, node_paths = xpathops.tree2textnodeslist(single_tree)
            self.__text_nodes_list.append(nodes)
            self.__text_nodes_paths_list.append(node_paths)
def create_list(urllist):
    page_dict_list = []
    
    for url in urllist:
        print url
        try:
            content = requests.get(url).text
        except requests.exceptions.ConnectionError():
            continue
        page_dict = {"url": url,
                     "content": content,
                     "tree": xpathops.content2tree(content),
                     "lang": "en"}
        
        classifier = ProductOfferingPageClassifier()
        classifier.classify(page_dict)
        
        ecom_features = classifier.getFeaturesVals()
        buttonnode, basketnode = classifier.getExtractedNodes()
        
        page_dict["buttonnode"] = buttonnode
        page_dict["basketnode"] = basketnode
        page_dict["ecom_features"] = ecom_features
        
        page_dict_list.append(page_dict)
        
    return page_dict_list
示例#5
0
def create_list(urllist):
    page_dict_list = []

    for url in urllist:
        print url
        try:
            content = requests.get(url).text
        except requests.exceptions.ConnectionError():
            continue
        page_dict = {
            "url": url,
            "content": content,
            "tree": xpathops.content2tree(content),
            "lang": "en"
        }

        classifier = ProductOfferingPageClassifier()
        classifier.classify(page_dict)

        ecom_features = classifier.getFeaturesVals()
        buttonnode, basketnode = classifier.getExtractedNodes()

        page_dict["buttonnode"] = buttonnode
        page_dict["basketnode"] = basketnode
        page_dict["ecom_features"] = ecom_features

        page_dict_list.append(page_dict)

    return page_dict_list
    def execute(self, params):
        '''
        Parameters:
        1 pagecontent
        2 objectofextraction ("ecom_product_name","ecom_product_price")
        3 extractiontuples
        
        Returns:
        successfulextractiontuple
        [extractedobjname
        extractedobjvalue]
        ...
        '''

        #params = translateInputParameters(sys.argv)
        #processing = processing_on()

        page_content = params[0]
        page_tree = xpathops.content2tree(page_content)
        #    page_content = page_content.replace(unichr(163),'$')
        #     print lxml.html.tostring(page_tree)
        #     print page_content
        #     print page_tree.getpath(page_tree.xpath(u"//*[contains(.,'25')]")[-1])
        #     print page_tree.xpath('/html/body/section[7]/div[1]//text()')
        page_dict = {"tree": page_tree}

        extraction_obj_name = params[1]
        extraction_tuples_list = params[2]
        # normalize a single tuple to a one-element list
        if type(extraction_tuples_list) is tuple:
            extraction_tuples_list = [extraction_tuples_list]

        returnlist = []

        # heuristic: if element [1] is numeric the caller flattened one
        # extraction tuple into a plain list -- wrap it again
        if len(extraction_tuples_list) > 1 and isinstance(
                extraction_tuples_list[1], (int, long)):
            # that means tuple was given as a list
            extraction_tuples_list = [extraction_tuples_list]

        if len(extraction_tuples_list) > 0:

            values_extractor = ExtractValues(page_dict)
            # verifier/miner implementations are chosen per extraction object
            verifier = extracthelpersfactory.verifierFactoryMethod(
                extraction_obj_name)
            miner = extracthelpersfactory.minerFactoryMethod(
                extraction_obj_name)

            describedvalues_list, successful_extraction_tuple = values_extractor.extract(
                extraction_tuples_list, verifier, miner)

            # flatten to [successful_tuple, name, value, name, value, ...]
            returnlist.append(successful_extraction_tuple)
            for nameval in describedvalues_list:
                returnlist.append(nameval[0])
                returnlist.append(nameval[1])

        # processing_off(processing)
        # print translateReturnValues(returnlist)
        # return translateReturnValues(returnlist)
        return returnlist
 def __get_pages_trees(self):
     '''Return the lxml tree of every page, parsing and caching on demand.'''
     parsed_trees = []
     for page_dict in self.__page_dict_list:
         if page_dict.get("tree") is None:
             # lazily parse the raw content and cache it on the dict
             page_dict["tree"] = xpathops.content2tree(page_dict.get("content"))
         parsed_trees.append(page_dict["tree"])
     return parsed_trees
 def __get_title(self, code):
     #f = io.StringIO(unicode(code, 'utf-8').encode('utf-8'))
     self.tree = xpathops.content2tree(code)
     titleElements = self.tree.xpath('/html/head//title/text()')
     if len(titleElements) > 0:
         titleText = titleElements[0].strip()
         titleText = filter(lambda x: x in string.printable, titleText)
         print titleText
     else:
         titleText = None
     return titleText
    def execute(self, params):
        '''
        Parameters:
        1 pagecontent
        2 objectofextraction ("ecom_product_name","ecom_product_price")
        3 extractiontuples

        Returns:
        successfulextractiontuple
        [extractedobjname
        extractedobjvalue]
        ...
        '''
        page_content = params[0]
        page_tree = xpathops.content2tree(page_content)
        page_dict = {"tree": page_tree}

        extraction_obj_name = params[1]
        extraction_tuples_list = params[2]
        # normalize a single tuple to a one-element list
        if type(extraction_tuples_list) is tuple:
            extraction_tuples_list = [extraction_tuples_list]

        returnlist = []

        # BUGFIX: this guard line was indented with a mixed space/tab prefix,
        # which breaks under `python -tt` and Python 3; re-indented with
        # spaces.  Heuristic: if element [1] is numeric, the caller flattened
        # one extraction tuple into a plain list -- wrap it again.
        if len(extraction_tuples_list) > 1 and isinstance(extraction_tuples_list[1], (int, long)):
            extraction_tuples_list = [extraction_tuples_list]

        if len(extraction_tuples_list) > 0:

            values_extractor = ExtractValues(page_dict)
            # verifier/miner implementations are chosen per extraction object
            verifier = extracthelpersfactory.verifierFactoryMethod(extraction_obj_name)
            miner = extracthelpersfactory.minerFactoryMethod(extraction_obj_name)

            describedvalues_list, successful_extraction_tuple = values_extractor.extract(extraction_tuples_list, verifier, miner)

            # flatten to [successful_tuple, name, value, name, value, ...]
            returnlist.append(successful_extraction_tuple)
            for nameval in describedvalues_list:
                returnlist.append(nameval[0])
                returnlist.append(nameval[1])

        return returnlist
    def classify(self, page_dict, params=None):
        '''
        Classify *page_dict*, parsing its "content" into a "tree" on demand.

        Stores the page dict on self, resets the previous result fields and
        delegates to self.find().  Returns True on a positive classification
        and None otherwise (callers use the result in boolean context).
        '''
        self.certainty = None
        self.nodepath = None
        self.features = None

        # BUGFIX (idiom): identity test instead of `== None` -- equality can
        # be intercepted by overloaded comparison on tree objects, and
        # identity is the correct check here anyway.
        if page_dict.get("tree") is None:
            page_dict["tree"] = content2tree(page_dict.get("content"))

        self.page_dict = page_dict

        if self.find(params):
            return True
        else:
            return None
 def execute(self, params):
     '''
     Extract a publication date from page metadata.

     params: [html, url].  Returns [year, month, day] when a date was
     found, otherwise an empty list.
     '''
     html, url = params[0], params[1]
     tree = xpathops.content2tree(html)
     # candidate date strings: <meta content> and <time datetime> attributes
     candidates = tree.xpath("/html/head//meta/@content")
     candidates.extend(tree.xpath("/html/body//time/@datetime"))
     publishdate = DateSearch().getdate(candidates, url)
     returnlist = []
     if publishdate is not None:
         returnlist = [publishdate.year, publishdate.month, publishdate.day]
     self.logger.debug(returnlist)
     return returnlist
示例#12
0
 def execute(self, params):
     '''
     Look for a publication date in <meta content> and <time datetime>
     attributes of the page; return [year, month, day] or [] if none.
     '''
     page_html = params[0]
     page_url = params[1]
     doc = xpathops.content2tree(page_html)
     date_candidates = doc.xpath("/html/head//meta/@content")
     date_candidates.extend(doc.xpath("/html/body//time/@datetime"))
     searcher = DateSearch()
     found = searcher.getdate(date_candidates, page_url)
     result = []
     if found is not None:
         result.append(found.year)
         result.append(found.month)
         result.append(found.day)
     self.logger.debug(result)
     return result
 def categorize(self, domain_url):
     '''
     Scrape similarweb.com for *domain_url*'s category and leading country.

     Returns [category_string, country_string] (each an xpath result list)
     on success, or None when the page could not be fetched.
     '''
     page = requests.get('http://www.similarweb.com/website/'+domain_url)
     if page.status_code != requests.codes.ok:
         return None
     content = page.text
     if not content:
         return None
     tree = xpathops.content2tree(content)
     category_string = tree.xpath("//*[@id=\"overview\"]/div[2]/div[1]/div/div[1]/div[2]/ul/li[3]/div/h3/span/a/text()")
     country_string = tree.xpath("//*[@id=\"geo-countries-accordion\"]/div[1]/div[1]/span/h3/text()")
     return [category_string, country_string]
    def classify(self, page_dict, params=None):
        '''
        Classify *page_dict*, building its lxml "tree" from "content" when
        missing, then delegate to self.find().

        Returns True on a positive classification, otherwise None (the
        result is intended for boolean context).
        '''
        self.certainty = None
        self.nodepath = None
        self.features = None

        retval = None

        # BUGFIX (idiom): identity test instead of `== None`, which may be
        # intercepted by overloaded equality on tree objects.
        if page_dict.get("tree") is None:
            page_dict["tree"] = content2tree(page_dict.get("content"))

        self.page_dict = page_dict

        if self.find(params):
            retval = True

        return retval
示例#15
0
 def prepareSitePagesContent(self, fqdn):
     '''
     fqdn = fully qualified domain name https://kb.iu.edu/d/aiuv

     Crawl several article URLs of the domain and return a list of
     {"url", "content", "tree"} dicts.  Pages that fail to download or
     parse are skipped (best-effort behaviour).
     '''
     pages_dict_list = []

     url_list = self.crawl_multiple_article_of_domain(fqdn)

     for url in url_list:
         try:
             content = requests.get(url).text
             tree = xpathops.content2tree(content)
             page_dict = {"url": url, "content": content, "tree": tree}
             pages_dict_list.append(page_dict)
         # BUGFIX: a bare `except:` also swallowed KeyboardInterrupt and
         # SystemExit; keep the best-effort skip but only for real errors.
         except Exception:
             pass

     return pages_dict_list
 def __init__(self, contents_list, namenode_list, buttonnode_list=None, basketnode_list=None):
     '''
     contents_list - content of n pages of the site offering product

     Parses each page once and pre-computes the text nodes (and their
     xpaths) per page; the first page doubles as the primary reference.
     '''
     self.__contents_list = contents_list

     trees_list = []
     for page_content in contents_list:
         trees_list.append(xpathops.content2tree(page_content))
     self.__trees_list = trees_list

     self.__namenode_list = namenode_list
     self.__buttonnode_list = buttonnode_list
     self.__basketnode_list = basketnode_list

     self.__prim_text_nodes, self.__prim_text_nodes_paths = xpathops.tree2textnodeslist(trees_list[0])

     self.__text_nodes_list = []
     self.__text_nodes_paths_list = []
     for parsed in trees_list:
         per_page_nodes, per_page_paths = xpathops.tree2textnodeslist(parsed)
         self.__text_nodes_list.append(per_page_nodes)
         self.__text_nodes_paths_list.append(per_page_paths)
 def execute(self,params):
     '''
     Extract product name (and eventually price) values from one page.

     Parameters:
     1 page content
     2 nameextractiontuples
     3 pricextractiontuples
     
     Returns:
     name
     price_currency
     is_current_price is_lowest_price is_highest_price is_old_price is_vip_price
     (optional) current_price
     (optional) lowest_price
     (optional) highest_price
     (optional) old_price
     (optional) vip_price
     '''
     #sys.path.insert(0,'/home/lequocdo/workspace/leadsdm')
     #print sys.path
     
     # NOTE(review): overwrites the `params` argument with sys.argv values;
     # sibling execute() implementations keep this line commented out --
     # confirm whether command-line invocation is intended here.
     params = translateInputParameters(sys.argv)
     
     page_content = params[0]
     page_tree = xpathops.content2tree(page_content)
     page_dict = {"tree":page_tree}
     
     # extraction tuples arrive serialized as string literals; parse safely
     name_extraction_tuples_list = ast.literal_eval(params[1])
     price_extraction_tuples_list = ast.literal_eval(params[2])
     
     values_extractor = ExtractValues(page_dict)
     default_verifier = defaultverifier()
     # NOTE(review): price_extraction_tuples_list is parsed but never used
     # in the visible code.
     values_extractor.extract(name_extraction_tuples_list, default_verifier)
     
     
     
     
     
     
     
     
     
     
示例#18
0
    def categorize(self, domain_url):
        '''
        Fetch the similarweb.com overview page for *domain_url* and scrape
        its category and leading-country strings.

        Returns [category_string, country_string] (xpath result lists) on
        success, otherwise None.
        '''
        page = requests.get('http://www.similarweb.com/website/' + domain_url)

        body = page.text if page.status_code == requests.codes.ok else None
        if not body:
            return None

        tree = xpathops.content2tree(body)
        category_string = tree.xpath(
            "//*[@id=\"overview\"]/div[2]/div[1]/div/div[1]/div[2]/ul/li[3]/div/h3/span/a/text()"
        )
        country_string = tree.xpath(
            "//*[@id=\"geo-countries-accordion\"]/div[1]/div[1]/span/h3/text()"
        )
        return [category_string, country_string]
示例#19
0
 def execute(self,params):
     '''
     Parameters:
     [url content lang ecom_features buttonnode basketnode] [url content lang ecom_features buttonnode basketnode] ...
     
     Returns:
     nameextractiontuples
     priceextractiontuples
     '''
     
     returnlist = []
 
     #params = translateInputParameters(sys.argv)
      
     # params is a flat list: every page contributes 6 consecutive entries
     page_dict_list = []
     for i in range(len(params)/6):
         url_index      = i*6
         content_index  = url_index+1
         lang_index     = url_index+2
         features_index = url_index+3
         button_index   = url_index+4
         basket_index   = url_index+5
         page_dict = { "url": params[url_index],
                       "content": params[content_index],
                       "lang": params[lang_index],
                       "ecom_features": params[features_index],
                       "buttonnode": params[button_index],
                       "basketnode": params[basket_index] }
         page_dict["tree"] = xpathops.content2tree(page_dict.get("content"))
         page_dict_list.append(page_dict) 
     
     # try progressively smaller K for KMeans clustering of the pages
     # NOTE(review): correct_clustering and returnlist are never updated in
     # the visible code; the loop only runs cluster_test for each K --
     # confirm whether this method is complete.
     correct_clustering = True
     clusterPages = ClusterProductCategoryPages()
     for K in range(5,1,-1):
         clusterPages.setK(K)
         print "KMeans: "+str(K)
         clusterPages.cluster_test(page_dict_list)
示例#20
0
 def __init__(self, url, code):
     '''
     Keep the raw page source and its parsed lxml tree.
     '''
     self.__tree = xpathops.content2tree(code)
     self.__code = code
 def __init__(self, url, code):
     '''
     Store the page source and build the DOM tree from it.
     '''
     self.__code = code
     self.__tree = xpathops.content2tree(self.__code)
'''
import ast
import requests
from eu.leads.infext.python.ops import xpathops
from eu.leads.infext.python.__temp.FileIterator import FileIterator

if __name__ == '__main__':
    # iterate over a local file listing one URL per line
    fi = FileIterator(
        "/home/nonlinear/workspace/leadsdm/eu/leads/infext/python/__temp/files/www.amazon.com"
    )
    for line in fi.getLine():
        # skip comment lines
        if line.startswith("#") or line.startswith("//"):
            continue
        # drop the trailing newline
        url = line[:-1]
        content = requests.get(url).text
        print url
        tree = xpathops.content2tree(content)
        print len(content)
        titleNodes = tree.xpath('//title/text()')
        if len(titleNodes) > 0:
            titleText = titleNodes[0].strip()
            print titleText
            # xpath of the <title> element's parent
            titlePath = xpathops.element2parentpath(tree, titleNodes[0])
            print titlePath
        else:
            print "NO TITLE"
        print "---"
#     table = [0.54433105, -0.81649658, -0.75842509, -0.75282172, -0.79788596, -0.58532848, 0.54433105]
#     print str(table)
#     string = '[0.54433105 -0.81649658 -0.75842509 -0.75282172 -0.79788596 -0.58532848 0.54433105]'
#     print ast.literal_eval(string)
示例#23
0
    def classify_page(self,page_dict,is_domain_verified):
        '''
        Run SD analysis on one page and dispatch extraction/export by the
        detected page type ('article', 'comment' or 'multiple').

        page_dict          - dict carrying at least "url" and "content"
        is_domain_verified - tri-state: None skips schema extraction; False
                             triggers a DomainAnalysis pass first
        '''
        export = ExportAlt()
        
        url = page_dict.get("url")
        content = page_dict.get("content")
        tree = xpathops.content2tree(content)
        self.pagetype = ''

        sd = SDAlgorithm()
        sd.tree = tree
        analysis_result = sd.analyze_page()
        
        # debug dump of the candidate article texts
        for x in analysis_result[3]:
            print "FULL TEXT:"
            print x.full_text

        if analysis_result[0] == 'article':
            isarticle = self.verify.verify_article(url, content, analysis_result[1])

            if isarticle:
                self.extr.extract_article(url)
                export.export_urls(url, 'proper', 'article', analysis_result[1])
            elif not isarticle:
                # verification failed: reclassify and fall back to the best
                # known extraction schema for the domain
                self.pagetype = self.reclassify()
                export.export_urls(url, 'improper', 'article', analysis_result[1])

                if is_domain_verified is not None:
                    if is_domain_verified:
                        self.extr.extract_with_best_schema(url, self.pagetype)
                    elif not is_domain_verified:
                        analysis = DomainAnalysis()
                        analysis.process_domain(url)
                        self.extr.extract_with_best_schema(url, self.pagetype)


        elif analysis_result[0] == 'comment':
            # NOTE(review): verify_article is called with one argument here
            # but with three in the 'article' branch above -- confirm which
            # signature is correct.
            isarticle = self.verify.verify_article(analysis_result[1])
            iscomments = self.verify.verify_comments(analysis_result[2], tree)

            if isarticle and iscomments:
                self.extr.extract_article(url)
                self.extr.extract_comment(analysis_result[2], url)
                export.export_urls(url, 'proper', 'comment', analysis_result[1], analysis_result[2])
            else:
                self.pagetype = self.reclassify('comment')
                export.export_urls(url, 'improper', 'comment', analysis_result[1], analysis_result[2])

                if is_domain_verified is not None:
                    if not is_domain_verified:
                        analysis = DomainAnalysis()
                        analysis.process_domain(url)
                    self.extr.extract_with_best_schema(url, self.pagetype)

        elif analysis_result[0] == 'multiple':
            ismultiple = self.verify.verify_multiple_articles(analysis_result[3], url, tree, content)

            if ismultiple:
                self.extr.extract_multiple_article(analysis_result[3], url)
                export.export_urls(url, 'proper', 'multiple', None, analysis_result[3])
            else:
                self.pagetype = self.reclassify('multiple')
                export.export_urls(url, 'improper', 'multiple', None, analysis_result[3])

                if is_domain_verified is not None:
                    if not is_domain_verified:
                        analysis = DomainAnalysis()
                        analysis.process_domain(url)
                    self.extr.extract_with_best_schema(url, self.pagetype)
@author: nonlinear
'''
import ast
import requests
from eu.leads.infext.python.ops import xpathops
from eu.leads.infext.python.__temp.FileIterator import FileIterator

if __name__ == '__main__':
    # iterate over a local file listing one URL per line
    fi = FileIterator("/home/nonlinear/workspace/leadsdm/eu/leads/infext/python/__temp/files/www.amazon.com")
    for line in fi.getLine():
        # skip comment lines
        if line.startswith("#") or line.startswith("//"):
            continue
        # drop the trailing newline
        url = line[:-1]
        content = requests.get(url).text
        print url
        tree = xpathops.content2tree(content)
        print len(content)
        titleNodes = tree.xpath('//title/text()')
        if len(titleNodes)>0:
            titleText = titleNodes[0].strip()
            print titleText
            # xpath of the <title> element's parent
            titlePath = xpathops.element2parentpath(tree, titleNodes[0])
            print titlePath
        else:
            print "NO TITLE"
        print "---"
#     table = [0.54433105, -0.81649658, -0.75842509, -0.75282172, -0.79788596, -0.58532848, 0.54433105]
#     print str(table)
#     string = '[0.54433105 -0.81649658 -0.75842509 -0.75282172 -0.79788596 -0.58532848 0.54433105]'
#     print ast.literal_eval(string)