Example #1
# These examples assume pipe2py's module-level imports, roughly:
import json
import urllib2
from xml.etree import ElementTree
from pipe2py import util

def pipe_fetchdata(context, _INPUT, conf, **kwargs):
    """This source fetches and parses any XML or JSON file (todo iCal or KML) to yield a list of elements.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator (items feed the URL/path value lookups)
    conf:
        URL -- url
        path -- path to list
    
    Yields (_OUTPUT):
    elements
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url
            path = util.get_value(conf['path'], item, **kwargs)
            match = None

            #Parse the file into a dictionary
            try:
                f = urllib2.urlopen(url)
                ft = ElementTree.parse(f)
                if context.verbose:
                    print "pipe_fetchdata loading xml:", url
                root = ft.getroot()
                #Move to the point referenced by the path
                #todo lxml would simplify and speed up this
                if path:
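                    # ElementTree renders tags under a default namespace as
                    # '{uri}local', so the root's namespace URI is carried
                    # along while descending the dotted path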
                    if root.tag[0] == '{':
                        namespace = root.tag[1:].split("}")[0]
                        for i in path.split(".")[:-1]:
                            root = root.find("{%s}%s" % (namespace, i))
                            if root is None:
                                return
                        match = "{%s}%s" % (namespace, path.split(".")[-1])
                    else:
                        # descend the intermediate segments here too, mirroring
                        # the namespaced branch above
                        for i in path.split(".")[:-1]:
                            root = root.find(i)
                            if root is None:
                                return
                        match = path.split(".")[-1]
                #Convert xml into generation of dicts
                if match:
                    for element in root.findall(match):
                        i = util.etree_to_pipes(element)
                        yield i
                else:
                    i = util.etree_to_pipes(root)
                    yield i

            except Exception, e:
                try:
                    f = urllib2.urlopen(url)
                    d = json.load(f)
                    #todo test:-
                    if context.verbose:
                        print "pipe_fetchdata loading json:", url
                    if path:
                        for i in path.split(".")[:-1]:
                            d = d.get(i)
                        match = path.split(".")[-1]
                    if match:
                        for itemd in d:
                            if not match or itemd == match:
                                if isinstance(d[itemd], list):
                                    for nested_item in d[itemd]:
                                        yield nested_item
                                else:
                                    yield d[itemd]  # yield the value itself, not wrapped in a list
                    else:
                        yield d
                except Exception, e:
                    #todo try iCal and yield
                    #todo try KML and yield
                    if context.verbose:
                        print "xml and json both failed:", e

                    raise
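
A minimal driver for the generator above (a sketch, not taken from the original
source: it assumes conf values follow the usual pipe2py shape of
{'type': ..., 'value': ...} dicts, that a context object only needs a verbose
flag, and that _INPUT can be any iterable yielding one truthy item):

class Context(object):
    verbose = False

conf = {'URL':  {'type': 'url',  'value': 'http://example.com/feed.xml'},
        'path': {'type': 'text', 'value': 'channel.item'}}

for entry in pipe_fetchdata(Context(), [True], conf):
    print entry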
Example #2
def pipe_fetchdata(context, _INPUT, conf, **kwargs):
    """This source fetches and parses any XML or JSON file (todo iCal or KML) to yield a list of elements.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url
        path -- path to list
    
    Yields (_OUTPUT):
    elements
    """
    url = util.get_value(conf['URL'], None, **kwargs) #todo use subkey?
    if '://' not in url:
        url = 'http://' + url
    path = util.get_value(conf['path'], None, **kwargs) #todo use subkey?
    match = None
    
    #Parse the file into a dictionary
    try:
        f = urllib2.urlopen(url)
        ft = ElementTree.parse(f)
        if context.verbose:
            print "pipe_fetchdata loading xml:", url
        root = ft.getroot()
        #Move to the point referenced by the path
        #todo lxml would simplify and speed up this
        if path:
            if root.tag[0] == '{':
                namespace = root.tag[1:].split("}")[0]
                for i in path.split(".")[:-1]:
                    root = root.find("{%s}%s" % (namespace, i))
                    if root is None:
                        return
                match = "{%s}%s" % (namespace, path.split(".")[-1])
            else:
                # descend the intermediate segments here too, mirroring the
                # namespaced branch above
                for i in path.split(".")[:-1]:
                    root = root.find(i)
                    if root is None:
                        return
                match = path.split(".")[-1]
        #Convert xml into generation of dicts
        if match:
            for element in root.findall(match):
                i = util.etree_to_pipes(element)           
                yield i
        else:
            i = util.etree_to_pipes(root)
            yield i
            
    except Exception, e:
        try:
            f = urllib2.urlopen(url)
            d = json.load(f)
            #todo test:-
            if context.verbose:
                print "pipe_fetchdata loading json:", url
            if path:
                for i in path.split(".")[:-1]:
                    d = d.get(i)
                match = path.split(".")[-1]
            if match:
                for item in d:
                    if not match or item == match:
                        if isinstance(d[item], list):
                            for nested_item in d[item]:
                                yield nested_item
                        else:
                            yield d[item]
            else:
                yield d
        except Exception, e:
            #todo try iCal and yield
            #todo try KML and yield
            if context.verbose:
                print "xml and json both failed:", e

            raise
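
Unlike Example #1, this revision reads URL and path once instead of per input
item. Its JSON branch descends the dotted path one segment at a time; a
standalone sketch of that traversal over hypothetical data:

d = {"feed": {"entries": [{"title": "a"}, {"title": "b"}]}}
path = "feed.entries"

for seg in path.split(".")[:-1]:    # walk down to the enclosing dict
    d = d.get(seg)
match = path.split(".")[-1]         # the last segment names the list

for key in d:
    if key == match and isinstance(d[key], list):
        for nested in d[key]:
            print nested            # {'title': 'a'}, then {'title': 'b'}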
Example #3
def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "XPathFetchPage: Preparing to download:", url

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                request = urllib2.build_opener().open(request)
                content = unicode(
                    request.read(),
                    request.headers['content-type'].split('charset=')[-1])

                # TODO it seems that Yahoo! converts relative links to absolute;
                # TODO doing the same on the content here looks like a
                # TODO non-trivial task in Python

                xpath = util.get_value(conf["xpath"], _INPUT, **kwargs)
                html5 = False
                useAsString = False
                if "html5" in conf:
                    html5 = util.get_value(conf["html5"], _INPUT,
                                           **kwargs) == "true"
                if "useAsString" in conf:
                    useAsString = util.get_value(conf["useAsString"], _INPUT,
                                                 **kwargs) == "true"

                if html5:
                    #from lxml.html import html5parser
                    #root = html5parser.fromstring(content)
                    from html5lib import parse
                    root = parse(content,
                                 treebuilder='lxml',
                                 namespaceHTMLElements=False)
                else:
                    from lxml import etree
                    root = etree.HTML(content)
                res_items = root.xpath(xpath)

                if context.verbose:
                    print "XPathFetchPage: items found:", len(res_items)

                for res_item in res_items:
                    i = util.etree_to_pipes(
                        res_item)  #TODO xml_to_dict(res_item)
                    if context.verbose:
                        print "--------------item data --------------------"
                        print i
                        print "--------------EOF item data ----------------"
                    if useAsString:
                        yield {"content": unicode(i)}
                    else:
                        yield i

            except Exception, e:
                if context.verbose:
                    print "XPathFetchPage: failed to retrieve from:", url

                    print "----------------- XPathFetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- XPathFetchPage -----------------"
                raise

        if item == True:  # a True sentinel means _INPUT is not a loop feed, so yield the results only once
            break
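
A sketch of driving this module directly (hypothetical, as above: it assumes
the {'type': ..., 'value': ...} conf shape and a bare context carrying a
verbose flag; the URL and xpath values are placeholders):

class Context(object):
    verbose = True

conf = {'URL':         {'type': 'url',  'value': 'http://example.com/'},
        'xpath':       {'type': 'text', 'value': '//h1'},
        'html5':       {'type': 'text', 'value': 'false'},
        'useAsString': {'type': 'text', 'value': 'true'}}

for item in pipe_xpathfetchpage(Context(), [True], conf):
    print item['content']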
Example #4
def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "XPathFetchPage: Preparing to download:", url
                
            try:
                content = util.fetch_url(url)
        
                xpath = util.get_value(conf["xpath"], _INPUT, **kwargs)
                html5 = False
                useAsString = False
                if "html5" in conf:
                    html5 = util.get_value(conf["html5"], _INPUT, **kwargs) == "true"
                if "useAsString" in conf:
                    useAsString = util.get_value(conf["useAsString"], _INPUT, **kwargs) == "true"

                if html5:
                    #from lxml.html import html5parser
                    #root = html5parser.fromstring(content)
                    from html5lib import parse
                    root = parse(content, treebuilder='lxml', namespaceHTMLElements=False)
                else:
                    from lxml import etree
                    root = etree.HTML(content)
                res_items = root.xpath(xpath)
                
                if context.verbose:
                    print "XPathFetchPage: items found:", len(res_items)
                    if len(res_items) == 0:
                        print "Content: %s" % content
                for res_item in res_items:
                    i = util.etree_to_pipes(res_item) #TODO xml_to_dict(res_item)                    
                    if context.verbose:
                        print "--------------item data --------------------"
                        print i
                        print "--------------EOF item data ----------------"
                    if useAsString:
                        yield {"content": unicode(i)}
                    else:
                        yield i
        
            except Exception, e:
                if context.verbose:
                    print "XPathFetchPage: failed to retrieve from:", url
        
                    print "----------------- XPathFetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- XPathFetchPage -----------------"
                raise

        if item == True:  # a True sentinel means _INPUT is not a loop feed, so yield the results only once
            break
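
This revision swaps the inline urllib2 calls for util.fetch_url. Its real
implementation is not shown on this page; a hypothetical sketch that mirrors
the inline fetch from Example #3 would be:

def fetch_url(url):
    # set the same User-Agent the inline version used, then decode the body
    # using the charset declared in the Content-Type header
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Yahoo Pipes 1.0')
    response = urllib2.build_opener().open(request)
    charset = response.headers['content-type'].split('charset=')[-1]
    return unicode(response.read(), charset)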