def get_element(url):
    try:
        tree = objectify.parse(urlopen(url))
        root = tree.getroot()
    # fall back to JSON decoding if the response isn't valid XML
    except XMLSyntaxError:
        element = loads(urlopen(url).read())
    else:
        # print etree.tostring(element, pretty_print=True)
        element = utils.etree_to_dict(root)

    return element
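
# Usage sketch (not part of the original module): any endpoint returning XML
# works; if the body isn't valid XML, the JSON fallback above kicks in. The
# URL below is illustrative.
#
#     element = get_element('http://news.yahoo.com/rss/topstories')
#     print element
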
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that issues YQL queries. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : yqlquery -- YQL query
        # todo: handle envURL

    Yields
    ------
    _OUTPUT : query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = utils.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts
        for element in results.getchildren():
            yield utils.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
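
# Example usage (a sketch, not part of the original source): run a single YQL
# query through the pipe. ``Context`` and ``pipe_forever`` are the standard
# pipe2py helpers; the ``{'value': ...}`` conf wrapping is an assumption based
# on how utils.get_value resolves DotDict confs.
#
#     from pipe2py import Context
#     from pipe2py.modules.pipeforever import pipe_forever
#
#     conf = {'yqlquery': {
#         'value': "select title, link from rss "
#                  "where url='http://news.yahoo.com/rss/topstories'"}}
#
#     for result in pipe_yql(Context(verbose=True), pipe_forever(), conf=conf):
#         print result
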
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
        URL -- url object containing the URL to download
        xpath -- xpath to extract
        html5 -- use html5 parser?
        useAsString -- emit items as string?

    TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to absolute.
            # This needs to be done on the content, but it seems to be a
            # non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # note: f was already consumed by the read() above, so parse the
            # downloaded content rather than the exhausted file object
            parser = html5parser if html5 else html
            root = parser.fromstring(content)
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found %i items' % len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
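
# Example usage (a sketch, not part of the original source): extract every
# link element from a page as dicts, or as strings when useAsString is
# 'true'. The conf shape mirrors the docstring; the ``{'value': ...}``
# wrapping and the use of ``Context``/``pipe_forever`` are assumptions
# carried over from the pipe_yql example above.
#
#     conf = {
#         'URL': {'value': 'http://example.com/'},
#         'xpath': {'value': '//a'},
#         'html5': {'value': 'false'},
#         'useAsString': {'value': 'false'},
#     }
#
#     for link in pipe_xpathfetchpage(Context(), pipe_forever(), conf=conf):
#         print link
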