def nested_items(self, ni):
    """Add nested item(s) to ``self._nested_items``.

    NOTE(review): presumably the setter of a ``nested_items`` property; the
    decorator is not visible in this part of the file.

    Accepted values for ``ni``:

    * ``None`` -- resets ``self._nested_items`` to an empty list.
    * a ``DomItem`` -- appended as is.
    * a dict -- converted to ``WDIAutoGen`` when it has an ``'autogen'``
      key, otherwise to ``DomItem``, then appended.
    * a non-empty list whose elements are all ``DomItem`` or dict --
      dicts are converted as above and the whole batch is appended.

    Any other value (including a list holding other element types) leaves
    the stored items untouched.

    Raises:
        Exception: if an empty list is supplied, or if building an item
            from a dict fails (the original error text is embedded in the
            message).
    """
    from newsline.helpers import helpers

    def _autogen_from(spec):
        # 'nested_items' and 'range' default to None, 'parentless' to False.
        return WDIAutoGen(spec['name'], spec['url'],
                          spec.get('nested_items'), spec['autogen'],
                          spec.get('range'), spec.get('parentless', False))

    # The initialization case
    if ni is None:
        self._nested_items = []
        return

    if not hasattr(self, "_nested_items"):
        self._nested_items = []

    if isinstance(ni, DomItem):
        self._nested_items.append(ni)
        return

    if helpers.is_dict(ni):
        try:
            if 'autogen' in ni:
                built = _autogen_from(ni)
            elif 'nested_items' in ni:
                built = DomItem(ni['name'], ni['url'], ni['selector'],
                                ni['nested_items'])
            else:
                built = DomItem(ni['name'], ni['url'], ni['selector'])
            self._nested_items.append(built)
        except Exception as e:
            raise Exception("DomItem nested element exception : %s" % str(e))
        return

    if helpers.is_list(ni):
        if helpers.is_empty(ni):
            raise Exception("You cannot supply nested_items as empty")
        if all(isinstance(entry, (DomItem, dict)) for entry in ni):
            try:
                # Build the whole batch first so a failure adds nothing.
                batch = []
                for entry in ni:
                    if not isinstance(entry, dict):
                        batch.append(entry)
                    elif 'autogen' in entry:
                        batch.append(_autogen_from(entry))
                    else:
                        batch.append(DomItem(entry['name'], entry['url'],
                                             entry['selector'],
                                             entry.get('nested_items')))
                self._nested_items.extend(batch)
            except Exception as e:
                raise Exception("DomItem nested element exception : %s" % str(e))
def getattr_recursive(self, attr, depth=0):
    """Collect the value of ``attr`` from this item and from its nested items.

    Args:
        attr: name of the attribute to read on every item.
        depth: nesting level used to label this item's value; nested items
            are visited with ``depth + 1``.

    Returns:
        dict: ``{"level_<depth>": value}`` for an item without nested
        items. Otherwise the dict also has a ``"nested_item_<attr>"`` key
        holding the recursive result of the nested items: a list of dicts
        when ``self.nested_items`` is a list, a single dict otherwise.

    Raises:
        Exception: if the item has no attribute ``attr``, or if ``attr``
            is ``'nested_items'``.
    """
    if not hasattr(self, attr):
        # The message used to be passed with a comma instead of ``%``, so
        # the attribute name was never interpolated into it.
        raise Exception("DomItem object has no attribute %s" % str(attr))
    if attr == 'nested_items':
        raise Exception(
            "nested_items key is not allowed to be fetched recursively")

    level_key = "level_%d" % depth
    if not self.has_nested_items:
        return {level_key: getattr(self, attr)}

    collected = {level_key: getattr(self, attr)}
    nested_key = "nested_item_%s" % attr
    if helpers.is_list(self.nested_items):
        collected[nested_key] = [
            ni.getattr_recursive(attr, depth + 1) for ni in self.nested_items
        ]
    else:
        collected[nested_key] = self.nested_items.getattr_recursive(
            attr, depth + 1)
    return collected
def update_level(self, l):
    """Set this node's ``level`` to ``l`` and push ``l + 1`` down to its children.

    ``self.children`` may be falsy (nothing to propagate), a list of nodes,
    or a single ``Tree``; any other value is ignored.
    """
    self.level = l

    descendants = self.children
    if not descendants:
        return

    deeper = l + 1
    if helpers.is_list(descendants):
        for node in descendants:
            node.update_level(deeper)
    elif isinstance(descendants, Tree):
        descendants.update_level(deeper)
def clean(self, domitems):
    """Run ``self.regexr.remove_double_slash`` over url data.

    Per the original note this cleans urls of double or trailing slashes;
    the exact rewriting is whatever ``remove_double_slash`` does.

    Args:
        domitems: a url string, a domitem dict, or a list of domitem dicts.

    Returns:
        For a string, the cleaned string. For a dict, the result of
        ``helpers.map_dictionary`` applied with the ``"url"`` key. For a
        list, a list of such results, one per element. ``None`` for any
        other input type.
    """
    if helpers.is_str(domitems):
        return self.regexr.remove_double_slash(domitems)

    if helpers.is_dict(domitems):
        return helpers.map_dictionary(self.regexr.remove_double_slash,
                                      domitems, "url")

    if helpers.is_list(domitems):
        strip_slashes = self.regexr.remove_double_slash
        return [
            helpers.map_dictionary(func=strip_slashes, dictionary=entry,
                                   key="url")
            for entry in domitems
        ]
def decode(self, domitems):
    """Run ``self.regexr.parse_arabic_urls`` over url data.

    Per the original note this turns utf-8/ISO-8859-1 encoded Arabic
    characters into unicode Arabic characters; the conversion itself is
    delegated to ``parse_arabic_urls``.

    Args:
        domitems: a url string, a domitem dict, or a list of domitem dicts.

    Returns:
        For a string, the decoded string. For a dict, the result of
        ``helpers.map_dictionary`` applied with the ``"url"`` key. For a
        list, a list of such results, one per element. ``None`` for any
        other input type.
    """
    if helpers.is_str(domitems):
        return self.regexr.parse_arabic_urls(domitems)

    if helpers.is_dict(domitems):
        return helpers.map_dictionary(self.regexr.parse_arabic_urls,
                                      domitems, "url")

    if helpers.is_list(domitems):
        to_unicode = self.regexr.parse_arabic_urls
        decoded = []
        for entry in domitems:
            decoded.append(
                helpers.map_dictionary(func=to_unicode, dictionary=entry,
                                       key="url"))
        return decoded
def normalize(self, domitems):
    """Run ``self.remove_rooturl`` over url data.

    Per the original note this removes the root url from domitem urls
    that carry it; the stripping itself is done by ``remove_rooturl``.

    Args:
        domitems: a url string, a domitem dict, or a list of domitem dicts.

    Returns:
        For a string, the normalized string. For a dict, the result of
        ``helpers.map_dictionary`` applied with the ``"url"`` key. For a
        list, a list of such results, one per element. ``None`` for any
        other input type.
    """
    if helpers.is_str(domitems):
        return self.remove_rooturl(domitems)

    if helpers.is_dict(domitems):
        return helpers.map_dictionary(self.remove_rooturl, domitems, "url")

    if helpers.is_list(domitems):
        drop_root = self.remove_rooturl
        return [
            helpers.map_dictionary(func=drop_root, dictionary=entry,
                                   key="url")
            for entry in domitems
        ]
def patternize(self):
    """Attach a ``RegexrClass`` built from ``self.url`` and recurse into nested items.

    Per the original note, the purpose is to extract the regex pattern of
    the url so that all similar links can be matched. The resulting object
    is stored on ``self.regexr``; every nested item (a list of items or a
    single item) is then asked to ``patternize()`` itself too.
    """
    from .regexr import RegexrClass

    self.regexr = RegexrClass(self.url)

    if not self.has_nested_items:
        return

    nested = self.nested_items
    if helpers.is_list(nested):
        for item in nested:
            item.patternize()
        return
    nested.patternize()
def validate(self, domitems):
    """Check that ``domitems`` is a dict or a list of dicts and return it unchanged.

    Raises:
        Exception: if a list contains a non-dict element, or if a
            non-list value is not a dict.
    """
    if helpers.is_list(domitems):
        for di in domitems:
            if not helpers.is_dict(di):
                raise Exception("The domitems list expects all elements to be dictionaries, some aren't")
        return domitems

    if helpers.is_dict(domitems):
        return domitems

    raise Exception("The domitems expects a dictionary element, %s given" % type(domitems))
def rooturl(self, url):
    """Validate ``url`` and store it, stripped of ``"/"`` at both ends, on ``self._rooturl``.

    NOTE(review): presumably the setter of a ``rooturl`` property; the
    decorator is not visible in this part of the file.

    Args:
        url: a url string or a list of url strings. Values of any other
            truthy type pass through without being stored.

    Raises:
        Exception: if ``url`` is falsy, a string fails ``helpers.is_url``,
            a list is empty or holds non-string elements, or the list
            fails ``helpers.is_url(url, root=True)``.
    """
    if not url:
        raise Exception("rooturl cannot be empty or None")

    given_str = helpers.is_str(url)
    given_list = helpers.is_list(url)

    if given_str and not helpers.is_url(url):
        raise Exception("rooturl should respect the form a url e.g: http://google.com\n\t url: %s"% url)

    if given_list:
        if helpers.is_empty(url):
            raise Exception("rooturl list can not be empty")
        if not all(helpers.is_str(u) for u in url):
            raise Exception("rooturl is list, expecting all list elements to be str, however an element (or more) is not")
        if not helpers.is_url(url, root=True):
            raise Exception("rooturl list given, however an element does not respect url pattern. e.g: http://google.com\n\t url: %s"% url)

    if given_str:
        self._rooturl = url.strip("/")
    elif given_list:
        self._rooturl = [u.strip("/") for u in url]
def domselector(self, ds):
    """Validate ``ds`` and store it on ``self._domselector``.

    NOTE(review): presumably the setter of a ``domselector`` property; the
    decorator is not visible in this part of the file.

    Args:
        ds: a selector string, or a non-empty list of selector strings.

    Raises:
        Exception: if ``ds`` is ``None``, an empty list, a list holding
            non-string elements, or neither a string nor a list.
    """
    if ds is None:
        raise Exception("domselector cannot be empty or None")

    if helpers.is_list(ds):
        if helpers.is_empty(ds):
            raise Exception(
                "domselector received an empty list, domselector can not be empty"
            )
        if not all(helpers.is_str(d) for d in ds):
            # The list is known to be non-empty here; the previous message
            # wrongly described it as empty.
            raise Exception(
                "domselector received a list, but not all elements are strings"
            )
    elif not helpers.is_str(ds):
        # Single type check: the former second check after this branch
        # could never be reached, so its more accurate message is used here.
        raise Exception(
            "domselector is expected to be a string or list of strings, %s given"
            % type(ds))

    self._domselector = ds
def url(self, url):
    """Validate ``url`` and store it on ``self._url``.

    NOTE(review): presumably the setter of a ``url`` property; the
    decorator is not visible in this part of the file.

    Only list input is checked element by element; any other truthy
    value is stored as given.

    Raises:
        Exception: if ``url`` is falsy, or if it is a list containing a
            non-string element or an element rejected by
            ``helpers.is_url``.
    """
    if not url:
        raise Exception("url cannot be empty or None")

    if helpers.is_list(url):
        if any(not helpers.is_str(u) for u in url):
            raise Exception(
                "url is list, expecting all list elements to be str, however an element (or more) is not"
            )
        if any(not helpers.is_url(u) for u in url):
            raise Exception(
                "url list given, however an element does not respect url pattern. e.g: http://google.com\n\t provided url: %s"
                % url)

    self._url = url
def domitems(self, domitems):
    """Build domitem object(s) from dict description(s) and store them on ``self._domitems``.

    NOTE(review): presumably the setter of a ``domitems`` property; the
    decorator is not visible in this part of the file.

    A dict with an ``'autogen'`` key becomes a ``WDIAutoGen``; any other
    dict becomes a ``WDomItem``. Optional keys default to ``None``
    (``'nested_items'``, ``'range'``) or ``False`` (``'parentless'``).

    Args:
        domitems: one description dict, or a list of them. A list is
            stored as a list of built objects, a dict as a single object.
            Values of any other type are ignored.

    Raises:
        Exception: if a list contains a non-dict element.
        KeyError: if a description lacks a required key.
    """
    def _build(spec):
        if 'autogen' in spec:
            return WDIAutoGen(spec['name'], spec['url'],
                              spec.get('nested_items'), spec['autogen'],
                              spec.get('range'),
                              spec.get('parentless', False))
        return WDomItem(spec['name'], spec['url'], spec['selector'],
                        spec.get('nested_items'))

    if helpers.is_list(domitems):
        if not all(helpers.is_dict(domitem) for domitem in domitems):
            raise Exception("The domitems list expects all elements to be dictionaries, some aren't")
        # Keep every item: the old loop reassigned self._domitems on each
        # iteration, so only the last description of the list survived.
        self._domitems = [_build(spec) for spec in domitems]
    elif helpers.is_dict(domitems):
        self._domitems = _build(domitems)
def _launch(self, strength=0, force=False):
    """Pipe out every domitem, diverge every crawled item, and return ``self.jsonify()``.

    Args:
        strength: forwarded to ``self._pipeout_domitem`` and, through a
            partial, to ``self._pipeout_crawled_item``.
        force: forwarded the same way as ``strength``.

    Returns:
        Whatever ``self.jsonify()`` returns.
    """
    from functools import partial

    _potoci = partial(self._pipeout_crawled_item, strength=strength, force=force)

    # Treat a single domitem as a one-element list so both phases share
    # one code path.
    domitems = self.domitems
    items = domitems if helpers.is_list(domitems) else [domitems]

    # Phase 1: all domitems are piped out before any crawled item diverges.
    for di in items:
        self._pipeout_domitem(di, "", strength, force)

    # Phase 2: diverge the crawled items of each domitem.
    for di in items:
        crawled = di.crawled_items
        if isinstance(crawled, list):
            for ci in crawled:
                ci.diverge(_potoci)
        else:
            # Previously this went through self.domitems.crawled_items,
            # which fails when self.domitems is a list.
            crawled.diverge(_potoci)

    return self.jsonify()
def _add(self, xchildren):
    """Attach one ``Tree`` or a list of ``Tree`` objects as children of this node.

    Each child has ``update_level(self.level + 1)`` called on it before it
    is stored in ``self._children``.

    Raises:
        Exception: if ``xchildren`` is not a ``Tree``, or is a list that
            contains a non-``Tree`` element.
    """
    if helpers.is_list(xchildren):
        if any(not isinstance(c, Tree) for c in xchildren):
            raise Exception(
                "Children list is expected to have all elements of type Tree, some aren't"
            )
        siblings = self._children
        # Re-level the whole batch first; nothing is stored if one fails.
        releveled = []
        for child in xchildren:
            child.update_level(self.level + 1)
            releveled.append(child)
        siblings.extend(releveled)
        return

    if not isinstance(xchildren, Tree):
        raise Exception(
            "Children are expected to be of type Tree, %s given" %
            type(xchildren))

    siblings = self._children
    xchildren.update_level(self.level + 1)
    siblings.append(xchildren)
def _summary(self):
    """Build a nested dict summarizing the crawled items, keyed by url.

    Each crawled item contributes
    ``{url: {"type": <dom item name>, "nested_items": <nested>}}`` where
    ``<nested>`` is the merged summary of its nested items, or the string
    ``"none"`` when it has none.

    Returns:
        dict: when ``self.domitems`` is a list, a mapping from each
        domitem's ``name`` to the merged summaries of its crawled items;
        otherwise the merged summaries of the single domitem's crawled
        items.
    """
    def grabtrunk(crawleditem):
        print("grabtrunk for %s" % crawleditem.url)
        if crawleditem.nested_items:
            print("has nested items ")
            nested = {}
            for ni in crawleditem.nested_items:
                nested.update(grabtrunk(ni))
        else:
            print("does not have nested items ")
            nested = "none"
        return {
            crawleditem.url: {
                "type": crawleditem.dom_item.name,
                "nested_items": nested
            }
        }

    summary = {}
    if helpers.is_list(self.domitems):
        for di in self.domitems:
            # Merge into a named dict: ``{}.update(...)`` returns None, and
            # feeding it a generator of dicts is not a valid update source.
            per_domitem = {}
            for ci in di.crawled_items:
                per_domitem.update(grabtrunk(ci))
            summary[di.name] = per_domitem
    else:
        for ci in self.domitems.crawled_items:
            summary.update(grabtrunk(ci))
    return summary
def jsonify(self):
    """Build an index-keyed dict describing every crawled item.

    Each crawled item is rendered as
    ``{"item_type": ..., "item_url": ..., "nested_items": ...}`` where
    ``"nested_items"`` is an index-keyed dict of rendered nested items, or
    the string ``"none"`` when there are none.

    Returns:
        dict: when ``self.domitems`` is a list, ``{i: {j: rendered}}``
        with ``i`` the domitem index and ``j`` the crawled item index;
        otherwise ``{i: rendered}`` for the single domitem's crawled
        items.
    """
    def grabtrunk(crawleditem):
        self.log("Grabbing %s for json" % crawleditem.url)
        if not crawleditem.nested_items:
            return {
                "item_type": crawleditem.dom_item.name,
                "item_url": crawleditem.url,
                "nested_items": "none"
            }

        def _graball(nesteditems):
            print("_grablall for %s" % crawleditem.url)
            return {i: grabtrunk(ni) for i, ni in enumerate(nesteditems)}

        return {
            "item_type": crawleditem.dom_item.name,
            "item_url": crawleditem.url,
            "nested_items": _graball(crawleditem.nested_items)
        }

    dictionary = {}
    if helpers.is_list(self.domitems):
        for i, di in enumerate(self.domitems):
            # ``{}.update(<generator of dicts>)`` returned None and raised
            # on these 3-key dicts. Index the crawled items instead, the
            # same layout the single-domitem branch produces.
            dictionary[i] = {
                j: grabtrunk(ci) for j, ci in enumerate(di.crawled_items)
            }
    else:
        for i, ci in enumerate(self.domitems.crawled_items):
            dictionary[i] = grabtrunk(ci)

    self.log("Retuning dictionary.json", color="BOLDYELLOW")
    return dictionary
def realCaseTest(self):
    """Instantiate a three-level ``DomItem`` and print what was built.

    The item is a category with a nested pagination item, which in turn
    nests an articles item. If construction raises, the failure is printed
    and the method returns early. Otherwise the name, url, selector and
    ``has_nested_items`` flag of each level are printed, descending while
    the nested value is a single item rather than a list.
    """
    def _report(item, label):
        # Same four lines for every level; only the label prefix changes.
        self.print_with_color("DARKCYAN", "%s name: %s" % (label, item.name))
        self.print_with_color("DARKCYAN", "%s url: %s" % (label, item.url))
        self.print_with_color("DARKCYAN",
                              "%s selector: %s" % (label, item.domselector))
        self.print_with_color(
            "DARKCYAN",
            "%s has_nested_items: %s" % (label, item.has_nested_items))

    try:
        domitem = DomItem(
            'category_item', '/category/politics', 'nav > ul > li > a', {
                "name": 'pagination',
                "url": '/category/politics/page1',
                "selector": 'div.pagination > ul > li > a',
                "nested_items": {
                    "name": 'articles',
                    "url": '/article/123123.html',
                    "selector": 'h2 > a'
                }
            })
    except Exception as e:
        self.print_failure("Test failed with :%s" % str(e))
        self.print_seperator()
        return

    self.print_success("Dom Item instantiation successful")
    _report(domitem, "DomItem")

    if domitem.has_nested_items:
        self.print_success("\tDom Item has nested items")
        from newsline.helpers import helpers
        if helpers.is_list(domitem.nested_items):
            self.print_with_color("DARKCYAN", "\tNested DomItems are many")
        else:
            nitem = domitem.nested_items
            _report(nitem, "\tNested DomItem")
            if nitem.has_nested_items:
                self.print_success("\t\tNested Dom Item has nested items")
                # "Many" means a list, as in the first-level check above;
                # this used to test is_dict, so a list of nested items
                # fell through to the single-item branch.
                if helpers.is_list(nitem.nested_items):
                    self.print_with_color(
                        "DARKCYAN", "\tNested DomItems nested items are many")
                else:
                    _report(nitem.nested_items, "\t\tNested DomItem")

    self.print_success("Test passed successfully")
    self.print_seperator()
def patternize(self):
    """Call ``patternize()`` on every domitem held by this object.

    ``self.domitems`` may be a list of items or a single ``DomItem``;
    any other value is ignored.
    """
    targets = self.domitems
    if helpers.is_list(targets):
        for item in targets:
            item.patternize()
        return
    if isinstance(targets, DomItem):
        targets.patternize()