Пример #1
0
	def nested_items(self, ni):
		"""Store nested item(s) in ``self._nested_items``.

		NOTE(review): this reads like a property setter; the decorator is not
		visible here -- confirm.

		Accepted values for ``ni``:
		  * ``None`` -- resets ``self._nested_items`` to an empty list.
		  * a ``DomItem`` instance -- appended as is.
		  * a dict -- converted to a ``WDIAutoGen`` when it has an 'autogen'
		    key, otherwise to a ``DomItem``, then appended.
		  * a non-empty list of ``DomItem`` instances and/or dicts -- every
		    dict is converted as above and the whole batch is appended.

		A list containing any other element type, or a value of any other
		type, is ignored without error.

		Raises:
			Exception: if the list is empty, or if building an item from a
				dict fails (the original error text is embedded in the message).
		"""
		from newsline.helpers import helpers

		# The initialization case: start from an empty container and stop.
		if ni is None:
			self._nested_items = []
			return

		if not hasattr(self, "_nested_items"):
			self._nested_items = []

		def _autogen_from(spec):
			# Optional keys fall back to None / False when absent.
			return WDIAutoGen(
				spec['name'],
				spec['url'],
				spec.get('nested_items'),
				spec['autogen'],
				spec.get('range'),
				spec.get('parentless', False),
			)

		if isinstance(ni, DomItem):
			self._nested_items.append(ni)
			return

		if helpers.is_dict(ni):
			try:
				if 'autogen' in ni:
					built = _autogen_from(ni)
				elif 'nested_items' in ni:
					built = DomItem(ni['name'], ni['url'], ni['selector'], ni['nested_items'])
				else:
					built = DomItem(ni['name'], ni['url'], ni['selector'])
				self._nested_items.append(built)
			except Exception as e:
				raise Exception("DomItem nested element exception : %s" % str(e))
			return

		if helpers.is_list(ni):
			if helpers.is_empty(ni):
				raise Exception("You cannot supply nested_items as empty")
			if all(isinstance(entry, (DomItem, dict)) for entry in ni):
				try:
					# Build the whole batch first so a failure appends nothing.
					batch = []
					for entry in ni:
						if not isinstance(entry, dict):
							batch.append(entry)
						elif 'autogen' in entry:
							batch.append(_autogen_from(entry))
						else:
							batch.append(DomItem(entry['name'], entry['url'], entry['selector'], entry.get('nested_items')))
					self._nested_items.extend(batch)
				except Exception as e:
					raise Exception("DomItem nested element exception : %s" % str(e))
Пример #2
0
 def getattr_recursive(self, attr, depth=0):
     """Collect the value of ``attr`` from this item and its nested items.

     Args:
         attr: name of the attribute to read; 'nested_items' is rejected.
         depth: nesting level used to label this item's value (default 0).

     Returns:
         A dict with the key "level_<depth>" holding this item's value.
         When ``self.has_nested_items`` is truthy it also has the key
         "nested_item_<attr>", holding either a list of the nested items'
         results (when ``nested_items`` is a list) or a single nested
         result, each computed with ``depth + 1``.

     Raises:
         Exception: if the item has no such attribute, or if ``attr`` is
             'nested_items'.
     """
     if not hasattr(self, attr):
         # Interpolate the name into the message; passing it as a second
         # Exception argument would render the message as a tuple.
         raise Exception("DomItem object has no attribute %s" % str(attr))
     if attr == 'nested_items':
         raise Exception(
             "nested_items key is not allowed to be fetched recursively")

     collected = {"level_%d" % depth: getattr(self, attr)}
     if self.has_nested_items:
         nested = self.nested_items
         nested_key = "nested_item_%s" % attr
         if helpers.is_list(nested):
             collected[nested_key] = [
                 ni.getattr_recursive(attr, depth + 1) for ni in nested
             ]
         else:
             collected[nested_key] = nested.getattr_recursive(attr, depth + 1)
     return collected
Пример #3
0
 def update_level(self, l):
     """Set this node's level to ``l`` and its descendants' to ``l + 1``.

     ``self.children`` may be a list of nodes or a single ``Tree``; every
     child has its own ``update_level`` called with ``l + 1``. Falsy
     children, or a value that is neither a list nor a ``Tree``, leave only
     this node updated.
     """
     self.level = l

     offspring = self.children
     if not offspring:
         return

     child_level = l + 1
     if helpers.is_list(offspring):
         for node in offspring:
             node.update_level(child_level)
     elif isinstance(offspring, Tree):
         offspring.update_level(child_level)
Пример #4
0
	def clean(self, domitems):
		"""Clean urls of double slashes or trailing slashes.

		``domitems`` may be:
		  * a string -- passed directly to ``self.regexr.remove_double_slash``;
		  * a dict -- handed to ``helpers.map_dictionary`` with that function
		    and the key "url" (presumably it rewrites the value under that
		    key -- confirm against ``helpers``);
		  * a list -- each element is handled like the dict case and the
		    results are returned as a list.

		Any other type yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.regexr.remove_double_slash(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.regexr.remove_double_slash, domitems, "url")

		if helpers.is_list(domitems):
			scrub = self.regexr.remove_double_slash
			return [
				helpers.map_dictionary(func=scrub, dictionary=entry, key="url")
				for entry in domitems
			]
Пример #5
0
	def decode(self, domitems):
		"""Turn utf-8/ISO-8859-1 arabic characters in urls into unicode ones.

		``domitems`` may be:
		  * a string -- passed directly to ``self.regexr.parse_arabic_urls``;
		  * a dict -- handed to ``helpers.map_dictionary`` with that function
		    and the key "url" (presumably it rewrites the value under that
		    key -- confirm against ``helpers``);
		  * a list -- each element is handled like the dict case and the
		    results are returned as a list.

		Any other type yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.regexr.parse_arabic_urls(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.regexr.parse_arabic_urls, domitems, "url")

		if helpers.is_list(domitems):
			decoder = self.regexr.parse_arabic_urls
			return [
				helpers.map_dictionary(func=decoder, dictionary=entry, key="url")
				for entry in domitems
			]
Пример #6
0
	def normalize(self, domitems):
		"""Remove the root url from the domitem urls if they have it.

		``domitems`` may be:
		  * a string -- passed directly to ``self.remove_rooturl``;
		  * a dict -- handed to ``helpers.map_dictionary`` with that method
		    and the key "url" (presumably it rewrites the value under that
		    key -- confirm against ``helpers``);
		  * a list -- each element is handled like the dict case and the
		    results are returned as a list.

		Any other type yields ``None``.
		"""
		if helpers.is_str(domitems):
			return self.remove_rooturl(domitems)

		if helpers.is_dict(domitems):
			return helpers.map_dictionary(self.remove_rooturl, domitems, "url")

		if helpers.is_list(domitems):
			strip_root = self.remove_rooturl
			return [
				helpers.map_dictionary(func=strip_root, dictionary=entry, key="url")
				for entry in domitems
			]
Пример #7
0
    def patternize(self):
        """Build the url regex helper for this item and its nested items.

        Stores ``RegexrClass(self.url)`` in ``self.regexr`` (per the original
        author, to extract the url's regex pattern and match similar links),
        then calls ``patternize()`` on every nested item -- either each
        element of a list or the single nested object.
        """
        from .regexr import RegexrClass

        self.regexr = RegexrClass(self.url)
        if not self.has_nested_items:
            return

        if not helpers.is_list(self.nested_items):
            self.nested_items.patternize()
            return

        for nested in self.nested_items:
            nested.patternize()
Пример #8
0
	def validate(self, domitems):
		"""Check that ``domitems`` is a dict or a list of dicts.

		Returns:
			``domitems`` unchanged when it passes the check.

		Raises:
			Exception: if a list contains a non-dict element, or if a
				non-list value is not a dict.
		"""
		if helpers.is_list(domitems):
			for di in domitems:
				if not helpers.is_dict(di):
					raise Exception("The domitems list expects all elements to be dictionaries, some aren't")
			return domitems

		if helpers.is_dict(domitems):
			return domitems

		raise Exception("The domitems expects a dictionary element, %s given" % type(domitems))
Пример #9
0
	def rooturl(self, url):
		"""Validate ``url`` and store it in ``self._rooturl`` without edge slashes.

		NOTE(review): this reads like a property setter; the decorator is not
		visible here -- confirm.

		A string must pass ``helpers.is_url``. A list must be non-empty, hold
		only strings and pass ``helpers.is_url(url, root=True)``. The stored
		value has leading and trailing "/" stripped from the string, or from
		every element of the list. A truthy value that is neither a string
		nor a list is ignored.

		Raises:
			Exception: if ``url`` is falsy or fails one of the checks above.
		"""
		if not url:
			raise Exception("rooturl cannot be empty or None")

		if helpers.is_str(url):
			if not helpers.is_url(url):
				raise Exception("rooturl should respect the form a url e.g: http://google.com\n\t url: %s"% url)
			self._rooturl = url.strip("/")
		elif helpers.is_list(url):
			if helpers.is_empty(url):
				raise Exception("rooturl list can not be empty")
			if not all(helpers.is_str(u) for u in url):
				raise Exception("rooturl is list, expecting all list elements to be str, however an element (or more) is not")
			if not helpers.is_url(url, root=True):
				raise Exception("rooturl list given, however an element does not respect url pattern. e.g: http://google.com\n\t url: %s"% url)
			self._rooturl = [entry.strip("/") for entry in url]
Пример #10
0
    def domselector(self, ds):
        """Validate ``ds`` and store it in ``self._domselector``.

        NOTE(review): this reads like a property setter; the decorator is not
        visible here -- confirm.

        Args:
            ds: a string, or a non-empty list whose elements are all strings.

        Raises:
            Exception: if ``ds`` is None, an empty list, a list with a
                non-string element, or neither a string nor a list.
        """
        if ds is None:
            raise Exception("domselector cannot be empty or None")

        if helpers.is_list(ds):
            if helpers.is_empty(ds):
                raise Exception(
                    "domselector received an empty list, domselector can not be empty"
                )
            if not all(helpers.is_str(d) for d in ds):
                # The list is not empty here; the message used to claim it was.
                raise Exception(
                    "domselector received a list, but not all elements are strings"
                )
        elif not helpers.is_str(ds):
            # Single rejection point for unsupported types. The old trailing
            # str/list check could never be reached after this branch.
            raise Exception(
                "domselector is expected to be a string or list of strings, %s given"
                % type(ds))

        self._domselector = ds
Пример #11
0
    def url(self, url):
        """Validate ``url`` and store it in ``self._url``.

        NOTE(review): this reads like a property setter; the decorator is not
        visible here -- confirm.

        A list must hold only strings that each pass ``helpers.is_url``. Any
        other truthy value is stored without further checks.

        Raises:
            Exception: if ``url`` is falsy, or a list fails the checks above.
        """
        if not url:
            raise Exception("url cannot be empty or None")

        if helpers.is_list(url):
            for entry in url:
                if not helpers.is_str(entry):
                    raise Exception(
                        "url is list, expecting all list elements to be str, however an element (or more) is not"
                    )
            for entry in url:
                if not helpers.is_url(entry):
                    raise Exception(
                        "url list given, however an element does not respect url pattern. e.g: http://google.com\n\t provided url: %s"
                        % url)

        self._url = url
Пример #12
0
	def domitems(self, domitems):
		"""Build item objects from dict spec(s) and store them in ``self._domitems``.

		NOTE(review): this reads like a property setter; the decorator is not
		visible here -- confirm.

		Args:
			domitems: one spec dict, or a list of spec dicts. A spec with an
				'autogen' key becomes a ``WDIAutoGen``; any other spec becomes
				a ``WDomItem``. The optional keys 'nested_items' and 'range'
				default to None, and 'parentless' defaults to False.

		A dict stores a single object. A list stores a list with one object
		per spec, in order (an empty list stores an empty list). Previously
		each spec overwrote the last, so only the final one was kept. Values
		that are neither a list nor a dict are ignored.

		Raises:
			Exception: if a list contains a non-dict element.
		"""
		def _build(spec):
			# Turn a single spec dict into the matching item object.
			if 'autogen' in spec:
				return WDIAutoGen(
					spec['name'],
					spec['url'],
					spec.get('nested_items'),
					spec['autogen'],
					spec.get('range'),
					spec.get('parentless', False),
				)
			return WDomItem(spec['name'], spec['url'], spec['selector'], spec.get('nested_items'))

		if helpers.is_list(domitems):
			if not all(helpers.is_dict(domitem) for domitem in domitems):
				raise Exception("The domitems list expects all elements to be dictionaries, some aren't")
			# Keep every item; consumers iterate the stored value when it is a list.
			self._domitems = [_build(spec) for spec in domitems]
		elif helpers.is_dict(domitems):
			self._domitems = _build(domitems)
Пример #13
0
	def _launch(self, strength=0, force=False):
		"""Run the crawl over every dom item and return the JSON-style result.

		First ``self._pipeout_domitem`` is called for every dom item. Then
		``diverge`` is called on each crawled item with a callback bound to
		``self._pipeout_crawled_item``. ``self.domitems`` may be a single
		item or a list, and an item's ``crawled_items`` may be a single
		object or a list.

		Args:
			strength: forwarded to both pipeout helpers (default 0).
			force: forwarded to both pipeout helpers (default False).

		Returns:
			Whatever ``self.jsonify()`` returns.
		"""
		from functools import partial
		_potoci = partial(self._pipeout_crawled_item, strength=strength, force=force)

		# Treat a single dom item as a one-element list so both shapes share a path.
		domitems = self.domitems if helpers.is_list(self.domitems) else [self.domitems]

		for di in domitems:
			self._pipeout_domitem(di, "", strength, force)

		for di in domitems:
			crawled = di.crawled_items
			if not isinstance(crawled, list):
				# Use the current item's crawled_items. The old code read
				# self.domitems.crawled_items here, but self.domitems is a list.
				crawled.diverge(_potoci)
			else:
				for ci in crawled:
					ci.diverge(_potoci)

		return self.jsonify()
Пример #14
0
    def _add(self, xchildren):
        """Attach one ``Tree`` or a list of ``Tree`` nodes as children.

        Each accepted child has ``update_level(self.level + 1)`` called on it
        before it is added to ``self._children``. For a list, all elements
        are type-checked before any of them is touched.

        Raises:
            Exception: if ``xchildren`` (or any element of the list) is not a
                ``Tree``.
        """
        if helpers.is_list(xchildren):
            for candidate in xchildren:
                if not isinstance(candidate, Tree):
                    raise Exception(
                        "Children list is expected to have all elements of type Tree, some aren't"
                    )
            siblings = self._children
            adopted = []
            for child in xchildren:
                child.update_level(self.level + 1)
                adopted.append(child)
            # Extend only after every child was re-levelled successfully.
            siblings.extend(adopted)
            return

        if not isinstance(xchildren, Tree):
            raise Exception(
                "Children are expected to be of type Tree, %s given" %
                type(xchildren))

        siblings = self._children
        xchildren.update_level(self.level + 1)
        siblings.append(xchildren)
Пример #15
0
	def _summary(self):
		"""Return a nested dict summarising the crawled items by url.

		Each crawled item contributes an entry
		``{url: {"type": <dom_item.name>, "nested_items": ...}}``, where
		"nested_items" is the merged entries of its nested items, or the
		string "none" when it has none.

		When ``self.domitems`` is a list, the result maps each dom item's
		name to the merged entries of its crawled items. Otherwise the result
		is the merged entries of the single dom item's crawled items.
		Progress is printed to stdout while walking.
		"""
		def grabtrunk(crawleditem):
			# Build the summary entry for one crawled item, recursing into its children.
			print("grabtrunk for %s" % crawleditem.url)
			if crawleditem.nested_items:
				print("has nested items ")
				nested = {}
				for ni in crawleditem.nested_items:
					nested.update(grabtrunk(ni))
			else:
				print("does not have nested items ")
				nested = "none"

			return {
				crawleditem.url: {
					"type": crawleditem.dom_item.name,
					"nested_items": nested
				}
			}

		summary = {}
		if helpers.is_list(self.domitems):
			for di in self.domitems:
				# Merge the entries explicitly: dict.update() returns None and
				# cannot consume a generator of single-key dicts.
				trunk = {}
				for ci in di.crawled_items:
					trunk.update(grabtrunk(ci))
				summary[di.name] = trunk
		else:
			for ci in self.domitems.crawled_items:
				summary.update(grabtrunk(ci))

		return summary
Пример #16
0
	def jsonify(self):
		"""Return the crawled items as a nested, index-keyed dict.

		Each crawled item becomes
		``{"item_type": <dom_item.name>, "item_url": <url>, "nested_items": ...}``,
		where "nested_items" maps positions (0, 1, ...) to the nested items'
		dicts, or is the string "none" when there are none.

		For a single dom item the result maps each crawled item's position to
		its dict. For a list of dom items the result maps each dom item's
		position to such a position-keyed dict of its crawled items.
		Progress is reported through ``self.log`` and one stdout print.
		"""
		def grabtrunk(crawleditem):
			# Build the dict for one crawled item, recursing into its children.
			self.log("Grabbing %s for json" % crawleditem.url)
			nested = "none"
			if crawleditem.nested_items:
				print("_grablall for %s" % crawleditem.url)
				nested = {}
				for i, ni in enumerate(crawleditem.nested_items):
					nested[i] = grabtrunk(ni)

			return {
				"item_type"    : crawleditem.dom_item.name,
				"item_url"     : crawleditem.url,
				"nested_items" : nested
			}

		dictionary = {}
		if helpers.is_list(self.domitems):
			for i, di in enumerate(self.domitems):
				# Mirror the single-item shape per dom item. The old
				# {}.update(generator) returned None or raised ValueError.
				trunk = {}
				for j, ci in enumerate(di.crawled_items):
					trunk[j] = grabtrunk(ci)
				dictionary[i] = trunk
		else:
			for i, ci in enumerate(self.domitems.crawled_items):
				dictionary[i] = grabtrunk(ci)

		self.log("Retuning dictionary.json", color="BOLDYELLOW")
		return dictionary
Пример #17
0
    def realCaseTest(self):
        """Build a three-level ``DomItem`` and print what it exposes.

        If construction raises, the failure is printed, followed by a
        separator, and the method returns. Otherwise the item's name, url,
        selector and ``has_nested_items`` are printed, then the same for the
        nested item and its own nested item when those are single objects.
        The closing success message and separator are printed only when the
        first nested level is a single object rather than a list.
        """
        nested_spec = {
            "name": 'pagination',
            "url": '/category/politics/page1',
            "selector": 'div.pagination > ul > li > a',
            "nested_items": {
                "name": 'articles',
                "url": '/article/123123.html',
                "selector": 'h2 > a'
            }
        }

        try:
            domitem = DomItem('category_item', '/category/politics',
                              'nav > ul > li > a', nested_spec)
        except Exception as e:
            self.print_failure("Test failed with :%s" % str(e))
            self.print_seperator()
            return

        def cyan(text):
            # All informational lines share the same colour.
            self.print_with_color("DARKCYAN", text)

        self.print_success("Dom Item instantiation successful")

        cyan("DomItem name: %s" % domitem.name)
        cyan("DomItem url: %s" % domitem.url)
        cyan("DomItem selector: %s" % domitem.domselector)
        cyan("DomItem has_nested_items: %s" % domitem.has_nested_items)

        if not domitem.has_nested_items:
            return

        self.print_success("\tDom Item has nested items")
        from newsline.helpers import helpers
        if helpers.is_list(domitem.nested_items):
            cyan("\tNested DomItems are many")
            return

        nitem = domitem.nested_items
        cyan("\tNested DomItem name: %s" % nitem.name)
        cyan("\tNested DomItem url: %s" % nitem.url)
        cyan("\tNested DomItem selector: %s" % nitem.domselector)
        cyan("\tNested DomItem has_nested_items: %s" % nitem.has_nested_items)

        if nitem.has_nested_items:
            self.print_success("\t\tNested Dom Item has nested items")
            if helpers.is_dict(nitem.nested_items):
                cyan("\tNested DomItems nested items are many")
            else:
                nnitem = nitem.nested_items
                cyan("\t\tNested DomItem name: %s" % nnitem.name)
                cyan("\t\tNested DomItem url: %s" % nnitem.url)
                cyan("\t\tNested DomItem selector: %s" % nnitem.domselector)
                cyan("\t\tNested DomItem has_nested_items: %s" %
                     nnitem.has_nested_items)

        self.print_success("Test passed successfully")
        self.print_seperator()
Пример #18
0
	def patternize(self):
		"""Call ``patternize()`` on every dom item held by this object.

		``self.domitems`` may be a list of items or a single ``DomItem``;
		any other value is ignored.
		"""
		if not helpers.is_list(self.domitems):
			if isinstance(self.domitems, DomItem):
				self.domitems.patternize()
			return

		for domitem in self.domitems:
			domitem.patternize()