Пример #1
0
	def make_sentence(text):
		"""
		Convert *text* to sentence case.

		The text is treated as a sequence of sentences terminated by ``'.'``
		or ``'\\n'``. In each sentence the first alphabetic character is
		upper-cased and the characters that follow it, up to the terminator,
		are lower-cased. Characters that precede the first letter of a
		sentence are left untouched.

		:param text: the string to convert
		:return: a new string in sentence case
		"""
		check_if_any_type(text, [str, str])

		# Strings are immutable, so edit a list of characters and join it back.
		chars = list(text)
		total = len(chars)
		pos = 0
		while True:
			# Skip everything that precedes the first letter of the sentence.
			while pos < total and not chars[pos].isalpha():
				pos += 1
			if pos >= total:
				break

			chars[pos] = chars[pos].upper()
			# Step past the capital so the loop below does not lower-case it.
			pos += 1

			# Lower-case the rest of the sentence; stop right after its terminator.
			while pos < total:
				current = chars[pos]
				pos += 1
				if current in ".\n":
					break
				chars[pos - 1] = current.lower()

		return "".join(chars)
Пример #2
0
	def get_frame(self, url = None, name = None):
		"""
		Return a child Frame of the browser, selected by url or by name.

		The lookup is delegated to the ``get_frame`` of the main frame. The
		returned frame can be used to search for elements, etc. without
		leaving the page. Given a web page like this:

		.. code-block:: html

				<html><body>
					<iframe name="feisbuc" src="http://www.facebook.com/"/>
				</body></html>

		the following looks something up inside the frame: ::

			# get the facebook frame
			# (or frame = browser.get_frame(url='http://www.facebook.com/'))
			frame = browser.get_frame(name='feisbuc')

			# look for a div inside the facebook frame
			print(frame.find(text_contains = 'te ayuda').innerHTML)
			# prints Facebook te ayuda a comunicarte y compartir con las personas que conoces.

		"""
		# Both selectors are optional strings; url is validated first.
		for selector in (url, name):
			check_if_any_type(selector, [str, NoneType])

		main_frame = self.get_main_frame()
		return main_frame.get_frame(url, name)
Пример #3
0
	def run(self):
		"Base method that runs the parser and then every registered postaction"
		exit_code = 1
		try:
			outcome = self._run()

			check_if_any_type(outcome, [int, int, bool, NoneType])
			# a missing return value counts as exit code 0
			exit_code = 0 if outcome is None else int(outcome)
		except BaseException:
			# same reach as a bare except: any failure becomes exit code 1
			traceback.print_exc()
			exit_code = 1
		finally:
			context = self.__context
			context.logger.mark_end()

			total = len(context.postactions.postactions)
			if total > 0:
				position = 0
				for postaction in context.postactions.postactions:
					position += 1
					self.log("Corriendo postaction %s %d/%d" % (
						postaction.get_name(), position, total))

					# a failing postaction is reported but does not stop the others
					try:
						postaction.run(context, exit_code)
					except BaseException:
						print("Error al correr postaction", file=sys.stderr)
						traceback.print_exc()

				self.log("Parser terminado")

		return exit_code
Пример #4
0
	def __init__(self,
				url,
				parse = None,
				navigate = None,
				end_on_error = True,
				priority = None,
				**extra_attributes
				):
		"""
		Build a page for *url*.

		:param url: address of the page
		:param parse: optional function or bound method, resolved through
			``__get_func``
		:param navigate: optional function or bound method, resolved through
			``__get_func``
		:param end_on_error: stored as a bool
		:param priority: integer between ``PRIORITY_MAX`` and
			``PRIORITY_MIN`` (both included); ``PRIORITY_MIN`` when omitted
		:param extra_attributes: set on the instance, except for names the
			instance already has
		:raises ValueError: if *priority* is outside the allowed range
		"""
		self.__url = url

		# each callback is stored as a one-element list plus an "is method" flag
		parse_is_method, parse_target = self.__get_func(parse)
		self.__parse_is_method = parse_is_method
		self.__parse_func = [ parse_target ]

		navigate_is_method, navigate_target = self.__get_func(navigate)
		self.__navigate_is_method = navigate_is_method
		self.__navigate_func = [ navigate_target ]

		# validate the priority and fall back to the minimum one
		check_if_any_type(priority, (Integral, NoneType))
		if priority is None:
			priority = self.PRIORITY_MIN
		allowed = range(self.PRIORITY_MAX, self.PRIORITY_MIN + 1)
		if priority not in allowed:
			raise ValueError("priority must be beetween %d and %d" % (
				self.PRIORITY_MAX, self.PRIORITY_MIN))
		self.__priority = priority

		self.__parser_instance = None
		self.__end_on_error = bool(end_on_error)

		# extra attributes never overwrite an existing one
		for attr_name, attr_value in extra_attributes.items():
			if hasattr(self, attr_name):
				continue
			setattr(self, attr_name, attr_value)
Пример #5
0
	def add_page(self, page_handler_parent_or_id, page):
		"""
		Add a page as a child of a given page and notify the listeners.

		:param page_handler_parent_or_id: None (recorded as parent ``-1``),
			a PageHandler (its ``id`` is used) or an integer id
		:param page: the Page to store
		:raises ValueError: if the parent argument has any other type
		"""
		with self.__lock:
			check_if_any_type(page, Page)

			parent = page_handler_parent_or_id
			if parent is not None and not isinstance(parent, (PageHandler, int)):
				raise ValueError("page_handler_parent_or_id has invalid type") 

			if parent is None:
				parent_id = -1
			else:
				parent_id = parent.id if isinstance(parent, PageHandler) else parent

			connection = self.__get_connection()
			cursor = connection.cursor()

			# the serialized copy is stored without its parser instance
			detached = copy(page)
			detached._set_parser_instance(None)
			page_blob = dumps(detached)

			row = (page.url, parent_id, PageHandler.PageState.Pending,
				 page.priority, page_blob)
			cursor.execute("""
				INSERT INTO pages	(url, parent, state, priority, page_data)
				VALUES				(?, ?, ?, ?, ?)
			""", row)
			connection.commit()

		# tell the listeners, outside the main lock
		handler = PageHandler(cursor.lastrowid, page, parent_id, self)
		with self.__add_page_listeners_lock:
			for listener in self.__add_page_listeners:
				listener.on_add_page(self, handler)
Пример #6
0
	def __init__(self, host, port):
		"Create the proxy for *host* (a string) and *port* (an integer)"
		check_if_any_type(host, str)
		check_if_any_type(port, [int,int])

		BaseProxy.__init__(self)
		self.__host, self.__port = host, port
Пример #7
0
	def __init__(self, db_path, start_func, parser):
		"""
		Set up the page manager backed by the database file at *db_path*.

		:param db_path: path of the database file
		:param start_func: function or method handed to ``__init_db`` to
			start from a fresh database, or None to continue with the
			existing one
		:param parser: parser kept by the manager
		:raises ValueError: if *start_func* is None and *db_path* does not
			exist
		"""
		PageManagerBase.__init__(self)

		check_if_any_type(db_path, str)
		check_if_any_type(start_func, (NoneType, FunctionType, MethodType))

		self.__parser = parser
		self.__db_path = db_path

		self.__connection = None
		self.__connections_lock = Lock()

		self.__add_page_listeners = []
		self.__add_page_listeners_lock = Lock()

		self.__lock = Lock()

		if start_func is None:
			# continuing a previous run requires the old database file
			if not os.path.exists(db_path):
				raise ValueError(
					"Continue last option specified, but '%s' was not found "
					"to recover items" % db_path)
			self.__prepare_db()
			return

		# a fresh start discards any previous database file
		if os.path.exists(db_path):
			os.remove(db_path)
		self.__init_db(start_func)
Пример #8
0
	def remove_all_html_tags(self, text):
		"""
		Same as :func:`remove_html_tags`, but strips every HTML tag.

		Anything from a ``<`` up to the next ``>`` on the same line is
		removed; the text between tags is kept.

		:param text: the string to strip
		:return: *text* without the tags
		"""
		check_if_any_type(text, [str, str])

		# Raw string: '\/' is not a valid escape sequence in a plain literal,
		# and '/' needs no escaping in a regular expression anyway.
		return re.sub(r'</?.*?/?>', '', text, flags=re.U)
Пример #9
0
	def __init__(self, context, parser):
		"Keep the context and the parser, and expose two of the parser's methods"
		from .page_parser import PageParser
		check_if_any_type(parser, PageParser)

		self.__context = context
		self.__parser = parser

		# shortcuts to the parser's own bound methods
		self.log = parser.log
		self.get_paralel_num = parser.get_paralel_num

		self.__page_manager = None
Пример #10
0
	def __get_func(self, func):
		"""
		Classify a callback given as a function, a bound method or None.

		:return: ``(False, func)`` for a plain function, ``(True, name)`` for
			a bound method (*name* is the name of its underlying function)
			and ``(False, None)`` otherwise
		"""
		check_if_any_type(func, (NoneType, FunctionType, MethodType))

		if isinstance(func, MethodType):
			# a bound method must belong to a PageParser; only its name is kept
			check_if_any_type(func.__self__, PageParser)
			return True, func.__func__.__name__

		if isinstance(func, FunctionType):
			return False, func

		return False, None
Пример #11
0
	def subscribe_preload_page(self, subscriber):
		"""
		Subscribe to the page load event; *subscriber* must be an instance
		of the PreloadPageListener class.
		"""
		from sdf.browsers.browser import PreloadPageListener
		check_if_any_type(subscriber, PreloadPageListener)

		# the subscriber list is only touched while holding its lock
		with self.__preload_page_subscribers_lock:
			subscribers = self.__preload_page_subscribers
			subscribers.append(subscriber)
Пример #12
0
	def remove_html_tags(self, text, tags):
		"""
		Strip the given HTML tags from *text*, keeping the text they enclose.

		For every tag the bare opening, closing and self-closing forms are
		removed (``<b>``, ``</b>``, ``<b/>``); tags that carry attributes do
		not match. Each tag is inserted into the regular expression as is,
		so regex metacharacters in a tag are interpreted as such.

		:param text: the string to strip
		:param tags: iterable of tag names (strings)
		:return: *text* without the given tags
		"""
		check_if_any_type(text, [str, str])

		for tag in tags:
			check_if_any_type(tag, [str, str])
			# Raw string: '\/' is not a valid escape sequence in a plain
			# literal, and '/' needs no escaping in a regular expression.
			text = re.sub(r'</?%s/?>' % tag, '', text, flags=re.U)
		return text
Пример #13
0
	def __init__(self, logger, encoding, logging_level = None):
		"""
		Wrap *logger*.

		:param logger: a Logger instance
		:param encoding: stored as the public ``encoding`` attribute
		:param logging_level: integer level; ``Logger.MessageLevel.Info``
			when omitted
		"""
		from sdf import Logger
		check_if_any_type(logger, Logger)
		check_if_any_type(logging_level, (NoneType, int, int))

		self.__logger = logger
		self.__pending_line = ""
		self.__logging_level = (
			Logger.MessageLevel.Info if logging_level is None else logging_level)

		self.encoding = encoding
Пример #14
0
	def write(self, data, color = None, bold = False):
		"""
		Write characters to the console output by emitting ``write_signal``
		with the text, its color and its bold flag.

		*data* is converted with ``str``; a missing color means ``'black'``.
		"""
		check_if_any_type(color, [ NoneType, str, str ])

		text_color = 'black' if color is None else color
		signal = QtCore.SIGNAL('write_signal(QString, QString, bool)')
		self.emit(signal, str(data), text_color, bold)
Пример #15
0
	def __init__(self, to, project = None):
		"""
		Create the postaction.

		:param to: a string or an iterable of strings; a single value is
			wrapped in a list
		:param project: optional project name (string)
		:raises ValueError: if any element of *to* is not a string
		"""
		PostAction.__init__(self)

		# str defines __iter__ in Python 3, so it must be tested explicitly;
		# otherwise a lone string is iterated character by character and
		# stored unwrapped.
		if isinstance(to, str) or not hasattr(to, '__iter__'):
			to = [ to ]
		for t in to:
			if not isinstance(t, str):
				raise ValueError("to debe ser una lista de strings o un string")

		check_if_any_type(project, [NoneType, str])

		self.__project = project
		self.__to = to
Пример #16
0
	def clean_text(self, text, delete_enters=True):
		"""
		Remove every ``'\\r'``, every run of repeated whitespace and every
		tab from a text. When *delete_enters* is **True** every ``'\\n'`` is
		removed as well.

		The result is stripped of leading and trailing whitespace.
		"""
		check_if_any_type(text, [str, str])

		# line breaks collapse to a space or to a single newline
		newline_replacement = ' ' if delete_enters else '\n'

		cleaned = self.__re_enters2.sub('', text)
		cleaned = self.__re_enters.sub(newline_replacement, cleaned)
		cleaned = self.__re_tabs.sub('', cleaned)
		cleaned = self.__re_whitespaces.sub(' ', cleaned)
		return cleaned.strip()
Пример #17
0
    def parse_range(date_str, patterns, default_year=None):
        """
        Same as :func:`parse`, but for a date string that spans a range of
        two dates.

        Example ::

            patterns = '<d> al <d> de <m> del <y>'
            DateHelper.parse_range('3 al 4 de septiembre del 2010', patterns)

        returns a pair of *datetime* objects with the values
        ( 3/9/2010, 4/9/2010 )

        The patterns are tried in order and the first one that matches is
        used. An end field that the match does not provide falls back to
        the corresponding start field; *default_year* (the current year
        when omitted) is handed to the day/month/year preparation step.

        :raises ValueError: if *patterns* is empty or none of them matches
        """
        check_if_any_type(date_str, [str, str])

        if isinstance(patterns, str):
            patterns = [patterns]
        if len(patterns) == 0:
            raise ValueError("At least one pattern must be defined")

        if default_year is None:
            default_year = datetime.now().year

        search_flags = re.S | re.M | re.I
        for raw_pattern in patterns:
            regex = DateHelper.__create_pattern(raw_pattern)
            found = re.search(regex, date_str, search_flags)
            if not found:
                continue

            fields = found.groupdict()
            start_day = fields["day0"]
            start_month = fields["month0"]
            start_year = fields.get("year0")
            end_day = fields.get("day1", start_day)
            end_month = fields.get("month1", start_month)
            end_year = fields.get("year1", start_year)

            d0, m0, y0 = DateHelper.__prepare_day_month_year(
                start_day, start_month, start_year, default_year)
            d1, m1, y1 = DateHelper.__prepare_day_month_year(
                end_day, end_month, end_year, default_year)

            return (datetime(y0, m0, d0), datetime(y1, m1, d1))

        # reached only when no pattern matched
        raise ValueError("Invalid date: " + date_str)
Пример #18
0
	def clean_html(self, text):
		"""
		Clean a piece of HTML code through the following steps

		#. remove the HTML comments <!-- -->
		#. remove the header <head></head>
		#. remove the <script></script> elements
		#. remove the attributes of the html tags

		For example ::

			html = \"""
				<html>
				<head><title>Algo</title></head>
				<body><div name="limpiame">Texto del div</div>
				<script type="javascript">var a = 1</script>
				</body></html> 
			\"""
			print(self.text_helper.clean_html(html))

		gives as a result

		.. code-block:: html

			<html>

			<body><div>Texto del div</div>

			</body></html> 

		"""
		check_if_any_type(text, [str, str])

		# (pattern, replacement) pairs, applied strictly in this order
		steps = (
			(self.__extract_html_comments, self.__replace_html_comments),
			(self.__extract_html_head, self.__replace_html_head),
			(self.__extract_html_scripts, self.__replace_html_scripts),
			(self.__extract_html_tag_attributes1, self.__replace_tag_attributes1),
			(self.__extract_html_tag_attributes2, self.__replace_tag_attributes2),
		)
		for extractor, replacement in steps:
			text = extractor.sub(replacement, text)
		return text
Пример #19
0
	def set_proxy(self, host = "", port = 80):
		"""
		Point the HTTP and HTTPS proxy at the given host and port.

		An empty *host* clears the proxy. *port* may be an integer or a
		string holding one. The opener is reset in both cases.
		"""
		check_if_any_type(host, str)
		check_if_any_type(port, [ str, int, int ])

		# the port is converted before the host is looked at
		port_number = int(port) if isinstance(port, str) else port

		if host:
			address = "%s:%d" % (host, port_number)
			self.__proxy = {'http' : address, 'https' : address}
		else:
			self.__proxy = {}

		self.__reset_opener()
Пример #20
0
    def parse(date_str, patterns, default_year=None):
        """
        Parse a date according to the given patterns and return a datetime
        object.
        The patterns may contain the following placeholders
        <m> matches the month (its name or its numeric representation)
        <y> matches the year (a number)
        <d> matches the day (a number)
        <*> matches whitespace
        any other character is taken literally

        An example is ::

            patterns = '<d> de <m> del <y>'
            DateHelper.parse('3 de septiembre del 2010', patterns)

        which returns a *datetime* object with the value 3/9/2010

        The patterns are tried in order and the first one that matches is
        used; *default_year* (the current year when omitted) is handed to
        the day/month/year preparation step.

        :raises ValueError: if *patterns* is empty or none of them matches
        """
        check_if_any_type(date_str, [str, str])

        if isinstance(patterns, str):
            patterns = [patterns]
        if len(patterns) == 0:
            raise ValueError("At least one pattern must be defined")

        if default_year is None:
            default_year = datetime.now().year

        search_flags = re.S | re.M | re.I
        for raw_pattern in patterns:
            regex = DateHelper.__create_pattern(raw_pattern)
            found = re.search(regex, date_str, search_flags)
            if not found:
                continue

            fields = found.groupdict()
            day, month, year = DateHelper.__prepare_day_month_year(
                fields["day0"], fields["month0"], fields.get("year0"), default_year
            )
            return datetime(year, month, day)

        # reached only when no pattern matched
        raise ValueError("Invalid date: " + date_str)
Пример #21
0
	def download(self, url, destination, dir = False):
		"""
		Download a file from the given url to the given destination, creating
		the directory that will hold the file.

		If *dir* is True, *destination* is the directory to save into and the
		file name is taken from the last path segment of the url (with the
		characters forbidden in Windows file names replaced by ``'-'``).

		If *dir* is False, *destination* is treated as the file name and the
		data is saved under that name.

		:raises ValueError: if the parent path of the destination exists but
			is not a directory
		"""
		# characters to replace in file names on Windows; the backslash is
		# written escaped because '\:' is not a valid escape sequence
		windows_forbidden_chars = '\\:*<>|?"/'

		check_if_any_type(destination, str)
		check_if_any_type(url, str)

		downloaded_data = self._download(url)

		if dir:
			parts = urlparse(url)
			file_name = parts[2].split('/')[-1] # the last path segment

			# replace the forbidden characters
			for char in windows_forbidden_chars:
				file_name = file_name.replace(char, '-')

			destination += os.path.sep + file_name

		# create the parent directory (nothing to do for the current directory)
		dirname = os.path.dirname(destination)
		if dirname != '':
			if os.path.exists(dirname) and not os.path.isdir(dirname):
				raise ValueError("file already exists, but not a directory")
			# exist_ok: do not fail if the directory shows up between the
			# check above and the creation
			os.makedirs(dirname, exist_ok=True)

		# the with block closes the file even if the write fails
		with open(destination, "wb") as local:
			local.write(downloaded_data)
Пример #22
0
	def set_processed_from(self, min_item_num, processed):
		"""
		Set the processed attribute to the given value on every item whose
		number is <min_item_num> or greater.

		*processed* is reduced to a plain bool before it is stored.
		"""
		check_if_any_type(min_item_num, int)
		flag = bool(processed)

		with self.__lock:
			connection = self.__get_connection()

			connection.execute("""
			UPDATE	raw_items
			SET		processed = ?
			WHERE	num >= ?
			""",(flag, min_item_num))
			connection.commit()
			connection.close()
Пример #23
0
	def __init__(self, id, item, parent):
		"Store the integer id, the Item and the parent PageHandler after type-checking them"
		# checked in the order id, parent, item
		check_if_any_type(id, (int, int))
		check_if_any_type(parent, (PageHandler))
		check_if_any_type(item, Item)

		self.__id = id
		self.__parent = parent
		self.__item = item
Пример #24
0
	def __init__(self, buf, url, code, msg, headers):
		"""
		Build the response on top of ``urllib.request.addinfourl``.

		:param buf: body as bytes, exposed through a BytesIO
		:param url: url of the response (string)
		:param code: integral status code
		:param msg: message, stored as ``self.msg``
		:param headers: dict wrapped in ``_AboutProtocolHeader``
		"""
		check_if_any_type(buf, bytes)
		check_if_any_type(url, str)
		check_if_integral(code)
		check_if_any_type(msg, str)
		check_if_any_type(headers, dict)

		body_stream = BytesIO(buf)
		header_object = _AboutProtocolHeader(headers)
		urllib.request.addinfourl.__init__(
			self, body_stream, header_object, url, code)

		self.msg = msg
Пример #25
0
def Enum(*names, **namevals):
	"""
	Build an enumeration object.

	Positional names receive their position (starting at 0) as value;
	keyword names receive the value given explicitly.

	some examples are:
	e = Enum('a', 'b', 'c')
	e = Enum('a', 'b', c = 2)

	e = Enum(
			'a',
			b = 1,
			c = 2,
			d = 1,
			effa = 0
			)

	e = Enum(
			eggs = 0,
			ham = 1,
			spam = 2,
			nuts = 45
	)

	"""
	assert names or namevals, "Empty enums are not supported"

	enum_type = _EnumClass()

	# positional names: value is the position
	for position, name in enumerate(names):
		check_if_any_type(name, str)
		enum_type._add_value(name, position)

	# keyword names: value is the one supplied
	for name, value in namevals.items():
		enum_type._add_value(name, value)

	return enum_type
Пример #26
0
	def __init__(self, yield_func, conn_factory, sql_items, count_sql):
		"""
		Keep the pieces needed to query the items.

		:param yield_func: stored as is (not type-checked here)
		:param conn_factory: a ``_ConnFactory``
		:param sql_items: SQL statement (string)
		:param count_sql: SQL statement (string)
		"""
		check_if_any_type(conn_factory, _ConnFactory)
		for statement in (sql_items, count_sql):
			check_if_any_type(statement, str)

		self.__yield_func = yield_func
		self.__conn_factory = conn_factory
		self.__sql_items = sql_items
		self.__count_sql = count_sql
		# kept last, after every other attribute is in place
		self.__length = self.__get_length()
Пример #27
0
	def __init__(self, id, page, parent, page_manager):
		"""
		Create the handler of a stored page.

		:param id: integer id of the page
		:param page: the Page; it receives the parser of *page_manager*
		:param parent: None (kept as ``-1``), a PageHandler (its ``id`` is
			kept) or an integer id
		:param page_manager: the DefaultPageManager that owns the page

		The page state starts as ``PageState.Pending``.
		"""
		check_if_any_type(id, (int, int))
		self.__id = id

		# the parent is always kept as an id
		check_if_any_type(parent, (NoneType, PageHandler, int, int))
		if parent is None:
			parent_id = -1
		else:
			parent_id = parent.id if isinstance(parent, PageHandler) else parent
		self.__parent = parent_id

		check_if_any_type(page_manager, DefaultPageManager)
		self.__page_manager = page_manager

		check_if_any_type(page, Page)
		self.__page = page
		page._set_parser_instance(page_manager.get_parser())

		self.__page_state = self.PageState.Pending
Пример #28
0
	def execute(self,
			args=None,
			options = None,
			context = None,
			bootstrap_event_handler = None):
		"""
		Run a parser according to the options that were passed.

		Missing *options* default to ``Options()`` and a missing *context*
		to ``Context(options)``; the defaults are filled in before the type
		checks.
		"""
		options = Options() if options is None else options
		context = Context(options) if context is None else context

		check_if_any_type(bootstrap_event_handler, [NoneType, BootstrapEventHandler])
		check_if_any_type(options, Options)
		check_if_any_type(context, Context)

		self.__run(context, options, args, bootstrap_event_handler)
Пример #29
0
	def __init__(self, url, method = 'GET', body = '', headers = None):
		"""
		Describe a request.

		:param url: target url (string)
		:param method: one of ``SupportedMethods``
		:param body: a string used as is, or a dict that is url-encoded
		:param headers: dict of headers; a new empty dict when omitted
		:raises ValueError: if *method* is not in ``SupportedMethods``
		"""
		# A dict literal as default would be created once and shared by every
		# instance that omits the argument, so None is used as the sentinel.
		if headers is None:
			headers = {}

		check_if_any_type(url, str)
		check_if_any_type(body, (str, dict))
		check_if_any_type(headers, dict)

		if method not in self.SupportedMethods:
			# the placeholder is required: without it the % operator raises
			# TypeError instead of producing the message
			raise ValueError("Method must be one of the following: %s" %
					', '.join(self.SupportedMethods))

		if isinstance(body, str):
			self.__body = body
		else:
			self.__body = urlencode(body)

		self.__url = url
		self.__method = method

		self.__headers = headers
Пример #30
0
	def _set_page_parser_driver(self, driver):
		"Set the driver used to run the PageParser"
		# imported inside the method, not at module level
		from sdf import BasePageParserDriver as driver_base
		check_if_any_type(driver, driver_base)
		self.__page_parser_driver = driver