def make_sentence(text):
    """
    Convert *text* to sentence case.

    Every token delimited by ``'.'`` or ``'\\n'`` gets its first alphabetic
    character upper-cased and every following character lower-cased.
    Characters that precede the first letter of a token (spaces, digits,
    punctuation) are left untouched.

    Returns the converted text as a new string.
    """
    check_if_any_type(text, [str, str])
    # str is immutable, so the conversion is done on a list of characters
    chars = list(text)
    at_token_start = True
    for pos, char in enumerate(chars):
        if char in ".\n":
            # a delimiter closes the current token; the next letter starts one
            at_token_start = True
        elif at_token_start:
            if char.isalpha():
                chars[pos] = char.upper()
                at_token_start = False
        else:
            chars[pos] = char.lower()
    return "".join(chars)
def get_frame(self, url = None, name = None):
    """
    Return a Frame object, child of the browser, for a given url or name.

    The frame can be used to look for elements, etc. without leaving the
    page. Given a web page like this one:

    .. code-block:: html

        <html><body>
        <iframe name="feisbuc" src="http://www.facebook.com/"/>
        </body></html>

    the following can be done to search for something inside the frame:

    ::

        # get the facebook frame
        # or alternatively
        # frame = browser.get_frame(url='http://www.facebook.com/')
        frame = browser.get_frame(name='feisbuc')
        # look for a div inside the facebook frame
        print frame.find(text_contains = 'te ayuda').innerHTML
        # prints Facebook te ayuda a comunicarte y compartir con las
        # personas que conoces.

    Both *url* and *name* must be a str or None; the lookup itself is
    delegated to the main frame.
    """
    for selector in (url, name):
        check_if_any_type(selector, [str, NoneType])
    main_frame = self.get_main_frame()
    return main_frame.get_frame(url, name)
def run(self):
    """
    Base method to run the parser.

    Calls ``self._run()`` and turns its result into an int exit code
    (``None`` becomes 0). Any exception raised while doing so is printed
    and the exit code becomes 1. Afterwards the logger end mark is set and
    every registered postaction is run with the exit code; a failing
    postaction is reported on stderr and does not stop the others.

    Returns the exit code.
    """
    res = 1
    try:
        res = self._run()
        check_if_any_type(res, [int, int, bool, NoneType])
        res = 0 if res is None else int(res)
    except:
        # deliberately broad: whatever is raised becomes exit code 1
        traceback.print_exc()
        res = 1
    finally:
        self.__context.logger.mark_end()
        postactions = self.__context.postactions.postactions
        total = len(postactions)
        for position, postaction in enumerate(postactions, 1):
            self.log("Corriendo postaction %s %d/%d" % (
                postaction.get_name(), position, total))
            try:
                postaction.run(self.__context, res)
            except:
                # a broken postaction must not prevent the following ones
                print("Error al correr postaction", file=sys.stderr)
                traceback.print_exc()
        self.log("Parser terminado")
    return res
def __init__(self, url, parse = None, navigate = None, end_on_error = True,
             priority = None, **extra_attributes ):
    """
    Build a page for *url*.

    *parse* and *navigate* are resolved through ``__get_func`` and stored
    together with the flag that tells whether each one is a method.
    *priority* must be an integral value between ``PRIORITY_MAX`` and
    ``PRIORITY_MIN`` (both included); None means ``PRIORITY_MIN``.
    *end_on_error* is stored as a bool. Every extra keyword argument is set
    as an attribute unless the instance already has one with that name.

    Raises ValueError when *priority* is out of range.
    """
    self.__url = url

    parse_is_method, parse_func = self.__get_func(parse)
    self.__parse_is_method = parse_is_method
    self.__parse_func = [ parse_func ]

    navigate_is_method, navigate_func = self.__get_func(navigate)
    self.__navigate_is_method = navigate_is_method
    self.__navigate_func = [ navigate_func ]

    # validate the page priority and assign it
    check_if_any_type(priority, (Integral, NoneType))
    if priority is None:
        priority = self.PRIORITY_MIN
    valid_priorities = range(self.PRIORITY_MAX, self.PRIORITY_MIN+1)
    if priority not in valid_priorities:
        raise ValueError("priority must be beetween %d and %d" % (
            self.PRIORITY_MAX, self.PRIORITY_MIN))
    self.__priority = priority

    self.__parser_instance = None
    self.__end_on_error = bool(end_on_error)

    # set the extra attributes, never overriding an existing one
    for attribute, value in extra_attributes.items():
        if hasattr(self, attribute):
            continue
        setattr(self, attribute, value)
def add_page(self, page_handler_parent_or_id, page):
    """
    Add a page as a child of a given page.

    *page_handler_parent_or_id* may be None (the parent id is stored as
    -1), a PageHandler (its ``id`` is used) or an int id. The page is
    inserted in the ``pages`` table in the Pending state, and every
    registered listener gets ``on_add_page`` called with the new
    PageHandler.

    Raises ValueError when the parent has any other type.
    """
    # NOTE(review): everything, including the listener notification, is
    # assumed to run while self.__lock is held - confirm the lock scope.
    with self.__lock:
        check_if_any_type(page, Page)

        parent = page_handler_parent_or_id
        if parent is None:
            parent_id = -1
        elif isinstance(parent, PageHandler):
            parent_id = parent.id
        elif isinstance(parent, int):
            parent_id = parent
        else:
            raise ValueError("page_handler_parent_or_id has invalid type")

        connection = self.__get_connection()
        cursor = connection.cursor()

        # the serialized form is built from a copy without parser instance
        detached_page = copy(page)
        detached_page._set_parser_instance(None)
        page_data = dumps(detached_page)

        row = (page.url, parent_id, PageHandler.PageState.Pending,
               page.priority, page_data)
        cursor.execute(""" INSERT INTO pages (url, parent, state, priority, page_data) VALUES (?, ?, ?, ?, ?) """, row)
        connection.commit()

        # tell the listeners about the new page
        page_handler = PageHandler(cursor.lastrowid, page, parent_id, self)
        with self.__add_page_listeners_lock:
            for listener in self.__add_page_listeners:
                listener.on_add_page(self, page_handler)
def __init__(self, host, port):
    """
    Store the proxy *host* (a str) and *port* (an int) after checking
    their types and initialising the BaseProxy part of the object.
    """
    check_if_any_type(host, str)
    check_if_any_type(port, [int,int])

    BaseProxy.__init__(self)
    self.__host, self.__port = host, port
def __init__(self, db_path, start_func, parser):
    """
    Create the page manager backed by the database file at *db_path*.

    When *start_func* is given, an existing file at *db_path* is deleted
    and the database is initialised with ``__init_db(start_func)``. When
    it is None, the run is resumed from the existing file.

    Raises ValueError when resuming and *db_path* does not exist.
    """
    PageManagerBase.__init__(self)
    check_if_any_type(db_path, str)
    check_if_any_type(start_func, (NoneType, FunctionType, MethodType))

    self.__parser = parser
    self.__db_path = db_path

    self.__connections_lock = Lock()
    #self.__connections = {}
    self.__connection = None

    self.__add_page_listeners_lock = Lock()
    self.__add_page_listeners = []
    self.__lock = Lock()

    if start_func is not None:
        # starting over with a new database: delete the old file first
        if os.path.exists(db_path):
            os.remove(db_path)
        self.__init_db(start_func)
        return

    if not os.path.exists(db_path):
        msg = ("Continue last option specified, but '%s' was not found "
               "to recover items")
        raise ValueError(msg % db_path)
    # NOTE(review): __prepare_db is assumed to belong to the resume path
    # only - confirm against the original layout.
    # checks that the database exists
    self.__prepare_db()
def remove_all_html_tags(self, text):
    """
    Same as :func:`remove_html_tags` but removes every HTML tag.

    Anything between a ``<`` and the closest following ``>`` on the same
    line is deleted ('.' does not match line breaks here).

    Returns the text without the tags.
    """
    check_if_any_type(text, [str, str])
    # raw string: the former '\/' is not a valid string escape sequence and
    # a plain '/' matches exactly the same text
    return re.sub(r'</?.*?/?>', '', text, flags=re.U)
def __init__(self, context, parser):
    """
    Keep the *context* and the *parser* (which must be a PageParser) and
    expose the parser's ``log`` and ``get_paralel_num`` directly on this
    object. The page manager starts unset.
    """
    from .page_parser import PageParser
    check_if_any_type(parser, PageParser)

    self.__parser = parser
    self.__context = context
    self.__page_manager = None

    # shortcuts to the parser helpers
    self.log = parser.log
    self.get_paralel_num = parser.get_paralel_num
def __get_func(self, func):
    """
    Classify a callback and return the pair ``(is_method, target)``.

    * plain function: ``(False, func)``
    * bound method, whose owner must be a PageParser:
      ``(True, <name of the underlying function>)``
    * None: ``(False, None)``
    """
    check_if_any_type(func, (NoneType, FunctionType, MethodType))

    if isinstance(func, MethodType):
        # bound method: only the name is kept, not the bound object
        check_if_any_type(func.__self__, PageParser)
        return True, func.__func__.__name__
    if isinstance(func, FunctionType):
        return False, func
    return False, None
def subscribe_preload_page(self, subscriber):
    """
    Subscribe to the page load event; *subscriber* must be an instance of
    the PreloadPageListener class.
    """
    from sdf.browsers.browser import PreloadPageListener
    check_if_any_type(subscriber, PreloadPageListener)

    # the subscriber list is only touched while holding its lock
    with self.__preload_page_subscribers_lock:
        subscribers = self.__preload_page_subscribers
        subscribers.append(subscriber)
def remove_html_tags(self, text, tags):
    """
    Remove the given HTML tags from the text, so that every element that
    uses one of them is replaced by the text it contains.

    For each name in *tags* the forms ``<tag>``, ``</tag>`` and ``<tag/>``
    are deleted. Tags carrying attributes are not matched, and the tag
    name is inserted into the regular expression as is (it is not escaped).

    Returns the text without those tags.
    """
    check_if_any_type(text, [str, str])
    for tag in tags:
        check_if_any_type(tag, [str, str])
        # raw string: the former '\/' is not a valid string escape sequence
        # and a plain '/' matches exactly the same text
        text = re.sub(r'</?%s/?>' % tag, '', text, flags=re.U)
    return text
def __init__(self, logger, encoding, logging_level = None):
    """
    Keep the *logger* (a Logger) and the *encoding*, starting with an empty
    pending line. *logging_level* must be an int or None; None means
    ``Logger.MessageLevel.Info``.
    """
    from sdf import Logger
    check_if_any_type(logger, Logger)
    check_if_any_type(logging_level, (NoneType, int, int))

    self.__logger = logger
    self.__pending_line = ""
    self.__logging_level = (
        Logger.MessageLevel.Info if logging_level is None else logging_level
    )
    self.encoding = encoding
def write(self, data, color = None, bold = False):
    """
    Write characters to the console output with the given format.

    The text is not written here: the ``write_signal`` signal is emitted
    with ``str(data)``, the color (``'black'`` when None) and the bold
    flag. Presumably the slot ``__write_data_slot`` does the actual
    writing; it is not visible from this method.
    """
    check_if_any_type(color, [ NoneType, str, str ])
    if color is None:
        color = 'black'
    signal = QtCore.SIGNAL('write_signal(QString, QString, bool)')
    self.emit(signal, str(data), color, bold)
def __init__(self, to, project = None):
    """
    Create the postaction for the recipients *to* and an optional
    *project* (a str or None).

    *to* may be a single string or an iterable of strings; a single value
    is stored wrapped in a one-element list.

    Raises ValueError when any recipient is not a string.
    """
    PostAction.__init__(self)
    # a str has __iter__ in Python 3, so it has to be detected explicitly;
    # otherwise a lone address would be kept as a bare string
    if isinstance(to, str) or not hasattr(to, '__iter__'):
        to = [ to ]
    for recipient in to:
        if not isinstance(recipient, str):
            raise ValueError("to debe ser una lista de strings o un string")
    check_if_any_type(project, [NoneType, str])
    self.__project = project
    self.__to = to
def clean_text(self, text, delete_enters=True):
    """
    Remove every ``'\\r'``, every repeated blank and every tab from a
    text. If *delete_enters* is **True** every ``'\\n'`` is deleted too.

    The work is done with the pre-compiled patterns ``__re_enters2``
    (matches removed), ``__re_enters`` (matches replaced by a space, or by
    a single ``'\\n'`` when *delete_enters* is false), ``__re_tabs``
    (matches removed) and ``__re_whitespaces`` (matches replaced by one
    space), in that order. The patterns are defined outside this method;
    what each one matches is taken from its name.

    Returns the cleaned text with surrounding blanks stripped.
    """
    check_if_any_type(text, [str, str])
    enter_replacement = ' ' if delete_enters else '\n'

    cleaned = self.__re_enters2.sub('', text)
    cleaned = self.__re_enters.sub(enter_replacement, cleaned)
    cleaned = self.__re_tabs.sub('', cleaned)
    cleaned = self.__re_whitespaces.sub(' ', cleaned)
    return cleaned.strip()
def parse_range(date_str, patterns, default_year=None):
    """
    Same as :func:`parse` but parses a date made of a range of two dates.

    Example ::

        patterns = '<d> al <d> de <m> del <y>'
        DateHelper.parse_range('3 al 4 de septiembre del 2010', patterns)

    returns a tuple of two *datetime* objects with the values
    ( 3/9/2010, 4/9/2010 )

    *patterns* is one pattern string or a sequence of them; they are tried
    in order and the first one that matches is used. A day, month or year
    group that the match does not provide for the second date is taken
    from the first date. *default_year* (the current year when None) is
    handed to the day/month/year preparation helper.

    Raises ValueError when *patterns* is empty or none of them matches.
    """
    check_if_any_type(date_str, [str, str])
    if isinstance(patterns, str):
        patterns = [patterns]
    if len(patterns) == 0:
        raise ValueError("At least one pattern must be defined")
    if default_year is None:
        default_year = datetime.now().year

    for pattern in patterns:
        regex = DateHelper.__create_pattern(pattern)
        match = re.search(regex, date_str, re.S | re.M | re.I)
        if match is None:
            continue

        found = match.groupdict()
        day0 = found["day0"]
        month0 = found["month0"]
        year0 = found.get("year0")
        # the second date falls back on the pieces of the first one
        day1 = found.get("day1", day0)
        month1 = found.get("month1", month0)
        year1 = found.get("year1", year0)

        day0, month0, year0 = DateHelper.__prepare_day_month_year(
            day0, month0, year0, default_year)
        day1, month1, year1 = DateHelper.__prepare_day_month_year(
            day1, month1, year1, default_year)

        first_date = datetime(year0, month0, day0)
        second_date = datetime(year1, month1, day1)
        return (first_date, second_date)

    raise ValueError("Invalid date: " + date_str)
def clean_html(self, text):
    """
    Clean a piece of HTML code through the following steps

    #. remove the HTML comments <!-- -->
    #. remove the header <head></head>
    #. remove the <script></script> elements
    #. remove the attributes of the html tags

    For example ::

        html = '''
        <html>
        <head><title>Algo</title></head>
        <body><div name="limpiame">Texto del div</div>
        <script type="javascript">var a = 1</script>
        </body></html>
        '''
        print self.text_helper.clean_html(html)

    gives as a result

    .. code-block:: html

        <html>
        <body><div>Texto del div</div>
        </body></html>

    Each step applies one pre-compiled pattern with its replacement; both
    are defined outside this method.
    """
    check_if_any_type(text, [str, str])
    steps = (
        (self.__extract_html_comments, self.__replace_html_comments),
        (self.__extract_html_head, self.__replace_html_head),
        (self.__extract_html_scripts, self.__replace_html_scripts),
        (self.__extract_html_tag_attributes1, self.__replace_tag_attributes1),
        (self.__extract_html_tag_attributes2, self.__replace_tag_attributes2),
    )
    for extractor, replacement in steps:
        text = extractor.sub(replacement, text)
    return text
def set_proxy(self, host = "", port = 80):
    """
    Set the HTTP proxy to a given host and port.

    *port* may be an int or a str holding a number. An empty *host*
    clears the proxy. The same ``host:port`` address is used for both
    http and https, and the opener is reset afterwards.
    """
    check_if_any_type(host, str)
    check_if_any_type(port, [ str, int, int ])
    if isinstance(port, str):
        port = int(port)

    proxy = {}
    if host:
        address = "%s:%d" % (host, port)
        proxy = { 'http' : address, 'https' : address }
    self.__proxy = proxy
    self.__reset_opener()
def parse(date_str, patterns, default_year=None):
    """
    Parse a date following the given patterns and return a datetime
    object.

    The patterns may contain the following pieces

        <m> matches the month (its name or its number)
        <y> matches the year (a number)
        <d> matches the day (a number)
        <*> matches blanks
        every other character is taken literally

    An example is ::

        patterns = '<d> de <m> del <y>'
        DateHelper.parse('3 de septiembre del 2010', patterns)

    which returns a *datetime* object with the value 3/9/2010

    *patterns* is one pattern string or a sequence of them; they are tried
    in order and the first one that matches is used. *default_year* (the
    current year when None) is handed to the day/month/year preparation
    helper.

    Raises ValueError when *patterns* is empty or none of them matches.
    """
    check_if_any_type(date_str, [str, str])
    if isinstance(patterns, str):
        patterns = [patterns]
    if len(patterns) == 0:
        raise ValueError("At least one pattern must be defined")
    if default_year is None:
        default_year = datetime.now().year

    for pattern in patterns:
        regex = DateHelper.__create_pattern(pattern)
        match = re.search(regex, date_str, re.S | re.M | re.I)
        if match is None:
            continue

        found = match.groupdict()
        day, month, year = DateHelper.__prepare_day_month_year(
            found["day0"], found["month0"], found.get("year0"),
            default_year)
        return datetime(year, month, day)

    raise ValueError("Invalid date: " + date_str)
def download(self, url, destination, dir = False):
    """
    Download the file at *url* to *destination*, creating the directory
    the file is stored in when needed.

    If *dir* is True, *destination* is the directory to save into and the
    file name is taken from the last segment of the url path, with the
    characters Windows forbids in file names replaced by ``'-'``. If *dir*
    is False, *destination* is the full name the file is saved with.

    Raises ValueError when the parent path exists but is not a directory.
    """
    # characters to replace in file names on windows
    windows_forbidden_chars = '\\:*<>|?"/'
    check_if_any_type(destination, str)
    check_if_any_type(url, str)

    downloaded_data = self._download(url)

    if dir:
        # index 2 of the parsed url is the path; its last segment is used
        file_name = urlparse(url)[2].split('/')[-1]
        for char in windows_forbidden_chars:
            file_name = file_name.replace(char, '-')
        # join instead of "+ sep +": an empty destination stays relative
        destination = os.path.join(destination, file_name)

    # create the parent directory; nothing to do for the current directory
    dirname = os.path.dirname(destination)
    if dirname != '':
        if os.path.exists(dirname) and not os.path.isdir(dirname):
            raise ValueError("file already exists, but not a directory")
        # exist_ok avoids failing when the directory shows up in between
        os.makedirs(dirname, exist_ok=True)

    # the context manager closes the file even when the write fails
    with open(destination, "wb") as local:
        local.write(downloaded_data)
def set_processed_from(self, min_item_num, processed):
    """
    Set the ``processed`` attribute of every item from item number
    *min_item_num* on to a given value.

    *processed* is taken by its truth value. The update on ``raw_items``
    is committed and the connection is closed afterwards, also when the
    update fails.
    """
    check_if_any_type(min_item_num, int)
    processed = bool(processed)
    with self.__lock:
        conn = self.__get_connection()
        try:
            conn.execute(""" UPDATE raw_items SET processed = ? WHERE num >= ? """,(processed, min_item_num))
            conn.commit()
        finally:
            # do not leak the connection when the update raises
            conn.close()
def __init__(self, id, item, parent):
    """
    Keep the *id* (an int), the *parent* (a PageHandler) and the *item*
    (an Item); each value is type-checked right before it is stored.
    """
    # identifier
    check_if_any_type(id, (int, int))
    self.__id = id

    # handler of the page this object hangs from
    check_if_any_type(parent, PageHandler)
    self.__parent = parent

    # wrapped item
    check_if_any_type(item, Item)
    self.__item = item
def __init__(self, buf, url, code, msg, headers):
    """
    Build the response from in-memory data.

    *buf* (bytes) becomes the readable body, *headers* (a dict) is wrapped
    in ``_AboutProtocolHeader``, *url* (a str) and *code* (an integral
    value) are handed to ``addinfourl`` and *msg* (a str) is stored in
    ``self.msg``.
    """
    check_if_any_type(buf, bytes)
    check_if_any_type(url, str)
    check_if_integral(code)
    check_if_any_type(msg, str)
    check_if_any_type(headers, dict)

    body_stream = BytesIO(buf)
    wrapped_headers = _AboutProtocolHeader(headers)
    urllib.request.addinfourl.__init__(
        self, body_stream, wrapped_headers, url, code)
    self.msg = msg
def Enum(*names, **namevals):
    """
    Build a class of enumerated values; some examples are:

        e = Enum('a', 'b', 'c')
        e = Enum('a', 'b', c = 2)
        e = Enum( 'a', b = 1, c = 2, d = 1, effa = 0 )
        e = Enum( eggs = 0, ham = 1, spam = 2, nuts = 45 )

    Positional names (which must be strings) get their position as value,
    starting at 0; keyword names get the value given for them.
    """
    assert names or namevals, "Empty enums are not supported"

    enum_type = _EnumClass()
    for position, name in enumerate(names):
        check_if_any_type(name, str)
        enum_type._add_value(name, position)
    for name, value in namevals.items():
        enum_type._add_value(name, value)
    return enum_type
def __init__(self, yield_func, conn_factory, sql_items, count_sql):
    """
    Keep the connection factory (a ``_ConnFactory``), the two SQL
    statements (strings) and *yield_func*, then compute the length right
    away with ``__get_length``. *yield_func* is stored without any type
    check.
    """
    check_if_any_type(conn_factory, _ConnFactory)
    check_if_any_type(sql_items, str)
    check_if_any_type(count_sql, str)

    self.__yield_func = yield_func
    self.__conn_factory = conn_factory
    self.__sql_items, self.__count_sql = sql_items, count_sql

    # computed last: every attribute above is already set at this point
    self.__length = self.__get_length()
def __init__(self, id, page, parent, page_manager):
    """
    Create the handler of a *page* that is stored under *id*.

    *parent* may be None (kept as -1), a PageHandler (its ``id`` is kept)
    or an int id. *page_manager* must be a DefaultPageManager; its parser
    is set as the parser instance of the page. The handler starts in the
    Pending state.
    """
    check_if_any_type(id, (int, int))
    self.__id = id

    # the parent is always stored as an id
    check_if_any_type(parent, (NoneType, PageHandler, int, int))
    if isinstance(parent, PageHandler):
        parent_id = parent.id
    elif parent is None:
        parent_id = -1
    else:
        parent_id = parent
    self.__parent = parent_id

    check_if_any_type(page_manager, DefaultPageManager)
    self.__page_manager = page_manager

    check_if_any_type(page, Page)
    self.__page = page
    page._set_parser_instance(page_manager.get_parser())

    self.__page_state = self.PageState.Pending
def execute(self, args=None, options = None, context = None,
            bootstrap_event_handler = None):
    """
    Run a parser according to the options that were given.

    A missing *options* is replaced by ``Options()`` and a missing
    *context* by ``Context(options)``; after the type checks everything
    is handed over to ``__run``.
    """
    options = Options() if options is None else options
    context = Context(options) if context is None else context

    check_if_any_type(bootstrap_event_handler,
                      [NoneType, BootstrapEventHandler])
    check_if_any_type(options, Options)
    check_if_any_type(context, Context)

    self.__run(context, options, args, bootstrap_event_handler)
def __init__(self, url, method = 'GET', body = '', headers = None):
    """
    Describe a request to *url*.

    *method* must be one of ``SupportedMethods``. *body* is either a
    string, stored as is, or a dict, stored url-encoded. *headers* is a
    dict; when it is omitted every instance gets its own empty dict.

    Raises ValueError when *method* is not supported.
    """
    check_if_any_type(url, str)
    check_if_any_type(body, (str, dict))
    # a literal {} default would be one dict shared by all the instances
    if headers is None:
        headers = {}
    check_if_any_type(headers, dict)
    if method not in self.SupportedMethods:
        # without the %s the formatting itself raised a TypeError
        raise ValueError("Method must be one of the following: %s" %
                         ', '.join(self.SupportedMethods))
    if isinstance(body, str):
        self.__body = body
    else:
        self.__body = urlencode(body)
    self.__url = url
    self.__method = method
    self.__headers = headers
def _set_page_parser_driver(self, driver):
    """
    Set the driver used to run the PageParser; *driver* must be an
    instance of BasePageParserDriver.
    """
    from sdf import BasePageParserDriver as expected_type
    check_if_any_type(driver, expected_type)
    self.__page_parser_driver = driver