def print_results_tda(file_name, data, key_word):
    """Render the TDA search results into the HTML template and open it.

    Args:
        file_name: path of the HTML template file to read.
        data: HTML fragment with the result rows; by convention its last
            three characters carry the record count (assumed from the
            slicing below — confirm against the producer of `data`).
        key_word: search term, used to build the output file name.
    """
    # Last 3 chars of `data` are treated as the record count (caller convention).
    tda_number_of_records = data[-3:]
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        html = f.read().replace("tda_data", data)
    html = html.replace("tda_number_of_records", tda_number_of_records)
    # Output file name
    file_name = Path(FOLDER_NAME + SLASH + key_word + FILE_LASTNAME + FILE_EXTENSION)
    # BUGFIX: the original only generated the report on one branch of the
    # exists() check, so the browser could open a stale or missing file.
    # Now any stale copy is removed and the report is always regenerated.
    if file_name.exists():
        try:
            file_name.unlink()
        except OSError as error:
            print("Error: {} - {}.".format(error.filename, error.strerror))
    create_data_files(FOLDER_NAME, key_word + FILE_LASTNAME, html)
    webbrowser.open(str(file_name), new=1, autoraise=True)
def _fill_section(html, data_token, count_token, data):
    """Substitute one institution's placeholders in the template.

    Replaces `data_token` with the result rows and `count_token` with the
    record count (the last 3 characters of `data`, by caller convention).
    Empty/None data falls back to "No hay registros" / "0".
    """
    if data:
        html = html.replace(data_token, data)
        html = html.replace(count_token, data[-3:])
    else:
        html = html.replace(data_token, "No hay registros")
        html = html.replace(count_token, "0")
    return html


def print_all_results(file_name, key_word, tda_data, poli_data, colma_data):
    """Render all three institutions' results into the template and open it.

    Args:
        file_name: path of the HTML template file to read.
        key_word: search term, used to build the output file name and the
            companion XML link.
        tda_data: HTML fragment for TDA results (may be empty/None).
        poli_data: HTML fragment for POLIJIC results (may be empty/None).
        colma_data: HTML fragment for COLMA results (may be empty/None).
    """
    # Read the base template
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        html = f.read()
    html = _fill_section(html, "tda_data", "tda_number_of_records", tda_data)
    html = _fill_section(html, "poli_data", "polijic_number_of_records", poli_data)
    html = _fill_section(html, "colma_data", "colma_number_of_records", colma_data)
    # Output file name
    file_name = Path(FOLDER_NAME + SLASH + key_word + FILE_LASTNAME + FILE_EXTENSION)
    # Link to the companion XML file
    xml_link = key_word + XML_LAST_NAME + XML_EXTENSION
    html = html.replace("xml_data", str(xml_link))
    # BUGFIX: the original only generated the report on one branch of the
    # exists() check, so the browser could open a stale or missing file.
    # Now any stale copy is removed and the report is always regenerated.
    if file_name.exists():
        try:
            file_name.unlink()
        except OSError as error:
            print("Error: {} - {}.".format(error.filename, error.strerror))
    create_data_files(FOLDER_NAME, key_word + FILE_LASTNAME, html)
    webbrowser.open(str(file_name), new=1, autoraise=True)
def crawl_page_for_search(url_to_crawl, key_word, folder_name):
    """Download one search-results page and persist its body.

    The institution label is inferred from the URL, and it also decides the
    HTTP verb: 'aleph' catalogs answer a GET, the others expect a POST.
    The response text is saved via create_data_files under the name
    "<key_word>-<institution>".  Any failure is logged to stdout.
    """
    try:
        url_text = str(url_to_crawl)
        # NOTE(review): verify=False disables TLS certificate validation
        # for every request below — confirm this is intentional.
        if 'aleph' in url_text:
            url_institution = 'POLIJIC'
            webpage = requests.get(url_to_crawl, verify=False)
        elif 'tdea' in url_text:
            url_institution = 'TDA'
            webpage = requests.post(url_to_crawl, verify=False)
        else:
            url_institution = 'COLMA'
            webpage = requests.post(url_to_crawl, verify=False)
        file_name = '-'.join((key_word, url_institution))
        create_data_files(folder_name, file_name, webpage.text)
    except Exception as e:
        print(str(e))
def crawl_page(thread_name):
    """Fetch the next pending URL from the Crawler queue and persist it.

    Marks the URL as processed (1) on success or as failed (0) on any
    error.  Prints a message and returns when no pending URL is available.

    Args:
        thread_name: label used only for console logging.
    """
    url_info = Crawler.fetch_url_info()
    if url_info is None:
        print('Error al obtener url')
        return
    # url_info layout (per the indexing below): (id, url, theme_id, institution)
    url_id = url_info[0]
    url_to_crawl = url_info[1]
    theme_id = url_info[2]
    url_institution = url_info[3]
    print(thread_name + ' Crawling ')
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        webpage = requests.get(url_to_crawl, verify=False)
        # BUGFIX: 'charset' is not a standalone HTTP response header, so
        # headers.get('charset') returned None and str(bytes, None) raised
        # TypeError, wrongly marking every URL as failed.  Fall back to the
        # encoding requests derived from Content-Type, then to utf-8.
        charset = webpage.headers.get('charset') or webpage.encoding or 'utf-8'
        content = str(webpage.content, charset)
        file_name = Crawler.get_theme(theme_id) + '-' + url_institution
        create_data_files(Crawler.folder_name, file_name, content)
        Crawler.update_url(1, url_id)
    except Exception as e:
        print(str(e))
        Crawler.update_url(0, url_id)