def list_pdf_v2(configs, save_dir, name_in_url=True, extract_name=False,
                add_date=False, try_overwite=False, no_overwrite=False,
                debug=False):
    """Download the PDFs linked from ``configs.webpage`` into ``save_dir``.

    Fetches and parses the page, has ``extract_info`` write url/name pairs
    to ``url_name.txt``, then downloads everything with ``get_files``.

    Args:
        configs: settings object; must expose ``webpage`` (URL to scrape)
            and ``sleep_time`` (delay between downloads) — TODO confirm
            full shape against the config module.
        save_dir: directory PDFs are written to; created if missing.
        name_in_url, extract_name, add_date, try_overwite, no_overwrite,
        debug: pass-through flags for ``extract_info`` / ``get_files``.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")
    # Start from a clean manifest; a stale url_name.txt would be appended to.
    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        pass
    extract_info(soup, configs, extract_name=extract_name)
    # Fix: debug / try_overwite / no_overwrite were accepted but silently
    # dropped; forward them exactly as list_pdf_v3 does.
    get_files(
        save_dir,
        configs.sleep_time,
        name_in_url=name_in_url,
        add_date=add_date,
        debug=debug,
        try_overwite=try_overwite,
        no_overwrite=no_overwrite,
    )
def decrypt_file(gpg: gnupg.GPG, targer_folder: str):
    """Decrypt every file in <targer_folder>/encrypted_files.

    Output goes to <targer_folder>/decrypted_files under the same base
    name. The passphrase comes from the folder's configuration in prod
    and is a fixed dummy value everywhere else.
    """
    config = utils.get_configuratation(
        path=os.path.join(targer_folder, 'configuration'))
    gpg = _get_decryption(gpg=gpg, configuration_kwargs=config)
    sources = utils.get_files(
        path=os.path.join(targer_folder, 'encrypted_files'))
    target_dir = os.path.join(targer_folder, 'decrypted_files')
    # Only production uses the real passphrase from the configuration.
    if settings.ENVIRONMENT == 'prod':
        passphrase = config.get('array')['passphrase']
    else:
        passphrase = 'wow'
    for source in sources:
        file_name = __set_file_name(path=source)
        logger.info(f'Decrypting file {file_name}')
        destination = os.path.join(target_dir, file_name)
        with open(source, 'rb') as handle:
            result = gpg.decrypt_file(
                file=handle, passphrase=passphrase, output=destination)
        logger.info('Decrypting done')
        if not result.ok:
            logger.info(f'{result.stderr}')
        else:
            logger.info(f'Status [{result.status}]')
def encrypt_file(gpg: gnupg.GPG, targer_folder: str):
    """Encrypt every file in <targer_folder>/decrypted_files.

    Imports the key material from the folder's configuration, trusts all
    resulting fingerprints, and writes each encrypted file to
    <targer_folder>/encrypted_files under the same base name.
    """
    config = utils.get_configuratation(
        path=os.path.join(targer_folder, 'configuration'))
    key_material = _get_decryption(configuration_kwargs=config)
    gpg.import_keys(key_material)
    keys = gpg.list_keys()
    # All imported keys are trusted so encryption never prompts.
    gpg.trust_keys(keys.fingerprints, 'TRUST_ULTIMATE')
    sources = utils.get_files(
        path=os.path.join(targer_folder, 'decrypted_files'))
    target_dir = os.path.join(targer_folder, 'encrypted_files')
    for source in sources:
        file_name = __set_file_name(path=source)
        logger.info(f'encrypting file {file_name}')
        destination = os.path.join(target_dir, file_name)
        with open(source, 'rb') as handle:
            result = gpg.encrypt_file(
                file=handle,
                recipients=keys.fingerprints,
                output=destination,
            )
        logger.info('encrypting done')
        if not result.ok:
            logger.info(f'{result.stderr}')
        else:
            logger.info(f'Status [{result.status}]')
# NOTE(review): this looks like a fragment lifted out of a larger scraping
# loop — `link`, `url`, `domain`, `save_dir` and `sleep_time` are not defined
# in this view and will raise NameError if this runs as-is. TODO confirm the
# enclosing loop (presumably `for link in soup.findAll("a")` or similar).
try:
    # Collect the text of every <span class="...hyperlink..."> as a
    # candidate display name for the current link.
    name_table = []
    for link_2 in soup.findAll("span"):
        if "hyperlink" in str(link_2.get("class")):
            name_table.append(link_2.string)
    # First matching span wins; IndexError (empty table) is NOT caught here —
    # only KeyError is. TODO confirm that is intentional.
    name = name_table[0]
    # print(link)
except KeyError:
    print("KeyError")
    pass
# print("Else " + name )
# name = name[:name.rindex('.')]
# Append "<url>, <name>.pdf" to the manifest consumed by get_files.
with open("url_name.txt", "a") as output:
    if "https" in link["href"]:
        output.write(url + ", " + name.strip("/") + ".pdf" + "\n")
    else:
        # Uncomment following line if domain is not in href, and comment out line above
        output.write(domain + url + ", " + name.strip("/") + ".pdf" + "\n")
print("Done")
# Remove any stale manifest before regenerating it and downloading.
try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass
extract_info(soup)
get_files(save_dir, sleep_time)
# import etl.py
def list_pdf_v3(
    configs,
    save_dir,
    debug=False,
    delete=True,
    important=False,
    try_overwite=False,
    name_in_url=True,
    add_date=False,
    extract_name=False,
    no_overwrite=False,
):
    """Scrape ``configs.webpage``, filter the link manifest, download PDFs.

    Builds ``url_name.txt`` via ``extract_info``, rewrites it to
    ``2url_name.txt`` keeping either the lines that match
    ``configs.important`` (when ``important`` is truthy) or the lines
    matching none of ``configs.non_important`` (default), then hands the
    manifest to ``get_files``.

    Args:
        configs: settings object; must expose ``webpage``, ``sleep_time``
            and ``non_important`` (and optionally ``important``) — TODO
            confirm full shape against the config module.
        save_dir: download directory; created if missing.
        debug: when truthy the intermediate ``2url_name.txt`` is kept and
            ``url_name.txt`` is not replaced.
        delete: when truthy any stale ``url_name.txt`` is removed first.
        important: switch between keep-matching and drop-matching filters.
        try_overwite, name_in_url, add_date, extract_name, no_overwrite:
            pass-through flags for ``extract_info`` / ``get_files``.
    """
    if not os.path.exists(save_dir):
        print(" [*] Making save_dir")
        os.makedirs(save_dir)
    print(" [*] Getting webpage and parsing")
    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")
    if delete:  # fixed non-idiomatic `delete != False`
        try:
            os.remove("url_name.txt")
        except FileNotFoundError:
            pass
    print(" [*] Extracting info.")
    extract_info(soup, configs, extract_name=extract_name, name_in_url=name_in_url)
    if not important:
        print(" [?] important is False, using non_important")
        non_important = configs.non_important
        print(" [*] Opening url_name.txt")
        with open("url_name.txt", "r") as og_file, open(
            "2url_name.txt", "w"
        ) as new_file:
            print(" [*] Adding only important lines to 2url_name.txt")
            for line in og_file:
                # Fixed shadowing: loop variable used to reuse the name
                # `non_important`, hiding the list it iterated.
                if not any(term in line.lower() for term in non_important):
                    new_file.write(line)
            print(" [*] Done writing")
    else:
        print(" [?] important is True, assuming important is configured")
        try:
            important = configs.important
        except AttributeError:
            # Older configs only define non_important; fall back to it.
            print(" [!] Important is still named `non_important`")
            important = configs.non_important
        print(" [*] Opening url_name.txt")
        with open("url_name.txt", "r") as og_file, open(
            "2url_name.txt", "w"
        ) as new_file:
            print(" [*] Adding lines containing: " + str(important))
            for line in og_file:
                # Fixed shadowing: loop variable used to reuse `important`.
                if any(term in line.lower() for term in important):
                    new_file.write(line)
                    print(line)
            print(" [*] Done writing")
    if not debug:  # fixed non-idiomatic `debug != True`
        try:
            os.remove("url_name.txt")
        except FileNotFoundError:
            pass
        os.rename("2url_name.txt", "url_name.txt")
    get_files(
        save_dir,
        configs.sleep_time,
        debug=debug,
        delete=delete,
        try_overwite=try_overwite,
        name_in_url=name_in_url,
        no_overwrite=no_overwrite,
        add_date=add_date,
    )