def update_ip_list(self) -> "UHBPyFuncebleSystemLauncher":
    """
    Updates the content of the :code:`ip.list` file.
    """

    input_file = FileHelper(outputs.IP_SUBJECTS_DESTINATION)
    ip_file = FileHelper(outputs.IP_DESTINATION)

    if input_file.exists():
        logging.info("Started generation of %r.", ip_file.path)

        with input_file.open(
            "r", encoding="utf-8"
        ) as input_file_stream, ip_file.open(
            "w", encoding="utf-8"
        ) as ip_file_stream:
            for line in input_file_stream:
                if not line.strip() or line.startswith("#"):
                    continue

                # Keep every whitespace-separated field after the first one,
                # writing one entry per line.
                ip_file_stream.write("\n".join(line.split()[1:]) + "\n")

            ip_file_stream.write("\n")

        whitelist_core_tool(
            output_file=ip_file.path,
            use_official=True,
            processes=os.cpu_count(),
        ).filter(file=ip_file.path, already_formatted=True, standard_sort=False)

        logging.info("Finished generation of %r.", ip_file.path)

    return self

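# A minimal, standalone sketch of the transformation above, assuming
# hosts-formatted input ("<prefix> <subject> [<subject> ...]"); the sample
# line is hypothetical.
sample_line = "0.0.0.0 192.0.2.1 192.0.2.2\n"

# Drop the first field and keep one entry per line.
print("\n".join(sample_line.split()[1:]))
# -> 192.0.2.1
#    192.0.2.2
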
def update_clean_list(self) -> "UHBPyFuncebleSystemLauncher":
    """
    Updates the content of the :code:`clean.list` file.
    """

    input_file = FileHelper(outputs.ACTIVE_SUBJECTS_DESTINATION)
    clean_file = FileHelper(outputs.CLEAN_DESTINATION)

    if input_file.exists():
        logging.info("Started generation of %r.", clean_file.path)

        with input_file.open(
            "r", encoding="utf-8"
        ) as input_file_stream, clean_file.open(
            "w", encoding="utf-8"
        ) as clean_file_stream:
            for line in input_file_stream:
                line = line.strip()

                if not line or line.startswith("#") or "." not in line:
                    continue

                # Normalize absolute domain notation ("example.com." ->
                # "example.com").
                if line.endswith("."):
                    line = line[:-1]

                clean_file_stream.write(line + "\n")

        logging.info("Finished generation of %r.", clean_file.path)

    return self

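# A minimal standalone sketch of the filtering and normalization applied
# above; the sample lines are hypothetical.
for raw_line in ["# comment", "", "localhost", "example.com.", "example.net"]:
    cleaned = raw_line.strip()

    if not cleaned or cleaned.startswith("#") or "." not in cleaned:
        continue  # skips "# comment", "" and "localhost"

    if cleaned.endswith("."):
        cleaned = cleaned[:-1]  # "example.com." -> "example.com"

    print(cleaned)  # example.com, then example.net
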
def update_volatile_list(self) -> "UHBPyFuncebleSystemLauncher":
    """
    Updates the content of the :code:`volatile.list` file.
    """

    input_file = FileHelper(outputs.TEMP_VOLATIVE_DESTINATION)
    volatile_file = FileHelper(outputs.VOLATILE_DESTINATION)
    clean_file = FileHelper(outputs.CLEAN_DESTINATION)

    logging.info("Started generation of %r.", volatile_file.path)

    with volatile_file.open("w", encoding="utf-8") as volatile_file_stream:
        if clean_file.exists():
            with clean_file.open("r", encoding="utf-8") as clean_file_stream:
                for line in clean_file_stream:
                    line = line.strip()

                    if not line or line.startswith("#") or "." not in line:
                        continue

                    if line.endswith("."):
                        line = line[:-1]

                    volatile_file_stream.write(line + "\n")

        if input_file.exists():
            with input_file.open("r", encoding="utf-8") as input_file_stream:
                for line in input_file_stream:
                    line = line.strip()

                    if not line or line.startswith("#") or "." not in line:
                        continue

                    if line.endswith("."):
                        line = line[:-1]

                    volatile_file_stream.write(line + "\n")

        volatile_file_stream.write("\n")

    whitelist_core_tool(
        output_file=volatile_file.path,
        use_official=True,
        processes=os.cpu_count(),
    ).filter(file=volatile_file.path, already_formatted=True, standard_sort=False)

    logging.info("Finished generation of %r.", volatile_file.path)

    return self

def test_open(self) -> None:
    """
    Tests the method which lets us open the given file as we want.
    """

    file_helper = FileHelper(tempfile.gettempdir())
    file_helper.set_path(file_helper.join_path(secrets.token_hex(8)))

    expected = False
    actual = file_helper.exists()

    self.assertEqual(expected, actual)

    with file_helper.open("w") as file_stream:
        file_stream.write("Hello, World!")

    expected = True
    actual = file_helper.exists()

    self.assertEqual(expected, actual)

    expected = "Hello, World!"
    actual = file_helper.read()

    self.assertEqual(expected, actual)

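# A stdlib-only sketch of what the test above exercises, using pathlib
# instead of PyFunceble's FileHelper; the function name is hypothetical.
import secrets
import tempfile
from pathlib import Path


def demo_open_roundtrip() -> None:
    path = Path(tempfile.gettempdir()) / secrets.token_hex(8)

    assert not path.exists()

    path.write_text("Hello, World!", encoding="utf-8")

    assert path.exists()
    assert path.read_text(encoding="utf-8") == "Hello, World!"

    path.unlink()  # clean up the temporary file
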
def run_end(self):
    """
    Runs the end logic.
    """

    self.info_manager["currently_under_test"] = False

    self.info_manager["latest_part_finish_datetime"] = datetime.utcnow()
    self.info_manager["latest_part_finish_timestamp"] = self.info_manager[
        "latest_part_finish_datetime"
    ].timestamp()

    self.info_manager["finish_datetime"] = self.info_manager[
        "latest_part_finish_datetime"
    ]
    self.info_manager["finish_timestamp"] = self.info_manager[
        "finish_datetime"
    ].timestamp()

    logging.info("Updated all timestamps and indexes that needed to be updated.")

    pyfunceble_active_list = FileHelper(
        os.path.join(
            self.info_manager.WORKSPACE_DIR,
            "output",
            dead_hosts.launcher.defaults.paths.ORIGIN_FILENAME,
            "domains",
            "ACTIVE",
            "list",
        )
    )

    clean_list = [
        "# File generated by the Dead-Hosts project with the help of PyFunceble.",
        "# Dead-Hosts: https://github.com/dead-hosts",
        "# PyFunceble: https://pyfunceble.github.io",
        f"# Generation Time: {datetime.utcnow().isoformat()}",
    ]

    logging.info("PyFunceble ACTIVE list output: %s", pyfunceble_active_list.path)

    if pyfunceble_active_list.exists():
        logging.info(
            "%s exists, getting and formatting its content.",
            pyfunceble_active_list.path,
        )

        self.output_file.write("\n".join(clean_list) + "\n\n", overwrite=True)

        with pyfunceble_active_list.open("r", encoding="utf-8") as file_stream:
            for line in file_stream:
                if line.startswith("#"):
                    continue

                self.output_file.write(line)

        self.output_file.write("\n")

    logging.info("Updated the content of %r.", self.output_file.path)

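# A stdlib-only sketch of the copy step above: write a header block, then
# append every non-comment line from a source file. The function name and
# paths are hypothetical.
def copy_active_subjects(source: str, destination: str, header: str) -> None:
    with open(destination, "w", encoding="utf-8") as destination_stream:
        destination_stream.write(header + "\n\n")

        with open(source, "r", encoding="utf-8") as source_stream:
            for line in source_stream:
                if line.startswith("#"):
                    continue

                destination_stream.write(line)

        destination_stream.write("\n")
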
def migrate(self) -> "MigratorBase": """ Provides the migrator (itself). """ file_helper = FileHelper(self.source_file) if file_helper.exists(): with file_helper.open("r", encoding="utf-8") as file_stream: first_line = next(file_stream) if any(x in first_line for x in self.TO_DELETE): temp_destination = tempfile.NamedTemporaryFile( "a+", newline="", encoding="utf-8", delete=False ) file_handler = file_helper.open(newline="") reader = csv.DictReader(file_handler) writer = csv.DictWriter( temp_destination, fieldnames=[x for x in self.FIELDS if x not in self.TO_DELETE], ) writer.writeheader() keys_found = False for row in reader: row = dict(row) for key in self.TO_DELETE: if key in row: del row[key] keys_found = True if not keys_found: break writer.writerow(row) if self.print_action_to_stdout: print_single_line() temp_destination.seek(0) FileHelper(temp_destination.name).move(self.source_file) self.done = True
def get_csv_writer(self) -> Tuple[csv.DictWriter, TextIO]:
    """
    Provides the standard and initiated CSV Dict writer along with the
    file handler it writes to.
    """

    file_helper = FileHelper(self.source_file)

    add_header = not file_helper.exists()

    file_handler = file_helper.open("a+", newline="")
    writer = csv.DictWriter(file_handler, fieldnames=self.FIELDS)

    if add_header:
        writer.writeheader()

    return writer, file_handler

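# A standalone sketch of the same append-or-create pattern with the stdlib
# csv module; the function name, path and field names are hypothetical.
import csv
import os


def get_append_writer(path: str, fieldnames: list):
    add_header = not os.path.exists(path)

    file_handler = open(path, "a+", newline="", encoding="utf-8")
    writer = csv.DictWriter(file_handler, fieldnames=fieldnames)

    if add_header:
        # Brand new file: write the header before the first row.
        writer.writeheader()

    return writer, file_handler


writer, handler = get_append_writer("dataset.csv", ["subject", "status"])
writer.writerow({"subject": "example.com", "status": "ACTIVE"})
handler.close()
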
def produce_diff(self) -> Tuple[Set[str], Set[str], Set[str]]:
    """
    Produces the difference from the downloaded file.

    :return:
        A tuple of the kept, removed and newly added subjects.
    """

    file_helper = FileHelper(self.final_destination)

    new = set()
    kept = set()
    removed = set()

    if file_helper.exists():
        with file_helper.open("r", encoding="utf-8") as file_stream:
            current_content = set(x.strip() for x in file_stream)
    else:
        current_content = set()

    downloaded_empty = True

    for line in self.download_temp_file:
        if downloaded_empty:
            downloaded_empty = False

        line = line.strip()

        if not line:
            continue

        kept_kept, new_new = self.__get_diff_data(
            current_content, get_subjects_from_line(line, "availability")
        )

        new.update(new_new)
        kept.update(kept_kept)

    if downloaded_empty:
        kept = current_content
    else:
        compare_base = kept.copy()
        compare_base.update(new)

        removed = current_content - compare_base

    self.download_temp_file.seek(0)

    return kept, removed, new

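# A worked example of the kept/removed/new set algebra above, with
# hypothetical subjects.
current_content = {"a.example", "b.example", "c.example"}
downloaded = {"b.example", "c.example", "d.example"}

kept = downloaded & current_content       # {"b.example", "c.example"}
new = downloaded - current_content        # {"d.example"}
removed = current_content - (kept | new)  # {"a.example"}

print(kept, removed, new)
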
def csv_file_delete_source_column_target(
    continuous_integration: ContinuousIntegrationBase,
) -> None:
    """
    Provides the target for the deletion of the source column.
    """

    migrator = InactiveDatasetDeleteSourceColumnMigrator(print_action_to_stdout=True)
    migrator.continuous_integration = continuous_integration

    file_helper = FileHelper(migrator.source_file)

    if file_helper.exists():
        with file_helper.open("r", encoding="utf-8") as file_stream:
            first_line = next(file_stream)

        if any(x in first_line for x in migrator.TO_DELETE):
            print(
                f"{colorama.Fore.MAGENTA}{colorama.Style.BRIGHT}"
                "Started deletion of the 'source' column into "
                f"{migrator.source_file!r}."
            )

            migrator.start()

            if migrator.done:
                print(
                    f"{colorama.Fore.GREEN}{colorama.Style.BRIGHT}"
                    "Finished deletion of the 'source' column into "
                    f"{migrator.source_file!r}."
                )
            else:
                print(
                    f"{colorama.Fore.MAGENTA}{colorama.Style.BRIGHT}"
                    "Unfinished deletion of the 'source' column into "
                    f"{migrator.source_file!r}."
                )
    else:
        PyFunceble.facility.Logger.info(
            "Stopped csv_file_delete_source_column_target. File does not exist."
        )

def get_content(self) -> TextIO:
    """
    Provides a file handler which lets you read the content line by line.

    :raise FileNotFoundError:
        When the declared file does not exist.
    """

    file_helper = FileHelper(self.source_file)

    if not file_helper.exists() and bool(self.DOWNLOADER):  # pragma: no cover
        ## pragma reason: Safety.
        self.DOWNLOADER.start()

    if not file_helper.exists():
        raise FileNotFoundError(file_helper.path)

    return file_helper.open("r", encoding="utf-8")

def get_content(self) -> Generator[Optional[dict], None, None]:
    """
    Provides a generator which yields the next row to read.
    """

    file_helper = FileHelper(self.source_file)

    if file_helper.exists():
        file_handler = file_helper.open(newline="")
        reader = csv.DictReader(file_handler)

        for row in reader:
            if "tested_at" in row:
                try:
                    row["tested_at"] = datetime.fromisoformat(row["tested_at"])
                except (TypeError, ValueError):
                    # Unparsable timestamp: fall back to a date which is old
                    # enough to be retested.
                    row["tested_at"] = datetime.utcnow() - timedelta(days=365)

            yield row

        file_handler.close()

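# A minimal sketch of the timestamp-decoding fallback above; the function
# name and sample values are hypothetical.
from datetime import datetime, timedelta


def decode_tested_at(value) -> datetime:
    try:
        return datetime.fromisoformat(value)
    except (TypeError, ValueError):
        # None or a malformed string: assume it is one year old.
        return datetime.utcnow() - timedelta(days=365)


print(decode_tested_at("2021-03-09T01:02:03"))
print(decode_tested_at(None))
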
def migrate(self) -> "InactiveJSON2CSVMigrator": """ Starts the migration. """ file_helper = FileHelper(self.source_file) if file_helper.exists(): self.dataset.set_authorized(True) dataset = { "idna_subject": None, "status": None, "status_source": None, "checker_type": "AVAILABILITY", "destination": None, "source": None, "tested_at": None, "session_id": None, } delete_file = True with file_helper.open("r", encoding="utf-8") as file_stream: for line in file_stream: if (self.continuous_integration and self.continuous_integration.is_time_exceeded()): delete_file = False break line = (line.strip().replace('"', "").replace(",", "").replace( "{", "", ).replace("}", "")) if ":" not in line: continue index, value = [x.strip() for x in line.rsplit(":", 1)] if not value: if index.isdigit(): dataset[ "tested_at"] = datetime.datetime.fromtimestamp( float(index)).isoformat() else: dataset["source"] = os.path.abspath(index) dataset[ "destination"] = get_destination_from_origin( dataset["source"]) continue dataset["idna_subject"] = domain2idna.domain2idna(index) dataset["status"] = value if not dataset["tested_at"]: dataset["tested_at"] = datetime.datetime.utcnow( ).isoformat() PyFunceble.facility.Logger.debug("Decoded dataset:\n%r.", dataset) self.dataset.update(dataset) if self.print_action_to_stdout: print_single_line() PyFunceble.facility.Logger.info("Added %r into %r", dataset["idna_subject"], self.dataset) if delete_file: file_helper.delete() self.done = True return self
def start(self, print_dots: bool = False) -> "FilePreloader":
    """
    Starts the pre-loading of the currently set file path.
    """

    self.__load_description()

    broken = False
    file_helper = FileHelper(self.protocol["subject"])

    self.__description[self.__matching_index]["hash"] = HashHelper().hash_file(
        file_helper.path
    )

    if isinstance(self.continue_dataset, CSVContinueDataset):
        self.continue_dataset.set_base_directory(self.protocol["output_dir"])

    if (
        self.__description[self.__matching_index]["checker_type"]
        != self.protocol["checker_type"]
        or self.__description[self.__matching_index]["subject_type"]
        != self.protocol["subject_type"]
    ):
        try:
            self.continue_dataset.cleanup()
        except TypeError:
            self.continue_dataset.cleanup(session_id=self.protocol["session_id"])

    if (
        self.__description[self.__matching_index]["previous_hash"]
        and self.__description[self.__matching_index]["hash"]
        != self.__description[self.__matching_index]["previous_hash"]
    ):
        # Forces the reading of each line because there is literally no
        # way to know where something has been changed.
        self.__description[self.__matching_index]["line_number"] = 1

    if (
        self.__description[self.__matching_index]["checker_type"]
        != self.protocol["checker_type"]
        or self.__description[self.__matching_index]["subject_type"]
        != self.protocol["subject_type"]
        or self.__description[self.__matching_index]["hash"]
        != self.__description[self.__matching_index]["previous_hash"]
    ):
        try:
            with file_helper.open("r", encoding="utf-8") as file_stream:
                line_num = 1

                for line in file_stream:
                    if (
                        line_num
                        < self.__description[self.__matching_index]["line_number"]
                    ):
                        line_num += 1
                        continue

                    if (
                        self.continuous_integration
                        and self.continuous_integration.is_time_exceeded()
                    ):
                        broken = True
                        break

                    line = line.strip()

                    if self.rpz_policy2subject and "SOA" in line:
                        self.rpz_policy2subject.set_soa(line.split()[0])

                    for subject in get_subjects_from_line(
                        line,
                        self.checker_type,
                        adblock_inputline2subject=self.adblock_inputline2subject,
                        wildcard2subject=self.wildcard2subject,
                        rpz_policy2subject=self.rpz_policy2subject,
                        rpz_inputline2subject=self.rpz_inputline2subject,
                        inputline2subject=self.inputline2subject,
                        subject2complements=self.subject2complements,
                        url2netloc=self.url2netloc,
                        cidr2subject=self.cidr2subject,
                    ):
                        to_send = copy.deepcopy(self.protocol)
                        to_send["subject"] = subject
                        to_send["idna_subject"] = domain2idna(subject)
                        # Date the entry far in the past so that it is always
                        # seen as (re)testable.
                        to_send["tested_at"] = datetime.utcnow() - timedelta(
                            days=365.25 * 20
                        )

                        if self.inactive_dataset.exists(to_send):
                            print_single_line("I")
                            continue

                        if TesterWorker.should_be_ignored(
                            subject=to_send["idna_subject"]
                        ):
                            print_single_line("X")
                            continue

                        self.continue_dataset.update(to_send, ignore_if_exist=True)

                        if print_dots:
                            print_single_line()

                    self.__description[self.__matching_index]["line_number"] += 1
                    line_num += 1
        except KeyboardInterrupt as exception:
            self.__save_description()

            raise exception

    if not broken:
        self.__description[self.__matching_index]["previous_hash"] = self.__description[
            self.__matching_index
        ]["hash"]

    self.__save_description()

    return self

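# A minimal standalone sketch of the resume strategy above: remember a file
# hash and a line offset, and restart from line 1 whenever the hash changed.
# The helper names are hypothetical.
import hashlib


def hash_file(path: str) -> str:
    digest = hashlib.sha256()

    with open(path, "rb") as file_stream:
        for chunk in iter(lambda: file_stream.read(1024 * 1024), b""):
            digest.update(chunk)

    return digest.hexdigest()


def resume_line_number(path: str, state: dict) -> int:
    current_hash = hash_file(path)

    if state.get("previous_hash") != current_hash:
        # The file changed: there is no way to know where, so restart.
        state["line_number"] = 1

    state["previous_hash"] = current_hash

    return state["line_number"]
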
def migrate(self) -> "WhoisJSON2CSVMigrator": """ Provides the migration logic. """ file_helper = FileHelper(self.source_file) if file_helper.exists(): self.dataset.set_authorized(True) dataset = { "subject": None, "idna_subject": None, "expiration_date": None, "epoch": None, } delete_file = True with file_helper.open("r", encoding="utf-8") as file_stream: for line in file_stream: if (self.continuous_integration and self.continuous_integration.is_time_exceeded()): delete_file = False break line = (line.strip().replace('"', "").replace(",", "").replace( "{", "", ).replace("}", "")) if ":" not in line: continue index, value = [x.strip() for x in line.split(":")] if not value: dataset["subject"], dataset["idna_subject"] = ( index, domain2idna.domain2idna(index), ) continue if index == "epoch": dataset["epoch"] = float(value) elif index == "expiration_date": dataset["expiration_date"] = value elif index == "state": PyFunceble.facility.Logger.debug( "Decoded dataset:\n%r.", dataset) self.dataset.update(dataset) if self.print_action_to_stdout: print_single_line() PyFunceble.facility.Logger.info( "Added %r into %r", dataset["idna_subject"], self.dataset) if delete_file: file_helper.delete() self.done = True return self
def process_file_sorting(
    cls,
    file: str,
    remove_duplicates: bool = True,
    write_header: bool = True,
    sorting_key: Any = None,
) -> None:
    """
    Processes the sorting of the given file.

    The idea is to split the file into small pieces and, at the end, join
    all sorted pieces. For that job, we create a temporary directory which
    stores the temporary files.

    :param file:
        The file to sort.
    :param remove_duplicates:
        Activates the deletion of duplicates.
    :param write_header:
        Activates the writing of the PyFunceble related header.

        .. warning::
            When this is set to :py:class:`True`, we assume that the header
            itself was already given. Meaning that the first 2 commented
            lines will be excluded from the sorting and regenerated.
    :param sorting_key:
        The sorting key to apply while sorting.

        This is the lambda/function that goes into the :code:`key` argument
        of the :py:class:`sorted` function.
    """

    # pylint: disable=too-many-locals,too-many-statements

    def merge_files(
        files: List[TextIOWrapper],
    ) -> Generator[str, None, None]:
        """
        Merges the given files and yields each "line" of the merged file.

        :param files:
            The files to merge.
        """

        result = []

        for index, file in enumerate(files):
            try:
                iterator = iter(file)
                value = next(iterator)

                heapq.heappush(
                    result, (sorting_key(value), index, value, iterator, file)
                )
            except StopIteration:
                file.close()

        previous = None
        comment_count = 0
        max_comment_count = 2

        while result:
            ignore = False

            _, index, value, iterator, file = heapq.heappop(result)

            if remove_duplicates and value == previous:
                ignore = True

            if write_header and comment_count < max_comment_count and value[0] == "#":
                ignore = True
                comment_count += 1

            if not ignore:
                yield value
                previous = value

            try:
                value = next(iterator)

                heapq.heappush(
                    result, (sorting_key(value), index, value, iterator, file)
                )
            except StopIteration:
                file.close()

    temp_directory = tempfile.TemporaryDirectory()
    temporary_output_file = os.path.join(temp_directory.name, secrets.token_hex(6))

    if not sorting_key:
        sorting_key = get_best_sorting_key()

    file_helper = FileHelper(file)
    sorted_files = []

    PyFunceble.facility.Logger.info("Started sort of %r.", file)

    with file_helper.open(
        "r", encoding="utf-8", buffering=cls.FILE_BUFFER_SIZE
    ) as file_stream:
        while True:
            to_sort = list(islice(file_stream, cls.MAX_LINES))

            if not to_sort:
                break

            new_file = open(
                os.path.join(temp_directory.name, secrets.token_hex(6)),
                "w+",
                encoding="utf-8",
                buffering=cls.FILE_BUFFER_SIZE,
            )
            new_file.writelines(
                ListHelper(to_sort)
                .remove_duplicates()
                .custom_sort(key_method=sorting_key)
                .subject
            )
            new_file.flush()
            new_file.seek(0)

            sorted_files.append(new_file)

    with open(
        temporary_output_file, "w", cls.FILE_BUFFER_SIZE, encoding="utf-8"
    ) as file_stream:
        if write_header:
            file_stream.write(FilePrinter.STD_FILE_GENERATION)
            file_stream.write(FilePrinter.get_generation_date_line())
            file_stream.write("\n\n")

        file_stream.writelines(merge_files(sorted_files))

    FileHelper(temporary_output_file).move(file)

    PyFunceble.facility.Logger.info("Finished sort of %r.", file)

    temp_directory.cleanup()

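# A standalone sketch of the same external-sort technique: sort fixed-size
# chunks into temporary files, then k-way merge them with heapq.merge. The
# function name and chunk size are hypothetical, and duplicates are only
# removed within each chunk in this sketch.
import heapq
import os
import secrets
import shutil
import tempfile
from itertools import islice


def external_sort(path: str, max_lines: int = 100_000) -> None:
    temp_dir = tempfile.TemporaryDirectory()
    chunks = []

    with open(path, "r", encoding="utf-8") as source:
        while True:
            block = sorted(set(islice(source, max_lines)))

            if not block:
                break

            chunk = open(
                os.path.join(temp_dir.name, secrets.token_hex(6)),
                "w+",
                encoding="utf-8",
            )
            chunk.writelines(block)
            chunk.seek(0)
            chunks.append(chunk)

    output = os.path.join(temp_dir.name, secrets.token_hex(6))

    with open(output, "w", encoding="utf-8") as destination:
        # heapq.merge lazily merges the already-sorted chunks.
        destination.writelines(heapq.merge(*chunks))

    for chunk in chunks:
        chunk.close()

    shutil.move(output, path)
    temp_dir.cleanup()
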