Example #1
# Standard-library imports used by this example. Project-specific modules
# (Config, Status, Storage, Hash_Json, Events_Log, Events_Log_Elastic,
# Report_Elastic, Analysis_Elastic, Metadata_Service, File_Processing,
# FileStatus, the log_* helpers, create_folder, folder_exists, etc.)
# are assumed to be importable from the host project.
import asyncio
import os
import shutil
import subprocess
from datetime import datetime
from multiprocessing.pool import ThreadPool


class Loops(object):

    # Class-level state shared by every Loops instance, so other requests can
    # inspect or stop a processing run that is already underway.
    continue_processing = False
    processing_started = False
    lock = asyncio.Lock()               # allows only one processing loop at a time

    def __init__(self):
        self.use_es = False
        self.config = Config()
        self.status = Status()
        self.storage = Storage()
        self.hash_json = Hash_Json()
        self.events = Events_Log(self.config.hd2_status_location)
        self.events_elastic = Events_Log_Elastic()
        self.hash = None
        self.report_elastic = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
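        # Best-effort lookup of the plugin's git commit; falls back to
        # 'Not available' when git (or the repository) cannot be queried.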
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(
                ['git', 'rev-parse', 'HEAD']).decode("utf-8").rstrip()
        except Exception:                       # keep the fallback value if git is unavailable
            pass

        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash,
                                     endpoint_index):
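        # Rebuilds a single hash folder through the endpoint selected by
        # endpoint_index: records the plugin version and git commit in the
        # folder's metadata, then marks the outcome (COMPLETED or FAILED) in
        # the metadata, the hash JSON index and the events log, returning
        # True on success and False otherwise.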

        if not os.path.isdir(itempath):
            return False

        log_info(message=f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}")
        meta_service = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][
            endpoint_index]['IP'] + ":" + self.config.endpoints['Endpoints'][
                endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic,
                                              self.report_elastic,
                                              self.analysis_elastic,
                                              meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {
                'file': original_file_path,
                'status': FileStatus.COMPLETED,
                'error': 'none',
                'timestamp': datetime.now(),
            }
            log_info('ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True
        except Exception as error:
            log_data = {
                'file': original_file_path,
                'status': FileStatus.FAILED,
                'error': str(error),
            }
            log_error(message='error in ProcessDirectoryWithEndpoint',
                      data=log_data)
            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
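        # Worker entry point for the thread pool: unpacks the work tuple,
        # picks an endpoint round-robin from the configured endpoints and,
        # on success, records how long the rebuild of this folder took.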
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count
        if not Loops.continue_processing:
            return False
        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(
            itempath, file_hash, endpoint_index)

        if process_result:
            self.status.add_completed()

            tok = datetime.now()
            delta = tok - tik

            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath,
                                                  delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removing retries from this method (it should not be handled like this)
        #for idx in range(self.config.endpoints_count):
        #    if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #        return
        #    # The Endpoint failed to process the file
        #    # Retry it with the next one
        #    endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
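        # Rebuilds the hash JSON index: scans every hash folder under hd2_data
        # and re-queues any file whose rebuild status is not COMPLETED, then
        # saves the index and resets the processing counters.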
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):

            metadata_folder = self.storage.hd2_data(hash_folder)

            if not os.path.isdir(metadata_folder):
                continue

            metadata = meta_service.get_from_file(metadata_folder)
            file_name = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
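        # Moves COMPLETED hash folders into hd2_processed; FAILED folders are
        # moved into hd2_not_processed only for the known case of .xml/.json
        # files whose engine response could not be decoded.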
        json_list = self.hash_json.data()

        for key in json_list:

            source_path = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                destination_path = self.storage.hd2_processed(key)

                if folder_exists(destination_path):
                    folder_delete_all(destination_path)

                shutil.move(source_path, destination_path)

            if (FileStatus.FAILED == json_list[key]["file_status"]):

                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                    metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)

                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)

                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):
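        # Core processing pass: validates the arguments, refreshes the hash
        # JSON index, builds the thread_data work list and fans it out over a
        # ThreadPool of thread_count workers, then moves processed folders
        # out of hd2_data.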

        if not folder_exists(self.storage.hd2_data()):
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data()
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be an integer")

        if not isinstance(do_single, bool):
            raise TypeError("do_single must be a boolean")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        threads = list()

        process_index = 0

        log_info(message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key

            itempath = self.storage.hd2_data(key)
            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                self.events.add_log(
                    "The file processing has already been completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(
                    f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((
                itempath,
                file_hash,
                process_index,
            ))
            # # limit the number of parallel threads
            #
            # if process_index % int(thread_count) == 0:                      # todo: refactor this workflow to use multiprocess and queues
            #     # Clean up the threads
            #     for index, thread in enumerate(threads):                    # todo: since at the moment this will block allocating new threads until
            #         thread.join()                                           #       all have finishing execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break

        # for index, thread in enumerate(threads):
        #     thread.join()

        log_info(message=f'after mapped thread_data, there are {len(thread_data)} mapped items')
        #thread_data = thread_data[:500]
        #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')
        pool = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
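        # Serialises processing runs behind the class-level asyncio lock and,
        # whatever happens, clears the processing flag, releases the lock,
        # marks the status as stopped and persists the hash JSON index in the
        # finally block.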
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started = True
            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                message=
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
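
# A minimal usage sketch, assuming the surrounding project supplies the
# Config/Storage wiring (hd2_* folder locations, endpoints and thread_count):
#
#     loops = Loops()
#     loops.LoopHashDirectories()       # process everything pending under hd2_data
#     loops.StopProcessing()            # another caller can stop a run early
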
class test_Hash_Json(TestCase):

    test_file = None

    @classmethod
    def setUpClass(cls) -> None:
        cls.test_file      = temp_file(contents='Static text so that we have a static hash')
        cls.test_file_name = file_name(cls.test_file)
        cls.test_file_hash = '500286533bf75d769e9180a19414d1c3502dd52093e7351a0a9b1385d8f8961c'

    @classmethod
    def tearDownClass(cls) -> None:
        file_delete(cls.test_file)

    def setUp(self) -> None:
        self.hash_json = Hash_Json()
        self.storage   = self.hash_json.storage

    def test___init__(self):
        assert abspath(self.hash_json.folder()) == self.storage.hd2_status()

    @patch("multiprocessing.queues.Queue.put_nowait")
    def test_add_file(self, patch_log_error):
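        # Queue.put_nowait is patched because (presumably) log_error pushes its
        # payload onto a logging queue; the mock therefore captures the error
        # records that add_file emits for bad input (asserted at the end).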
        hash_data = self.hash_json.load()
        if hash_data.get(self.test_file_hash):
            del hash_data[self.test_file_hash]

        assert self.hash_json.add_file(self.test_file_hash, self.test_file_name) is True
        self.hash_json.save()
        assert hash_data.get(self.test_file_hash) == {'file_name': self.test_file_name, 'file_status': 'Initial'}

        assert self.hash_json.add_file('AAAA'              , self.test_file_name) is False
        assert self.hash_json.add_file(self.test_file_hash , None               ) is False
        assert self.hash_json.add_file(None                , None               ) is False

        assert patch_log_error.mock_calls == [call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': 'AAAA'             , 'file_name': self.test_file_name}, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
                                              call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': self.test_file_hash, 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
                                              call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': None               , 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'})]



    def test_get_file_path(self):
        file_path = abspath(self.hash_json.get_file_path())
        assert file_exists(file_path)
        assert file_path == path_combine(self.storage.hd2_status(), Hash_Json.HASH_FILE_NAME)

    def test_load(self):
        data = self.hash_json.load()
        assert type(data) is dict
        assert self.hash_json.data() == data

    def test_data(self):
        assert self.hash_json.data() == self.hash_json._hash_json_data

    def test_is_hash(self):
        test_file   = temp_file(contents='aaaa')
        file_hash   = Metadata_Utils().file_hash(test_file)                         # create hash from file
        text_hash   = str_sha256('asd')                                             # create hash from string

        assert self.hash_json.is_hash(file_hash         ) is True                   # confirm both are valid hashes
        assert self.hash_json.is_hash(text_hash         ) is True

        assert self.hash_json.is_hash(None              ) is False                  # testing all sorts of corner cases
        assert self.hash_json.is_hash(''                ) is False                  # empty strings
        assert self.hash_json.is_hash('aaaa'            ) is False                  # non hash string
        assert self.hash_json.is_hash(file_hash + 'aaaa') is False                  # confirm only exact matches work
        assert self.hash_json.is_hash(text_hash + 'aaaa') is False
        assert self.hash_json.is_hash('aaa' + file_hash ) is False
        assert self.hash_json.is_hash(text_hash + '\nb`') is False                  # confirm content in new lines is also not a match
        assert self.hash_json.is_hash('a\n' + file_hash ) is False

        file_delete(test_file)

    def test_save(self):
        target_file = temp_file()                                                   # temp file to save data
        assert file_not_exists(target_file)                                         # confirm it doesn't exist
        with patch.object(Hash_Json, 'get_file_path', return_value=target_file):    # patch get_file_path to return temp file path
            assert self.hash_json.get_file_path() == target_file                    # confirm patch is in place
            self.hash_json.save()                                                    # call save
            assert file_exists(target_file)                                         # confirm temp file now exists
            assert self.hash_json.load() == self.hash_json.data()                     # confirm reloaded data is correct
            assert json_load_file(target_file)    == self.hash_json.data()            # also confirm using direct json load of temp file
        assert self.hash_json.get_file_path()     != target_file                    # confirm patch is not there (after 'with' ends)
        file_delete(target_file)                                                    # delete temp file

    def test_update_status(self):
        temp_data_file = temp_file()
        with patch.object(Hash_Json, 'get_file_path', return_value=temp_data_file):
            self.hash_json.add_file(self.test_file_hash, self.test_file_name)
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'Initial'
            self.hash_json.update_status(self.test_file_hash, 'BBBB')
            self.hash_json.save()
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'BBBB'
            assert json_load_file(temp_data_file)[self.test_file_hash]['file_status'] == 'BBBB'
        pprint(self.hash_json.load())

    def test_data_bug(self):                                            # this test confirms the bug
        hashes = self.hash_json.data()
        for hash in self.hash_json.data():
            if len(hash) == 64:                                         # all keys in this object should be a hash
                assert len(hash) == 64
                assert type(hashes[hash]) == dict                       # with all items being a dictionary
                assert list_set(hashes[hash]) == ['file_name', 'file_status']
            else:
                assert hash == "file_list"                              # but the old schema is still present
                assert type(hashes[hash]) == list                       # with the data being a list
                assert list_set(hashes[hash][0]) == ['file_name', 'file_status', 'hash', 'id']