def update_processed_count(self, n):
    """Increment the count of files processed in the state directory.

    Reads the current count from state/processed.txt under self.path, adds
    n to it, stores the result on self.files_processed and writes the new
    count back to the same file.

    Raises whatever open() raises when the file cannot be read, and
    ValueError when its content is not an integer.
    """
    processed_filename = os.path.join(self.path, 'state', 'processed.txt')
    # Use a context manager so the handle is closed right after reading,
    # before the same file is rewritten below.
    with open(processed_filename) as fh:
        files_processed = int(fh.read().strip())
    new_count = files_processed + n
    self.files_processed = new_count
    create_file(processed_filename, str(new_count))
def update_state(self, limit, t1):
    """Update the content of state/processed.txt and
    state/processing-history.txt.

    The current value of self.files_processed is written to processed.txt,
    and one tab-separated record is appended to processing-history.txt with
    the stage name, the limit, a timestamp, the result of get_git_commit()
    and the elapsed time.

    limit -- number written to the history record with a %d format
    t1    -- start time as returned by time.time(); the elapsed time is
             computed relative to it
    """
    # TODO: should not just print the files processed in the history, but
    # also the range of files.
    time_elapsed = time.time() - t1
    processed = "%d\n" % self.files_processed
    create_file(os.path.join(self.path, 'state', 'processed.txt'), processed)
    history_file = os.path.join(self.path, 'state', 'processing-history.txt')
    record = "%s\t%d\t%s\t%s\t%s\n" % (self.stage_name, limit,
                                       time.strftime("%Y:%m:%d-%H:%M:%S"),
                                       get_git_commit(), time_elapsed)
    # Close the handle explicitly (via the context manager) so the record
    # is flushed to disk immediately instead of whenever the file object
    # happens to be garbage-collected.
    with open(history_file, 'a') as fh:
        fh.write(record)
def initialize_on_disk(self):
    """Build the on-disk skeleton below self.path.

    Only the top-level directory (something like data/patents/en/d1_txt) is
    assumed to exist. This method adds the config, state and files
    subdirectories, seeds the state files (a processed count of zero and a
    processing history file) and writes the pipeline head and trace into
    the config directory. It finishes by resetting self.files_processed
    to 0.
    """
    root = self.path
    state_dir = os.path.join(root, 'state')
    config_dir = os.path.join(root, 'config')

    # Directory layout first; the files below live inside these.
    for name in ('config', 'state', 'files'):
        ensure_path(os.path.join(root, name))

    # Initial state: nothing processed yet; the history file is created
    # without any content argument.
    create_file(os.path.join(state_dir, 'processed.txt'), "0\n")
    create_file(os.path.join(state_dir, 'processing-history.txt'))

    # split_pipeline() yields the trace and the head component, in that
    # order; the head is wrapped in a list before being rendered.
    pipeline_trace, pipeline_head = self.split_pipeline()
    trace_text = pipeline_component_as_string(pipeline_trace)
    head_text = pipeline_component_as_string([pipeline_head])
    create_file(os.path.join(config_dir, 'pipeline-head.txt'), head_text)
    create_file(os.path.join(config_dir, 'pipeline-trace.txt'), trace_text)

    self.files_processed = 0