import tarfile
from glob import glob
from gzip import GzipFile
from hashlib import md5
from multiprocessing import Pool
from os.path import basename, join, splitext

# SessionContextManager, GlobalContext, UpdatesIndex, PassiveUpdate,
# process_session_wrapper and return_negative_one are defined elsewhere
# in this codebase.


def process_sessions_real(coordinators,
                          updates_directory,
                          index_filename,
                          pickle_root,
                          result_pickle_root,
                          num_workers=None):
    # num_workers == 0 means process sessions serially in this process;
    # any other value (including None, which lets multiprocessing pick
    # one worker per CPU) delegates to a worker pool.
    if num_workers != 0:
        pool = Pool(processes=num_workers)

    # Declare the state every session context carries: first the
    # bookkeeping state, then whatever each coordinator asks for.
    session_context_manager = SessionContextManager()
    session_context_manager.declare_persistent_state(
            'filenames_processed', set, None)
    session_context_manager.declare_persistent_state(
            'last_sequence_number_processed', return_negative_one, None)
    for coordinator in coordinators:
        for name, (init_func, merge_func) \
                in coordinator.persistent_state.iteritems():
            session_context_manager.declare_persistent_state(
                    name, init_func, merge_func)
        for name, (init_func, merge_func) \
                in coordinator.ephemeral_state.iteritems():
            session_context_manager.declare_ephemeral_state(
                    name, init_func, merge_func)

    print 'Preparing processors'
    process_args = []
    index = UpdatesIndex(index_filename)
    for session in index.sessions:
        processors = []
        for coordinator in coordinators:
            processors.append(coordinator.create_processor(session))
        update_files = index.session_data(session)
        process_args.append((session,
                             session_context_manager,
                             pickle_root,
                             result_pickle_root,
                             processors,
                             update_files,
                             updates_directory))

    print 'Processing sessions'
    global_context = GlobalContext()
    if num_workers == 0:
        for args in process_args:
            pickle_path = process_session_wrapper(args)
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(
                    session_context, global_context)
            del session_context
    else:
        # Workers pickle each session context to disk and return the
        # pickle's path; merge contexts as results arrive, in any order.
        results = pool.imap_unordered(process_session_wrapper, process_args)
        for pickle_path in results:
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(
                    session_context, global_context)
            del session_context
        pool.close()
        pool.join()

    print 'Post-processing'
    for coordinator in coordinators:
        coordinator.finished_processing(global_context)
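
# A minimal sketch of the coordinator interface process_sessions_real
# consumes: `persistent_state` and `ephemeral_state` map state names to
# (initializer, merge function) pairs, `create_processor` yields a
# per-session processor, and `finished_processing` runs after every
# session context has been merged into the global context. The class
# names, the byte-counting behavior, and the `process_update` hook on
# the processor are hypothetical, for illustration only.
class ByteCountProcessor(object):
    def __init__(self, session):
        self.session = session

    def process_update(self, context, update):
        # Hypothetical hook: accumulate into the session's context.
        context.total_bytes += len(update)


class ByteCountCoordinator(object):
    persistent_state = {'total_bytes': (int, lambda x, y: x + y)}
    ephemeral_state = {}

    def create_processor(self, session):
        return ByteCountProcessor(session)

    def finished_processing(self, global_context):
        # Read the merged state back off the global context.
        print 'total bytes:', global_context.total_bytes
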
def index_traces(updates_directory, index_filename):
    index = UpdatesIndex(index_filename)
    tarnames_processed = index.tarnames
    for tarname in glob(join(updates_directory, '*.tar')):
        if basename(tarname) in tarnames_processed:
            continue
        # Tarball names may embed an md5 digest as the fourth
        # underscore-separated field; when present, verify it and skip
        # corrupt files before indexing.
        try:
            true_sum = splitext(basename(tarname))[0].split('_')[3]
        except IndexError:
            true_sum = None
        if true_sum is not None:
            hasher = md5()
            hasher.update(open(tarname, 'rb').read())  # hash raw bytes
            if hasher.hexdigest() != true_sum:
                print 'skipping', tarname, '(invalid hash)'
                continue
        tarball = tarfile.open(tarname, 'r')
        for tarmember in tarball.getmembers():
            tarhandle = tarball.extractfile(tarmember.name)
            update_content = GzipFile(fileobj=tarhandle).read()
            # Only the headers are needed to index an update.
            update = PassiveUpdate(update_content, onlyheaders=True)
            if update.anonymized:
                signature = update.anonymization_signature
            else:
                signature = 'unanonymized'
            index.index(basename(tarname),
                        tarmember.name,
                        update.bismark_id,
                        signature,
                        update.creation_time,
                        update.sequence_number,
                        len(update_content))
    index.finalize_indexing()
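
# Illustrative driver tying the two functions together. The paths, the
# worker count, and the single-coordinator list are assumptions, not
# values from the original code.
if __name__ == '__main__':
    updates_directory = '/data/updates'
    index_filename = '/data/updates-index.sqlite'

    # First index any new update tarballs, then process every session
    # recorded in the index across four worker processes.
    index_traces(updates_directory, index_filename)
    process_sessions_real([ByteCountCoordinator()],
                          updates_directory,
                          index_filename,
                          '/data/pickles',
                          '/data/result-pickles',
                          num_workers=4)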