Example #1
from glob import glob
from gzip import GzipFile
from hashlib import md5
from multiprocessing import Pool
from os.path import basename, join, splitext
import tarfile

# UpdatesIndex, PassiveUpdate, SessionContextManager, GlobalContext,
# process_session_wrapper and return_negative_one come from the
# surrounding project and are assumed to be imported alongside these.


def process_sessions_real(coordinators,
                          updates_directory,
                          index_filename,
                          pickle_root,
                          result_pickle_root,
                          num_workers=None):
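    # num_workers == 0 runs everything serially (see below); None lets
    # Pool default to one worker per CPU.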
    if num_workers != 0:
        pool = Pool(processes=num_workers)

    # Every session tracks which update files it has already consumed and
    # the last sequence number seen, plus any state the coordinators declare.
    session_context_manager = SessionContextManager()
    session_context_manager.declare_persistent_state(
            'filenames_processed', set, None)
    session_context_manager.declare_persistent_state(
            'last_sequence_number_processed', return_negative_one, None)
    for coordinator in coordinators:
        for name, (init_func, merge_func) \
                in coordinator.persistent_state.iteritems():
            session_context_manager.declare_persistent_state(
                    name, init_func, merge_func)
        for name, (init_func, merge_func) \
                in coordinator.ephemeral_state.iteritems():
            session_context_manager.declare_ephemeral_state(
                    name, init_func, merge_func)

    print 'Preparing processors'
    process_args = []
    index = UpdatesIndex(index_filename)
    for session in index.sessions:
        processors = []
        for coordinator in coordinators:
            processors.append(coordinator.create_processor(session))
        update_files = index.session_data(session)
        process_args.append((session,
                             session_context_manager,
                             pickle_root,
                             result_pickle_root,
                             processors,
                             update_files,
                             updates_directory))

    print 'Processing sessions'
    global_context = GlobalContext()
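    # Serial path: process each session in this process and fold its
    # pickled context into the global context as we go.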
    if num_workers == 0:
        for args in process_args:
            pickle_path = process_session_wrapper(args)
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(session_context, global_context)
            del session_context
    else:
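        # imap_unordered yields each session's pickled context as soon as
        # a worker finishes it, so merging overlaps with processing.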
        results = pool.imap_unordered(process_session_wrapper, process_args)
        for pickle_path in results:
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(session_context, global_context)
            del session_context
        pool.close()
        pool.join()

    print 'Post-processing'
    for coordinator in coordinators:
        coordinator.finished_processing(global_context)
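
From the calls above, each coordinator must expose persistent_state and
ephemeral_state dicts mapping a state name to an (init_func, merge_func)
pair, plus create_processor(session) and finished_processing(global_context).
A minimal sketch of that interface, with hypothetical class names, might
look like:

# Hypothetical coordinator satisfying the interface consumed by
# process_sessions_real; the real project classes will differ.
class ByteCountProcessor(object):
    # Per-session processor; its exact interface is defined by
    # process_session_wrapper, which is not shown here.
    pass


class ByteCountCoordinator(object):
    def __init__(self):
        # name -> (init_func, merge_func); merge_func may be None for
        # state that is read back but never combined across sessions.
        self.persistent_state = {
            'total_bytes': (int, lambda a, b: a + b),
        }
        self.ephemeral_state = {}

    def create_processor(self, session):
        # One processor instance per session.
        return ByteCountProcessor()

    def finished_processing(self, global_context):
        # Inspect the merged state here; how merged state is exposed on
        # the context object is project-specific.
        pass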


def index_traces(updates_directory, index_filename):
    index = UpdatesIndex(index_filename)
    tarnames_processed = index.tarnames
    for tarname in glob(join(updates_directory, '*.tar')):
        if basename(tarname) not in tarnames_processed:
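            # Tarball names are expected to embed an MD5 checksum as the
            # fourth '_'-separated field; names without one are indexed
            # unverified.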
            try:
                true_sum = splitext(basename(tarname))[0].split('_')[3]
            except IndexError:
                true_sum = None
            if true_sum is not None:
                hasher = md5()
                # Read in binary mode and close the file promptly.
                with open(tarname, 'rb') as tar_handle:
                    hasher.update(tar_handle.read())
                if hasher.hexdigest() != true_sum:
                    print 'skipping', tarname, '(invalid hash)'
                    continue

            tarball = tarfile.open(tarname, 'r')
            for tarmember in tarball.getmembers():
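                # Each member is a gzipped update; parse only the headers
                # needed for indexing.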
                tarhandle = tarball.extractfile(tarmember.name)
                update_content = GzipFile(fileobj=tarhandle).read()
                update = PassiveUpdate(update_content, onlyheaders=True)
                if update.anonymized:
                    signature = update.anonymization_signature
                else:
                    signature = 'unanonymized'
                index.index(basename(tarname),
                            tarmember.name,
                            update.bismark_id,
                            signature,
                            update.creation_time,
                            update.sequence_number,
                            len(update_content))
            tarball.close()
    index.finalize_indexing()
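
The two functions run in sequence: index_traces builds the index of update
files that process_sessions_real then consumes. A minimal driver, reusing
the ByteCountCoordinator sketch above and with hypothetical paths, could be:

# All paths here are hypothetical placeholders.
if __name__ == '__main__':
    updates_directory = '/data/updates'
    index_filename = '/data/updates.index'
    index_traces(updates_directory, index_filename)
    process_sessions_real(coordinators=[ByteCountCoordinator()],
                          updates_directory=updates_directory,
                          index_filename=index_filename,
                          pickle_root='/data/pickles',
                          result_pickle_root='/data/results',
                          num_workers=0)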