def enumerate_set_ids(fh, progress_func=lambda *args, **kwargs: 0):
    """Return a list of the distinct set_ids read from a file handle.

    Assumes the input is (member_id, set_id) * N in binary, so the set_ids
    are the integers at the odd positions of each buffer.  This function
    resets the file position to the start before reading.

    Args:
        fh: file handle supporting ``seek`` that ``fill_buffer`` reads from.
        progress_func: callable invoked after every buffer as
            ``progress_func(byte_offset, mb=100)``.  The default is a no-op
            that accepts any arguments.

    Returns:
        A list of the unique set_ids, in arbitrary (set iteration) order.
    """
    # NOTE(review): this function is defined a second time further down in
    # this file; the later definition replaces this one at import time.
    fh.seek(0)
    set_ids = set()
    # Number of integers in a completely filled buffer; a shorter read
    # signals that the end of the file has been reached.
    ints_per_buffer = BUFFERSIZE // SIZEOFINT
    for readbytes in itertools.count(start=0, step=BUFFERSIZE):
        # presumably fill_buffer returns a sliceable sequence of integers
        # -- TODO confirm against its definition
        ints = fill_buffer(fh, BUFFERSIZE)
        # Grab every other integer, skipping the first one.  Slicing also
        # tolerates an odd-length buffer (a trailing unpaired member_id).
        set_ids.update(ints[1::2])
        progress_func(readbytes, mb=100)
        if len(ints) != ints_per_buffer:
            return list(set_ids)
def enumerate_set_ids(fh, progress_func=lambda *args, **kwargs: 0):
    """Return a list of the distinct set_ids read from a file handle.

    Assumes the input is (member_id, set_id) * N in binary, so the set_ids
    are the integers at the odd positions of each buffer.  This function
    resets the file position to the start before reading.

    Args:
        fh: file handle supporting ``seek`` that ``fill_buffer`` reads from.
        progress_func: callable invoked after every buffer as
            ``progress_func(byte_offset, mb=100)``.  The default is a no-op
            that accepts any arguments.

    Returns:
        A list of the unique set_ids, in arbitrary (set iteration) order.
    """
    fh.seek(0)
    set_ids = set()
    # Number of integers in a completely filled buffer; a shorter read
    # signals that the end of the file has been reached.
    ints_per_buffer = BUFFERSIZE // SIZEOFINT
    for readbytes in itertools.count(start=0, step=BUFFERSIZE):
        # presumably fill_buffer returns a sliceable sequence of integers
        # -- TODO confirm against its definition
        ints = fill_buffer(fh, BUFFERSIZE)
        # Grab every other integer, skipping the first one.  Slicing also
        # tolerates an odd-length buffer (a trailing unpaired member_id).
        set_ids.update(ints[1::2])
        progress_func(readbytes, mb=100)
        if len(ints) != ints_per_buffer:
            return list(set_ids)
def extract_membership(set_id_segment, membership_fh):
    """Collect the member_ids recorded for each set_id in a segment.

    The membership file is rewound and scanned from the start, one buffer
    at a time, until a short buffer or an ``EOFError`` ends the scan.

    Args:
        set_id_segment: iterable of the set_ids to collect members for.
        membership_fh: file handle supporting ``seek`` that ``fill_buffer``
            reads from.

    Returns:
        A dict mapping every set_id in the segment to the list of
        member_ids seen for it, in file order (empty list if none).
    """
    # NOTE(review): progress_func is neither a parameter nor a local here,
    # so it has to exist as a module-level name -- confirm that it does.
    set_membership = {set_id: [] for set_id in set_id_segment}
    wanted_ids = set(set_id_segment)

    membership_fh.seek(0)  # reset file

    # Scan the entire data file until EOF is hit.
    try:
        for offset in itertools.count(start=0, step=BUFFERSIZE):
            # presumably in_pairs yields (member_id, set_id) tuples and its
            # result supports len() -- TODO confirm against its definition
            chunk = fill_buffer(membership_fh, BUFFERSIZE)
            pairs = in_pairs(chunk)
            for member_id, set_id in pairs:
                if set_id not in wanted_ids:
                    continue
                set_membership[set_id].append(member_id)
            progress_func(offset, mb=100)
            # Fewer pairs than a full buffer holds means the file is done.
            if len(pairs) != (BUFFERSIZE / SIZEOFINT / 2):
                break
    except EOFError:
        pass
    return set_membership