import os

# Note: this module uses Python 2 idioms (dict.iteritems(), generator.next()).
# GPFS_STORAGE, localnode, memcached and IndexEntry are project globals defined elsewhere.


def build_master_index(cache=False):
    """Merge the per-node index files into one in-memory master index."""
    master_index = dict()
    size = IndexEntry.size()
    for node in localnode.nodes():
        # print("Processing %s_index" % node)
        node_index_file = os.path.join(GPFS_STORAGE, "%s_index" % node)
        # each index file holds fixed-size binary records, so read it in
        # binary mode, one packed entry at a time
        with open(node_index_file, 'rb') as f:
            while True:
                chunk = f.read(size)
                if not chunk:
                    break
                index_entry = IndexEntry.unpack(chunk)
                strid = str(index_entry.id)
                if strid not in master_index:
                    master_index[strid] = []
                index_content = (index_entry.index, index_entry.offset,
                                 index_entry.chunk_size)
                master_index[strid].append(index_content)
    if cache:
        print("putting it to memcache")
        for key in master_index:
            memcached.set(key, master_index[key])
        print("done")
    return master_index
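
# build_master_index() treats each on-disk entry as a fixed-size binary record
# with id, index, offset and chunk_size fields. The class below is only a
# minimal struct-based sketch of what such an IndexEntry could look like; the
# field widths and byte order in _FMT are assumptions, not the project's
# actual layout.
import struct


class IndexEntry(object):
    # assumed layout: 64-bit id, 32-bit node index, 64-bit offset, 32-bit chunk size
    _FMT = ">QIQI"

    def __init__(self, id, index, offset, chunk_size):
        self.id = id
        self.index = index
        self.offset = offset
        self.chunk_size = chunk_size

    @classmethod
    def size(cls):
        # number of bytes one packed entry occupies in a node index file
        return struct.calcsize(cls._FMT)

    def pack(self):
        return struct.pack(self._FMT, self.id, self.index,
                           self.offset, self.chunk_size)

    @classmethod
    def unpack(cls, raw):
        return cls(*struct.unpack(cls._FMT, raw))
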
def process():
    """Process this node's share of the master index and rewrite the data into
    sequential output files, returning the new per-word index."""
    print("Starting up...")
    offset = localnode.index_offset
    nnodes = len(localnode.nodes())
    words_index = build_master_index()
    n = -1
    new_index = dict()
    print("Processing hash by hash...")
    # gen_files() yields (open file, file name) pairs; a full file is handed
    # back via send() so it can be closed and replaced with a fresh one
    it = gen_files()
    outfile, outfile_name = it.next()
    for hash32 in words_index:
        n += 1
        # round-robin partitioning: this node only handles its own share of the hashes
        if n % nnodes != offset:
            continue
        data = load_hash32(hash32, words_index)
        if len(data) == 0:
            continue
        # hex decimal
        for word, content in data.iteritems():
            print("writing data for: %s" % word)
            start_pos, end_pos, has_space = write_data_main(outfile, word, content)
            new_index[word] = {
                "file": outfile_name,
                "start": start_pos,
                "chunk_size": end_pos - start_pos,
            }
            print("Bytes: %d" % end_pos)
            if not has_space:
                print("%s is full" % outfile_name)
                outfile, outfile_name = it.send(outfile)
                print("moving on to %s" % outfile_name)
    # hand the last open file back so the generator can close it
    it.send(outfile)
    return new_index
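
# process() relies on gen_files() being a coroutine-style generator: it yields
# (open file, file name) pairs and, when the caller send()s a full file back,
# closes it and yields the next one. The sketch below only illustrates that
# assumed protocol; the real gen_files() and its file-naming scheme live
# elsewhere in the project (the "data_%d.out" pattern here is made up).
def gen_files():
    i = 0
    while True:
        outfile_name = os.path.join(GPFS_STORAGE, "data_%d.out" % i)
        outfile = open(outfile_name, 'wb')
        # pause until the caller send()s back a file, either because it is
        # full or because processing is finished
        full = yield outfile, outfile_name
        if full is not None:
            full.close()
        i += 1

# Under this sketch, the final it.send(outfile) in process() closes the last
# file but also opens one more, unused output file; the real implementation
# presumably avoids or tolerates that.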