def index_processed_file(index, writer):
    """Build index records for one processed data file.

    Scans ``gram2_<index>.processed`` under GPFS_STORAGE and, for every
    parent line, writes a packed IndexEntry to *writer*: the md5 hex
    digest of the parent word, the file index, the byte offset where the
    parent line starts, and the byte length of the parent line plus its
    child lines.

    Raises ValueError when a non-empty line that is not a parent line is
    encountered; stops at the trailing empty line.

    NOTE(review): Python 2 style — ``md5.update(word)`` is called with a
    str; on Python 3 the word would need encoding to bytes first.
    """
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    with open(datafile, 'r') as f:
        pos = 0  # byte offset where the current line starts
        line = f.readline()
        while True:
            if is_parent_line(line):
                word, skip_lines, _ = parse_parent_line(line)
                starting_pos = pos
                md5 = hashlib.md5()
                md5.update(word)
                word_hash = md5.hexdigest()
                # Consume the child lines that belong to this parent.
                # NOTE(review): this skips `skip_lines` full lines, while
                # create_filter() skips only `skips - 1` per parent —
                # confirm which interpretation of parse_parent_line's
                # count is the intended one.
                for i in range(0, skip_lines):
                    f.readline()
                # Chunk spans the parent line plus its skipped children.
                chunk_size = f.tell() - starting_pos
                index_entry = IndexEntry(word_hash, index, starting_pos, chunk_size)
                writer.write(index_entry.pack())
                pos = f.tell()
            else:
                if line == '':
                    break  # last line is empty in the data file, we are done here
                else:
                    raise ValueError('Improper data file %s' % datafile)
            line = f.readline()
def extract_parent_word(index, starting, chunk_size):
    """Return the parent word stored at a given chunk of a processed file.

    Reads ``chunk_size`` bytes at byte offset ``starting`` from
    ``gram2_<index>.processed`` under GPFS_STORAGE and parses the first
    line of that chunk.  Returns None when that line is not a parent line.
    """
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    with open(datafile, 'r') as df:
        df.seek(starting)
        chunk = df.read(chunk_size)
        first_line = chunk.split("\n")[0]
        if is_parent_line(first_line):
            word, _, _ = parse_parent_line(first_line)
            return word
        return None
def create_filter(datafile, force=False):
    """Build a Bloom filter of the parent words in *datafile*.

    The filter is written next to the data file as ``<name>.filter``.
    Construction is skipped when that file already exists, unless *force*
    is true.

    NOTE(review): Python 2 code (uses ``xrange``).  ``assert`` is stripped
    under ``-O``; consider raising explicitly for input validation.
    """
    assert os.path.isfile(datafile)
    datadir, datafilename = os.path.split(datafile)
    filter_file = os.path.join(datadir, datafilename + ".filter")
    if force or not os.path.isfile(filter_file):
        # Capacity is passed as a float (1e6); presumably BloomFilter
        # accepts that — TODO confirm an int is not required.
        bf = BloomFilter(capacity=1e6)
        with open(datafile) as df:
            line = next(df)
            try:
                while True:
                    if is_parent_line(line):
                        word, skips, _ = parse_parent_line(line)
                        bf.add(word)
                        # Consume this parent's child lines.
                        # NOTE(review): skips `skips - 1` lines here, while
                        # index_processed_file() skips `skip_lines` full
                        # lines per parent — confirm which count is right.
                        for i in xrange(1, skips):
                            next(df)
                    # Non-parent lines are silently skipped here, unlike
                    # index_processed_file() which raises ValueError —
                    # presumably intentional best-effort; verify.
                    line = next(df)
            except StopIteration:
                # End of file reached: persist the filter to disk.
                with open(filter_file, 'w') as ff:
                    bf.tofile(ff)
                del bf
    print("%s done." % filter_file)