예제 #1
0
def index_processed_file(index, writer):
    """Emit one packed index entry per parent-word chunk of a processed file.

    Reads ``gram2_<index>.processed`` under ``GPFS_STORAGE``. Each parent line
    starts a chunk: the parent line itself plus the next ``skip_lines`` lines
    (the count returned by ``parse_parent_line``). For every chunk an
    ``IndexEntry`` is built from the MD5 hex digest of the word, ``index``,
    the chunk's starting offset and its size, and its packed form is passed
    to ``writer.write``.

    Args:
        index: Identifier substituted into the data file name and stored in
            every index entry.
        writer: Object whose ``write`` method receives each packed entry.

    Raises:
        ValueError: If a non-empty line that is not a parent line is found
            where a new chunk should begin.
    """
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    with open(datafile, 'r') as f:
        chunk_start = 0
        line = f.readline()
        while True:
            if not is_parent_line(line):
                if line == '':
                    # readline() returns '' only at end of file: done.
                    break
                raise ValueError('Improper data file %s' % datafile)

            word, skip_lines, _unused = parse_parent_line(line)
            word_hash = hashlib.md5(word).hexdigest()

            # Consume the lines that belong to this parent so the file
            # position ends up at the start of the next chunk.
            for _ in range(skip_lines):
                f.readline()

            chunk_end = f.tell()
            entry = IndexEntry(
                word_hash, index, chunk_start, chunk_end - chunk_start)
            writer.write(entry.pack())

            # The next chunk begins exactly where this one ended.
            chunk_start = chunk_end
            line = f.readline()
예제 #2
0
def extract_parent_word(index, starting, chunk_size):
    """Return the parent word found at the start of a chunk, or ``None``.

    Seeks to ``starting`` in ``gram2_<index>.processed`` under
    ``GPFS_STORAGE``, reads ``chunk_size`` units from there, and examines only
    the text before the first newline of what was read.

    Args:
        index: Identifier substituted into the data file name.
        starting: Offset passed to ``seek`` before reading.
        chunk_size: Amount passed to ``read``.

    Returns:
        The word reported by ``parse_parent_line`` for the first line, or
        ``None`` when that line is not a parent line.
    """
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    with open(datafile, 'r') as df:
        df.seek(starting)
        chunk = df.read(chunk_size)

    # Only the first line of the chunk matters here.
    head = chunk.partition("\n")[0]
    if is_parent_line(head):
        parent_word, _, _ = parse_parent_line(head)
        return parent_word
    return None
예제 #3
0
def create_filter(datafile, force=False):
    """Build a Bloom filter of the parent words in ``datafile`` and save it.

    The filter is stored next to the data file as ``<datafile name>.filter``.
    If that file already exists it is left untouched unless ``force`` is true.
    A "done" message naming the filter file is printed in either case.

    An empty data file, or one that ends in the middle of a skipped run of
    lines, yields a filter built from whatever parent lines were seen.

    Args:
        datafile: Path to an existing data file.
        force: When true, rebuild the filter even if the filter file exists.

    Raises:
        AssertionError: If ``datafile`` is not an existing regular file
            (only when assertions are enabled).
    """
    assert os.path.isfile(datafile)
    datadir, datafilename = os.path.split(datafile)
    filter_file = os.path.join(datadir, datafilename + ".filter")

    if force or not os.path.isfile(filter_file):
        bf = BloomFilter(capacity=1e6)
        with open(datafile) as df:
            for line in df:
                if not is_parent_line(line):
                    continue
                word, skips, _unused = parse_parent_line(line)
                bf.add(word)
                # Fast-forward over ``skips - 1`` following lines; whatever
                # comes next is picked up by the outer loop and ignored there
                # unless it is itself a parent line.
                for _ in range(1, skips):
                    # A sentinel default avoids StopIteration when the file
                    # ends early; the outer loop then terminates on its own.
                    if next(df, None) is None:
                        break

        # NOTE(review): binary mode assumes BloomFilter.tofile emits raw
        # bytes (as pybloom's does) -- confirm against the BloomFilter in use.
        with open(filter_file, 'wb') as ff:
            bf.tofile(ff)

    print("%s done." % filter_file)