Example #1
def process_file(descr):
    """Process a single file."""

    while not allowed_to_dispatch():
        time.sleep(300)

    n, prefix = descr

    local_root = args.output
    remote_root = "http://storage.googleapis.com/books/ngrams/books/"

    filename = ngram_filename(n, prefix)
    local_path = os.path.join(local_root, filename)
    remote_path = urllib.parse.urljoin(remote_root, filename + ".gz")

    def print_status(message, filename):
        now = datetime.datetime.now()
        print("{now} {message} {filename}".format(**locals()))

    with open_file_to_process(local_path, "wb") as f:
        if f is False:
            print_status("Skipped", filename)
            raise FileAlreadyProcessed()

        print_status("Processing", filename)

        # Generate iterators over ngrams
        source_ngrams = iter_remote_gzip(remote_path)
        processed_ngrams = integrate_pure_ngram_counts(source_ngrams, n)

        # Save the integrated ngram counts to a file
        ngrams_iter2file(processed_ngrams, f)

        print_status("Finished", filename)
Example #2
def process_file(n, prefix):
    """Process a single file."""

    filename = ngram_filename(n, prefix)
    output_path = os.path.join(args.output, filename)

    with open_file_to_process(output_path, "w") as o:
        if o is False:
            print_status("Skipped", filename)
            raise FileAlreadyProcessed()

        print_status("Processing", filename)

        if numeric_token(prefix):
            return

        input_path = os.path.join(args.input, filename)
        with open(input_path, "r") as i:
            for line in itertools.filterfalse(contains_digits, i):
                o.write(line)
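
The contains_digits and numeric_token predicates are likewise defined elsewhere in the surrounding script. A plausible minimal version, assuming they simply test for decimal digits, could be (hypothetical):

def contains_digits(line):
    # Hypothetical sketch: True if any character in the line is a digit.
    return any(c.isdigit() for c in line)

def numeric_token(prefix):
    # Hypothetical sketch: True if the prefix is entirely numeric.
    return prefix.isdigit()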
Example #3
def output_ngram(l, count, out):
    """
    Output a normalised ngram to an appropriate file. The input ngram includes
    empty tokens.
    """
    n = len(l)

    # See if an appropriate output file is already open
    prefix = normalised_token_prefix(l[0], n)
    if (n, prefix) not in out:
        # Close all files if too many are open. The elegant way would be to
        # maintain the files in the order of last access and close only the one
        # that was accessed longest ago (see the LRU sketch after this example),
        # but this hack works for now and efficiency is not key in this script.
        if len(out) > 1000:
            close_output_files(out)

        filename = ngram_filename(n, prefix)
        path = os.path.join(args.output, filename)
        out[(n, prefix)] = open(path, "a")

    # Write the ngram to the output file
    out[(n, prefix)].write("\t".join(l + (count,)))
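
The comment above mentions a least-recently-used alternative to closing every handle at once. A small sketch of that variant, assuming out is a collections.OrderedDict keyed by (n, prefix) instead of a plain dict, could look like this (hypothetical, not part of the original script):

import collections

MAX_OPEN_FILES = 1000

def get_output_file(key, path, out):
    # Hypothetical LRU variant: move a hit to the end of the OrderedDict and
    # evict only the handle that was accessed longest ago when the limit is
    # reached.
    if key in out:
        out.move_to_end(key)
    else:
        if len(out) >= MAX_OPEN_FILES:
            _, oldest = out.popitem(last=False)
            oldest.close()
        out[key] = open(path, "a")
    return out[key]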
Example #4
with open(args.ngrams, 'r') as f:
    ngrams = json.load(f)

    # Partitions are the 1gram prefixes ordered alphabetically
    partitions = sorted(ngrams["1"])
    partitions_set = frozenset(partitions)

    # Dictionary holding cumulative frequency ranges in each partition
    cumfreq_ranges = {}

    for n in sorted(ngrams.keys()):
        # Calculate total frequencies in each partition
        cumfreqs = {}
        for prefix in ngrams[n]:
            partition = get_partition(prefix, partitions_set)
            path = os.path.join(args.input, ngram_filename(n,prefix))
            cumfreqs[partition] = (cumfreqs.get(partition, 0)
                + calculate_cumfreq(path))
            print("Counted cumulative frequency for FILE {path}".format(
                **locals()))

        # Calculate cumulative frequency ranges in each partition
        cumfreq_ranges[n] = {}
        cumfreq = 0
        for partition in partitions:
            cumfreq_ranges[n][partition] = (cumfreq,
                                            cumfreq + cumfreqs[partition])
            cumfreq += cumfreqs[partition]

    with open(args.output, "w") as f:
        json.dump(cumfreq_ranges, f)
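
As a toy illustration of the range computation above, with made-up per-partition totals the loop produces contiguous, non-overlapping ranges whose bounds accumulate:

cumfreqs = {"a": 10, "b": 5, "c": 20}     # hypothetical partition totals
partitions = sorted(cumfreqs)
cumfreq_ranges, cumfreq = {}, 0
for partition in partitions:
    cumfreq_ranges[partition] = (cumfreq, cumfreq + cumfreqs[partition])
    cumfreq += cumfreqs[partition]
# cumfreq_ranges == {"a": (0, 10), "b": (10, 15), "c": (15, 35)}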
Example #5
def upload_ngrams(n, prefixes, index_ranges, cumfreq_ranges):
    """Upload ngrams for a particular n to the PostgreSQL database."""
    def get_column_definitions(n):
        return ",\n".join(
            map(lambda x: "w{} INTEGER".format(x), range(1, n + 1)))

    def get_column_names(n):
        return ", ".join(map(lambda x: "w{}".format(x), range(1, n + 1)))

    # Generate table and columns definitions
    table = get_table_name(args.dataset, "{n}grams".format(**locals()))
    context_table = get_table_name(args.dataset,
                                   "{n}grams__context".format(**locals()))
    column_definitions = get_column_definitions(n)
    columns = get_column_names(n)

    if not is_completed("{n}grams_create_parent_tables".format(**locals())):
        # Create parent ngrams table
        cur.execute("""
            DROP TABLE IF EXISTS {table} CASCADE;

            CREATE TABLE {table} (
              i SERIAL,
              {column_definitions},
              cf1 BIGINT,
              cf2 BIGINT
            );
            """.format(**locals()))
        print("Created TABLE {table}".format(**locals()))

        # Create parent context table
        if n > 1:
            context_column_definitions = get_column_definitions(n - 1)
            context_columns = get_column_names(n - 1)

            cur.execute("""
                DROP TABLE IF EXISTS {context_table} CASCADE;

                CREATE TABLE {context_table} (
                  i SERIAL,
                  {context_column_definitions},
                  cf1 BIGINT,
                  cf2 BIGINT
                );
                """.format(**locals()))
        else:
            cur.execute("""
                DROP TABLE IF EXISTS {context_table};

                CREATE TABLE {context_table} (
                  i SERIAL PRIMARY KEY,
                  cf1 BIGINT,
                  cf2 BIGINT
                );
                """.format(**locals()))
        print("Created context TABLE {context_table}".format(**locals()))

        # Commit defining parent tables
        conn.commit()
        complete("{n}grams_create_parent_tables".format(**locals()))

    # Populate respective partition tables
    for partition in sorted(prefixes.keys()):
        if is_completed(
                "{n}grams_{partition}_analyse_partition".format(**locals())):
            continue

        # Define various properties of the partition table, such as its name and
        # the range of data it is supposed to contain
        partition_table = get_table_name(
            args.dataset, "{n}grams_{partition}".format(**locals()))
        index_range = index_ranges[partition]
        cumfreq_range = cumfreq_ranges[partition]

        if not is_completed(
                "{n}grams_{partition}_create_tables".format(**locals())):
            # Create the partition table
            cur.execute("""
                DROP TABLE IF EXISTS {partition_table};

                CREATE TABLE {partition_table} (
                  PRIMARY KEY (i),
                  CHECK (     w1 >= {index_range[0]}
                          AND w1 <= {index_range[1]}
                          AND cf1 >= {cumfreq_range[0]}
                          AND cf1 <= {cumfreq_range[1]}
                          AND cf2 >= {cumfreq_range[0]}
                          AND cf2 <= {cumfreq_range[1]} )
                ) INHERITS ({table});
                """.format(**locals()))
            print(
                "Created partition TABLE {partition_table}".format(**locals()))

            # If n > 1, then data in the context table should be partitioned too
            if n > 1:
                context_partition_table = get_table_name(
                    args.dataset,
                    "{n}grams_{partition}__context".format(**locals()))

                cur.execute("""
                    DROP TABLE IF EXISTS {context_partition_table};

                    CREATE TABLE {context_partition_table} (
                      PRIMARY KEY (i),
                      CHECK (     w1 >= {index_range[0]}
                              AND w1 <= {index_range[1]}
                              AND cf1 >= {cumfreq_range[0]}
                              AND cf1 <= {cumfreq_range[1]}
                              AND cf2 >= {cumfreq_range[0]}
                              AND cf2 <= {cumfreq_range[1]} )
                    ) INHERITS ({context_table});
                    """.format(**locals()))
                print("Created context partition TABLE "
                      "{context_partition_table}".format(**locals()))

            # Commit creating ngrams and context partition tables
            conn.commit()
            complete("{n}grams_{partition}_create_tables".format(**locals()))

        for prefix in prefixes[partition]:
            if is_completed(
                    "{n}grams_{prefix}_analyse_prefix".format(**locals())):
                continue

            path = os.path.join(args.input, ngram_filename(n, prefix))
            raw_tmp_table = get_table_name(
                args.dataset, "tmp_raw__{n}grams_{prefix}".format(**locals()))
            cumfreq_tmp_table = get_table_name(
                args.dataset,
                "tmp_cumfreq__{n}grams_{prefix}".format(**locals()))

            # Copy ngrams starting with a particular prefix into a temporary
            # table and cumulate their frequencies (a toy illustration of the
            # cf1/cf2 arithmetic follows this example)
            cur.execute(
                """
                DROP TABLE IF EXISTS {raw_tmp_table};

                CREATE TABLE {raw_tmp_table} (
                  i SERIAL PRIMARY KEY,
                  {column_definitions},
                  f BIGINT
                );

                DROP TABLE IF EXISTS {cumfreq_tmp_table};

                CREATE TABLE {cumfreq_tmp_table} (
                  i SERIAL PRIMARY KEY,
                  {column_definitions},
                  cf1 BIGINT,
                  cf2 BIGINT
                );

                COPY
                  {raw_tmp_table} ({columns}, f)
                FROM
                  %s;

                INSERT INTO
                  {cumfreq_tmp_table} ({columns}, cf1, cf2)
                SELECT
                  {columns},
                  sum(f) OVER (ORDER BY {columns} ASC) - f
                    + (SELECT coalesce(max(cf2),0) FROM {table}) AS cf1,
                  sum(f) OVER (ORDER BY {columns} ASC)
                    + (SELECT coalesce(max(cf2),0) FROM {table}) AS cf2
                FROM
                  {raw_tmp_table};

                DROP TABLE {raw_tmp_table};
                """.format(**locals()), (path, ))
            print("Copied FILE {path} to TABLE {cumfreq_tmp_table}".format(
                **locals()))

            # Insert ngrams with this prefix into the partition table
            cur.execute("""
                INSERT INTO
                  {partition_table} ({columns}, cf1, cf2)
                SELECT
                  {columns}, cf1, cf2
                FROM
                  {cumfreq_tmp_table}
                ORDER BY
                  i ASC;
                """.format(**locals()))
            print("Copied TABLE {cumfreq_tmp_table} to TABLE "
                  "{partition_table}".format(**locals()))

            # Insert ngrams with this prefix into the context partition table
            if n > 1:
                cur.execute("""
                  INSERT INTO
                    {context_partition_table} ({context_columns}, cf1, cf2)
                  SELECT
                    {context_columns},
                    min(cf1) AS cf1,
                    max(cf2) AS cf2
                  FROM
                    {cumfreq_tmp_table}
                  GROUP BY
                    {context_columns}
                  -- This is much faster than "ORDER BY min(i)"; the reason is
                  -- worth investigating
                  ORDER BY
                    {context_columns} ASC;
                  """.format(**locals()))
                print(
                    "Cumulated and copied TABLE {cumfreq_tmp_table} to TABLE "
                    "{context_partition_table}".format(**locals()))

            cur.execute("""
              DROP TABLE {cumfreq_tmp_table};
              """.format(**locals()))

            # Commit changes due to processing a single prefix file
            conn.commit()
            complete("{n}grams_{prefix}_analyse_prefix".format(**locals()))

        # Index the ngrams partition table. Making the index on columns unique
        # ensures that no leaves of the probability tree are duplicated.
        cur.execute("""
            CREATE UNIQUE INDEX ON {partition_table}
                USING btree ({columns})
                WITH (fillfactor = 100);

            CREATE UNIQUE INDEX ON {partition_table}
                USING btree (cf1, cf2)
                WITH (fillfactor = 100);
            """.format(**locals()))
        print("Created UNIQUE INDEXES on ({columns}) and (cf1, cf2) in TABLE "
              "{partition_table}".format(**locals()))

        # Index the ngrams context partition table. Since ngrams are added from
        # the prefix files sequentially, if it happened that two ngrams starting
        # with the same (w1, ..., w(n-1)) were wrongly put in different prefix
        # files, an error will occur. Ngrams starting with the same (w1, ...,
        # w(n-2)) are not a problem, since we will always query for P(w(n) | w1,
        # ..., w(n-1)).
        if n > 1:
            cur.execute("""
              CREATE UNIQUE INDEX ON {context_partition_table}
                  USING btree ({context_columns})
                  WITH (fillfactor = 100);
              """.format(**locals()))
            print("Created UNIQUE INDEX on ({context_columns}) in TABLE "
                  "{context_partition_table}".format(**locals()))

        # Commit indexing ngrams and context tables after processing all
        # corresponding prefix files
        conn.commit()
        complete("{n}grams_{partition}_analyse_partition".format(**locals()))

    # Create the context table for 1grams
    if n == 1:
        cur.execute("""
          INSERT INTO
            {context_table} (cf1, cf2)
          SELECT
            min(cf1) AS cf1,
            max(cf2) AS cf2
          FROM
            {table};
          """.format(**locals()))
        print("Cumulated and copied TABLE {table} to TABLE "
              "{context_table}".format(**locals()))

        # Commit creating context for 1grams
        conn.commit()

    complete("{n}grams_analyse".format(**locals()))
Example #6
def process_file(n, prefix):
    """
    Process a single file. Since ngrams will change size and partition, they
    will be appended to existing files containing ngram counts from other prefix
    files. As a result, changes introduced by partial processing of a file
    cannot be rolled back easily -- there is no progress tracking, so the
    whole script needs to be restarted from scratch if interrupted midway.
    """

    filename = ngram_filename(n, prefix)
    path = os.path.join(args.input, filename)

    print_status("Processing", filename)

    # Dictionary of all possible output files
    out = dict()

    with open(path, "r") as i:
        for line in i:
            l_original = line.split("\t")

            # Normalise and explode original tokens
            l = tuple(normalise_and_explode_token(t) for t in l_original[:-1])

            # Count the exploded size of each original token
            s = tuple(len(t) for t in l)

            # Discard ngrams with empty original edge tokens - a lower order
            # ngram already handles these counts
            if s[0] == 0 or s[-1] == 0:
                continue

            # There are at least two original tokens, so both edge tokens exist
            if n >= 2:
                # Count the total exploded size of middle original tokens, these
                # have to be included in the output
                middle_s = sum(s[1:-1])

                # Count the maximum number of normalised tokens that can come
                # from the original edge tokens
                max_edge_s = args.n_max - middle_s

                # There are too many exploded middle tokens -- the normalised
                # ngram including at least one normalised token from each
                # original edge token would be beyond the order of the model
                if max_edge_s < 2:
                    continue

                # Flatten the original middle tokens
                l_middle = tuple(itertools.chain.from_iterable(l[1:-1]))

                # Consider every combination of normalised edge tokens -- they
                # need to be adjacent to the middle tokens (a short walk-through
                # of these loops follows this example)
                for ls in range(1,min(max_edge_s,s[0])+1):
                    for rs in range(1,min(max_edge_s-ls,s[-1])+1):
                        output_ngram(l[0][-ls:] + l_middle + l[-1][:rs],
                                     l_original[-1], out)

            # There is only one original token
            else:
                for start in range(s[0]):
                    for stop in range(start+1, min(start+args.n_max,s[0])+1):
                        output_ngram(l[0][start:stop], l_original[-1], out)

    close_output_files(out)

    print_status("Finished", filename)
Example #7
    # partition "_" that words from BS_SPECIAL_PREFIXES belong to.
    part2pref = {p:(p,) for p in ngrams["1"] if p not in BS_SPECIAL_PREFIXES}
    part2pref["_"] = BS_SPECIAL_PREFIXES

    # Verify that the implicitly created partitions are correct
    assert(set(part2pref.keys()) == set(BS_PARTITION_NAMES))

    # Go over all partitions and read words from the corresponding prefix files
    gen_index = count(1)
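    # A single count(1) iterator is shared across all partitions, so word
    # indices are globally unique and keep increasing from one partition
    # dump to the next.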
    with open(args.output, "w") as fo:
        for part in BS_PARTITION_NAMES:
            # Initialise all words
            if part == "_":
                words = {"_START_", "_END_"}
            else:
                words = set()

            # Read words from respective prefix files
            for pref in part2pref[part]:
                path = os.path.join(args.input, ngram_filename(1, pref))
                if os.path.isfile(path):
                    with open(path, "r") as fi:
                        for line in fi:
                            words.add(line.split("\t")[0])
                    print("Read words from {path}".format(**locals()))

            # Dump words to the index file
            for w, i in zip(sorted(words), gen_index):
                fo.write("{i}\t{w}\t{part}\n".format(**locals()))
            print("Dumped {part} partition".format(**locals()))
Example #8
def pref_path(pref):
    """Give path to a prefix file."""
    return os.path.join(args.input, ngram_filename(n, pref))
Example #9
def write_ngrams_table(n, prefixes):
    """Writes ngrams counts table for a particular n."""
    def pref_path(pref):
        """Give path to a prefix file."""
        return os.path.join(args.input, ngram_filename(n, pref))

    # Prepare a part2pref dictionary of prefixes corresponding to partitions
    part2pref = {part: set() for part in BS_PARTITION_NAMES}
    for pref in prefixes:
        # Determine which prefix files actually exist. This introduces a race
        # condition, however the assumption is that database will not be
        # modified while this script is running.
        if os.path.exists(pref_path(pref)):
            if pref in BS_SPECIAL_PREFIXES:
                part2pref["_"].add(pref)
            else:
                part2pref[pref[0]].add(pref)

    # Format specifier for a line of the bindb file (a hypothetical
    # reconstruction follows this example)
    fmt = bindb.fmt(n)

    # Dtype of the numpy structured array used for sorting the ngrams
    dtp = (
        # n * little-endian 4 byte integers with token indices
        [("w{}".format(i), "<i4") for i in range(n)] +
        # little-endian 8 byte integer with ngram count
        [("f", "<i8")])

    # Create the bindb file
    output_path = os.path.join(args.output, "{n}gram".format(**locals()))
    with open(output_path, "wb") as fo:
        # Go over the prefix files for each possible partition
        for part in BS_PARTITION_NAMES:
            # Sort the set of prefixes which will contribute to this partition
            # to take advantage of partial sorting (ngrams belonging to the same
            # prefix will still be adjacent in the sorted partition)
            prefs = sorted(part2pref[part])

            # Calculate the maximum number of ngrams in the partition by
            # counting total number of lines in each prefix file
            ngrams_maxn = sum(
                sum(1 for line in open(pref_path(pref), "r"))
                for pref in prefs)

            # Create a numpy array that can contain all potential ngrams
            ngrams = zeros(ngrams_maxn, dtype=dtp)

            # Read the prefix files corresponding to the partition one by one
            i = 0
            for pref in prefs:
                # Simultaneously read ngrams from the prefix file and write
                # those which don't match to the error file
                filename = ngram_filename(n, pref)
                input_path = os.path.join(args.input, filename)
                error_path = os.path.join(args.error, filename)
                with open(input_path, "r") as fi, open(error_path, "w") as fe:
                    for line in fi:
                        ngram = line[:-1].split("\t")
                        try:
                            # Translate all tokens to their indices
                            ixs = tuple(map(index.s2i, ngram[:-1]))
                            # Assert that the partition is correct
                            assert (index.s2p(ngram[0]) == part)
                            # Add the ngram
                            ngrams[i] = ixs + (int(ngram[-1]), )
                            i += 1
                        # If the partition doesn't match or the token cannot be
                        # found in the index
                        except (AssertionError, KeyError):
                            fe.write(line)
                print_status("Read and indexed ngrams from", input_path)
            ngrams_n = i

            # Sort the partition
            ngrams = ngrams[:ngrams_n]
            ngrams.sort(order=["w{}".format(i) for i in range(n)])
            print_status(ngrams_n, "ngrams sorted")

            # Write lines to the binary counts file
            out_count = 0
            current_ngram = tuple()
            current_f = 0
            for i in range(ngrams_n):
                ngram_i = tuple(ngrams[i])[:-1]

                # Compare this ngram to the currently deduplicated ngram
                if ngram_i == current_ngram:
                    current_f += ngrams[i]["f"]
                else:
                    if i != 0:
                        fo.write(
                            struct.pack(fmt, *current_ngram + (current_f, )))
                        out_count += 1
                    current_ngram = ngram_i
                    current_f = ngrams[i]["f"]

                # Write a line in the last loop iteration
                if i == ngrams_n - 1:
                    fo.write(struct.pack(fmt, *current_ngram + (current_f, )))
                    out_count += 1

            print_status(out_count, "ngrams integrated and saved to",
                         output_path)
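
The exact bindb.fmt specifier is not shown in these examples. Judging from the numpy dtype above, it is assumed to describe n little-endian 4-byte token indices followed by one little-endian 8-byte count, roughly:

import struct

def bindb_fmt(n):
    # Hypothetical reconstruction of bindb.fmt(n): n int32 token indices
    # followed by an int64 count, all little-endian.
    return "<" + "i" * n + "q"

record = struct.pack(bindb_fmt(3), 7, 42, 99, 123456789)
assert len(record) == 3 * 4 + 8   # matches the <i4 / <i8 dtype fields above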