Пример #1
0
 def check_coocc(self, output, copies_number=COPIES_NUMBER):
     """
     Assert that the model saved at `output` equals the reference model times `copies_number`.

     :param output: path of the Cooccurrences model to verify.
     :param copies_number: factor by which every reference matrix element is multiplied \
                           before the comparison.
     :return: None
     """
     expected = Cooccurrences().load(models.COOCC)
     actual = Cooccurrences().load(output)
     self.assertEqual(len(actual.tokens), len(expected.tokens))
     # The two models may list tokens in a different order, so for every token of the
     # checked model look up its position in the reference one.
     order = []
     for token in actual.tokens:
         order.append(expected.tokens.index(token))
     merged = actual.matrix.todense()
     # Reorder reference rows and columns to follow the token order of the checked model.
     reference = expected.matrix.todense()[order][:, order]
     self.assertTrue(numpy.all(merged == copies_number * reference))
Пример #2
0
    def test_overflow_with_spark(self):
        """
        Check that merged values stay within [0, MAX_INT32] when an input is close to the limit.

        The first stored value of the first input model is replaced with MAX_INT32 minus
        itself before merging, so the merged sum is expected to hit the saturation path.
        """
        with tempfile.TemporaryDirectory(prefix="merge-coocc-entry-test") as input_dir:
            self.copy_models(models.COOCC, input_dir, COPIES_NUMBER)
            args = get_args(input_dir, False)
            first_path = args.input[0]

            # Rewrite the first input in place with a near-limit value.
            model = Cooccurrences().load(first_path)
            model.matrix.data[0] = MAX_INT32 - model.matrix.data[0]
            model.save(first_path)

            merge_coocc(args)

            merged = Cooccurrences().load(args.output)
            self.assertTrue(numpy.all(merged.matrix.data <= MAX_INT32))
            self.assertTrue(numpy.all(merged.matrix.data >= 0))
Пример #3
0
def merge_coocc_no_spark(df, filepaths, log, args):
    """
    Sum several Cooccurrences models into a single one without PySpark and save it.

    How the summation stays safe:

    1. The accumulator is stored as uint32, yet its elements never exceed MAX_INT32.
    2. Both the accumulator and every added matrix are assumed to hold no elements greater
       than MAX_INT32.
    3. As soon as a sum becomes greater than MAX_INT32, that element is saturated.
    4. Half of the uint32 range is given up this way, but no additional memory is allocated
       and the check is faster than MAX_UINT32 checks.
    5. How many elements saturate in PGA is still an open question ("?"); this is considered
       acceptable.

    :param df: document frequencies model; `df.order` maps a token to its index, `len(df)` \
               is used as the index for tokens missing from it.
    :param filepaths: paths of the Cooccurrences models to merge.
    :param log: logger used for progress and overflow reports.
    :param args: namespace whose `output` attribute is the path of the merged model.
    :return: None
    """
    # TODO(zurk): recheck the number of saturated elements.
    log.info("Merging cooccurrences without using PySpark")
    # One extra row/column collects every token that is absent from `df`.
    unknown_index = len(df)
    shape = (unknown_index + 1, unknown_index + 1)
    merged = coo_matrix(shape, dtype=np.uint32)
    for _path, model in load_and_check(filepaths, log):
        model._matrix = coo_matrix(model._matrix)
        # Position of every model token inside the merged matrix.
        remap = [df.order.get(token, unknown_index) for token in model.tokens]
        rows = [remap[i] for i in model.matrix.row]
        cols = [remap[j] for j in model.matrix.col]
        addend = coo_matrix((model.matrix.data, (rows, cols)), shape=shape, dtype=np.uint32)
        merged += addend
        overflowed = np.where(merged.data > MAX_INT32)
        overflow_count = overflowed[0].size
        if overflow_count > 0:
            log.warning(
                "Overflow in %d elements. They will be saturated to MAX_INT32",
                overflow_count)
            merged.data[overflowed] = MAX_INT32
    # Drop the extra row and column before saving.
    merged_model = Cooccurrences()
    merged_model.construct(df.tokens(), merged[:-1, :-1]).save(args.output, (df,))
Пример #4
0
    def test_load_and_check(self):
        """
        Check both sanity rules of `load_and_check`.

        A model with a negative element must be skipped, and a model with an element above
        MAX_INT32 must be yielded with its values capped at MAX_INT32.
        """
        log = logging.getLogger("test")
        with tempfile.TemporaryDirectory(prefix="merge-coocc-entry-test") as input_dir:
            self.copy_models(models.COOCC, input_dir, COPIES_NUMBER)
            args = get_args(input_dir, True)
            target = args.input[0]

            # Rule 1: a negative element marks the model as corrupted, so it is not yielded.
            broken = Cooccurrences().load(target)
            broken.matrix.data[0] = -1
            broken.save(target)
            yielded = list(load_and_check(args.input, log))
            self.assertEqual(len(yielded), 2)

            # Rule 2: switch to uint32 so a value above MAX_INT32 can be stored, then check
            # that the first yielded model has no element above MAX_INT32.
            oversized = Cooccurrences().load(target)
            oversized.matrix.data = numpy.uint32(oversized.matrix.data)
            oversized.matrix.data[0] = MAX_INT32 + 1
            oversized.save(target)
            for _, coocc in load_and_check(args.input, log):
                self.assertTrue(numpy.all(coocc.matrix.data <= MAX_INT32))
                break
Пример #5
0
def load_and_check(filepaths: list, log: logging.Logger):
    """
    Lazily load Cooccurrences models and sanity-check their values.

    1. A model with negative values is considered corrupted: it is reported and skipped.
    2. Values greater than MAX_INT32 are reported and saturated to MAX_INT32 in place.

    :param filepaths: paths of the Cooccurrences models to load.
    :param log: logger used for the progress bar and for the warnings.
    :return: generator of (path, model) pairs for every model which is not skipped.
    """
    for path in progress_bar(filepaths, log):
        coocc = Cooccurrences().load(path)
        if np.any(coocc.matrix.data < 0):
            log.warning("Model %s is corrupted and will be skipped. "
                        "It contains negative elements.", path)
            continue
        oversized = coocc.matrix.data > MAX_INT32
        if np.any(oversized):
            log.warning("Model %s contains elements with values more than MAX_INT32. "
                        "They will be saturated to MAX_INT32", path)
            coocc.matrix.data[oversized] = MAX_INT32
        yield path, coocc
Пример #6
0
    def __call__(self, sparse_matrix: PipelinedRDD):
        """
        Collect the co-occurrence triples and save them as a Cooccurrences asdf model on disk.

        The matrix is square with one row and one column per entry of `self.tokens_list`.
        When the RDD is empty, an empty matrix of that shape is saved instead of failing.

        :param sparse_matrix: rdd with 3 columns: matrix row, matrix column, cell value. Use \
            :class:`.CooccConstructor` to construct RDD from uasts.
        :return: None
        """
        rows = sparse_matrix.collect()
        tokens_num = len(self.tokens_list)
        shape = (tokens_num, tokens_num)

        self._log.info("Building matrix...")
        if rows:
            # Transpose the list of (row, column, value) triples into three parallel tuples.
            mat_row, mat_col, mat_weights = zip(*rows)
            matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)), shape=shape)
        else:
            # zip(*[]) yields nothing, so unpacking it into three names would raise an
            # uninformative ValueError; build an all-zero matrix explicitly instead.
            self._log.warning("No co-occurrences were collected, the matrix is empty")
            matrix = sparse.coo_matrix(shape)
        Cooccurrences() \
            .construct(self.tokens_list, matrix) \
            .save(self.output, deps=(self.df_model,))
Пример #7
0
def id2vec_preprocess(args):
    """
    Load a co-occurrence matrix together with a document frequencies model and generate
    the Swivel protobuf dataset.

    The following files are written into the `args.output` directory:

    - row_vocab.txt and col_vocab.txt: the sorted vocabulary, one token per line;
    - row_sums.txt and col_sums.txt: the number of stored elements in every matrix row;
    - shard-RRR-CCC.pb: one serialized `tf.train.Example` per (row, column) shard.

    :param args: :class:`argparse.Namespace` with "input", "docfreq_in", "log_level", \
                 "vocabulary_size", "shard_size" and "output".
    :return: None
    :raises ValueError: if the co-occurrence matrix contains negative or nan elements, \
                        if the provided docfreq model does not match the dependency stored \
                        inside the Cooccurrences model, or if the effective vocabulary size \
                        is less than the shard size.
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(
        source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError(
            ("Co-occurrence matrix %s contains negative elements. "
             "Please check its correctness.") % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError(("Co-occurrence matrix %s contains nan elements. "
                          "Please check its correctness.") % args.input)

    # Only the dependency lookup is expected to raise KeyError, so it is the only
    # statement guarded by the try block.
    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
    except KeyError:
        pass  # There is no docfreq dependency
    else:
        if df_model.meta != df_meta:
            raise ValueError((
                "Document frequency model you provided does not match dependency inside "
                "Cooccurrences model:\nargs.docfreq.meta:\n%s\ncoocc_model.get_dep"
                "(\"docfreq\")\n%s\n") % (df_model.meta, df_meta))

    # The vocabulary cannot be bigger than the docfreq model.
    vs = min(args.vocabulary_size, len(df_model))
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a smaller "
            "shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    # Round the vocabulary size down to a multiple of the shard size.
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}

    # Create the output directory once, without a check-then-create race.
    os.makedirs(args.output, exist_ok=True)
    row_vocab_path = os.path.join(args.output, "row_vocab.txt")
    with open(row_vocab_path, "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    # Rows and columns share the same vocabulary.
    shutil.copyfile(row_vocab_path, os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words

    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)

    log.info("Planning the sharding...")
    # Differences of consecutive indptr values give the number of stored elements per row.
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    # Row indices ordered from the most populated row to the least populated one.
    reorder = numpy.argsort(-bool_sums)
    row_sums_path = os.path.join(args.output, "row_sums.txt")
    with open(row_sums_path, "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(row_sums_path, os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")

    log.info("Writing the shards...")
    nshards = vs // sz
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        # The row selection does not depend on the column, so it is done once per row.
        indices_row = reorder[row::nshards]
        row_block = ccmatrix[indices_row]
        for col in range(nshards):
            indices_col = reorder[col::nshards]
            shard = row_block[:, indices_col].tocoo()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)
                }))

            shard_path = os.path.join(args.output, "shard-%03d-%03d.pb" % (row, col))
            with open(shard_path, "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
Пример #8
0
 def setUp(self):
     """Load the reference Cooccurrences model from `paths.COOCC` before every test."""
     model = Cooccurrences()
     self.model = model.load(source=paths.COOCC)