Exemplo n.º 1
0
        # Drop everything before the first blank line in each document
        # (presumably a 20-newsgroups-style header block).
        # NOTE(review): str.index raises ValueError if '\n\n' is absent —
        # assumes every doc contains a blank line; confirm upstream.
        for doc in raw_data_train:
            doc['text'] = doc['text'][doc['text'].index('\n\n'):]
        for doc in raw_data_test:
            doc['text'] = doc['text'][doc['text'].index('\n\n'):]

    # keep this pseudo-ProdLDA version
    # Persist the replicated preprocessing outputs: sparse count matrices,
    # vocabulary, tokenized text, raw documents, and document ids.
    Path("replicated").mkdir(exist_ok=True)
    save_sparse(sparse.coo_matrix(raw_counts_train), "./replicated/train.npz")
    save_sparse(sparse.coo_matrix(raw_counts_test), "./replicated/test.npz")

    save_json(vocab, "./replicated/train.vocab.json")

    save_json(raw_tokens_train, "./replicated/train.tokens.json")
    save_json(raw_tokens_test, "./replicated/test.tokens.json")

    save_jsonlist(raw_data_train, "./replicated/train.jsonlist")
    save_jsonlist(raw_data_test, "./replicated/test.jsonlist")

    save_json([d['id'] for d in raw_data_train], "./replicated/train.ids.json")
    save_json([d['id'] for d in raw_data_test], "./replicated/test.ids.json")

    ## Alignment -- currently ok, but not great

    # tf-idf transform
    # NOTE(review): the fitted transformer is never used in this visible
    # span — either dead code or consumed later outside this chunk; verify.
    tfidf = TfidfTransformer()
    tfidf.fit(
        np.vstack([
            orig_counts_train, orig_counts_test, raw_counts_train,
            raw_counts_test
        ]))
    # NOTE(review): ids are loaded from "./replicated/dev/..." although the
    # saves above wrote to "./replicated/..." — confirm the dev/ ids exist
    # from a prior run or another code path.
    repl_train_ids = load_json("./replicated/dev/train.ids.json")
    repl_dev_ids = load_json("./replicated/dev/dev.ids.json")

    # Load the original (un-split) training artifacts from dev_dir.
    data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
    counts = load_sparse(Path(dev_dir, "train.npz"))
    ids = load_json(Path(dev_dir, "train.ids.json"))

    # split based on how the replication data was split
    # NOTE(review): membership tests against repl_*_ids are O(n) per doc if
    # those are lists; consider sets if this becomes slow.
    data_train = [doc for doc in data if doc['id'] in repl_train_ids]
    data_dev = [doc for doc in data if doc['id'] in repl_dev_ids]

    # Row-select the count matrix with boolean masks built in the same
    # document order as `data`, keeping rows aligned with data_train/data_dev.
    counts_train = counts[
        np.array([doc['id'] in repl_train_ids for doc in data]), :]
    counts_dev = counts[np.array([doc['id'] in repl_dev_ids
                                  for doc in data]), :]

    ids_train = [id for id in ids if id in repl_train_ids]
    ids_dev = [id for id in ids if id in repl_dev_ids]

    # Sanity check: docs, count rows, and ids must stay aligned per split.
    assert (len(data_train) == counts_train.shape[0] == len(ids_train))
    assert (len(data_dev) == counts_dev.shape[0] == len(ids_dev))

    # save
    # NOTE(review): this overwrites the train.* files that were loaded above
    # with the now-filtered train split — re-running is not idempotent.
    save_jsonlist(data_train, Path(dev_dir, "train.jsonlist"))
    save_jsonlist(data_dev, Path(dev_dir, "dev.jsonlist"))

    save_sparse(counts_train, Path(dev_dir, "train.npz"))
    save_sparse(counts_dev, Path(dev_dir, "dev.npz"))

    save_json(ids_train, Path(dev_dir, "train.ids.json"))
    save_json(ids_dev, Path(dev_dir, "dev.ids.json"))
Exemplo n.º 3
0
    # Create the output directory (no-op if it already exists).
    outdir.mkdir(exist_ok=True)

    # copy over the train files
    # Train artifacts are passed through unchanged; only the test set is
    # re-split below.
    shutil.copy(Path(indir, "train.jsonlist"), Path(outdir, "train.jsonlist"))
    shutil.copy(Path(indir, "processed/train.npz"), Path(outdir, "train.npz"))
    shutil.copy(Path(indir, "processed/train.ids.json"),
                Path(outdir, "train.ids.json"))
    shutil.copy(Path(indir, "processed/train.vocab.json"),
                Path(outdir, "train.vocab.json"))

    # read in test
    test_jsonlist = utils.load_jsonlist(Path(indir, "test.jsonlist"))
    test_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
    test_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

    # split into a dev set
    # 50/50 dev/test split; the fixed random_state makes the split
    # reproducible. All three parallel structures (docs, counts, ids) are
    # split together so they stay row-aligned.
    dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
        train_test_split(test_jsonlist,
                         test_counts,
                         test_ids,
                         test_size=0.5,
                         random_state=11225))

    # save
    utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
    utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
    utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))

    utils.save_jsonlist(test_jsonlist, Path(outdir, "test.jsonlist"))
    utils.save_sparse(test_counts, Path(outdir, "test.npz"))
    utils.save_json(test_ids, Path(outdir, "test.ids.json"))
Exemplo n.º 4
0
    # Persist the vocabulary for the train split.
    utils.save_json(vocab_list, f"{args.output_dir}/train.vocab.json")

    # Ids are simply the 0-based positions of each document within its split
    # (no stable external identifiers are used here).
    train_ids = list(range(len(train_doc_list)))
    val_ids = list(range(len(val_doc_list)))
    test_ids = list(range(len(test_doc_list)))

    # save ids
    utils.save_json(train_ids, f"{args.output_dir}/train.ids.json")
    utils.save_json(val_ids, f"{args.output_dir}/dev.ids.json")
    utils.save_json(test_ids, f"{args.output_dir}/test.ids.json")

    # save the raw text
    # Each split is written as a jsonlist of {"id", "text"} records; the
    # generator expressions avoid materializing the full record lists.
    # NOTE(review): `id` shadows the builtin inside the generators — harmless
    # here, but worth renaming if this code is ever revised.
    utils.save_jsonlist(
        ({
            "id": id,
            "text": text
        } for id, text in zip(train_ids, train_doc_list)),
        f"{args.output_dir}/train.jsonlist",
    )
    utils.save_jsonlist(
        ({
            "id": id,
            "text": text
        } for id, text in zip(val_ids, val_doc_list)),
        f"{args.output_dir}/dev.jsonlist",
    )
    utils.save_jsonlist(
        ({
            "id": id,
            "text": text
        } for id, text in zip(test_ids, test_doc_list)),