Example #1

import os
from collections import Counter

import numpy as np
from scipy import sparse
from tqdm import tqdm

# save_sparse is assumed to be a project helper that writes a scipy sparse
# matrix to an .npz file (e.g. a thin wrapper around scipy.sparse.save_npz)


def process_subset(items, parsed, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    X = np.zeros([n_items, vocab_size], dtype=int)

    counter = Counter()
    word_counter = Counter()
    print("Converting to count representations")
    for i, words in tqdm(enumerate(parsed), total=len(parsed)):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if counter:
            # add this document's term counts to row i of the matrix
            X[i, list(counter.keys())] += list(counter.values())

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
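
A minimal usage sketch for process_subset, assuming save_sparse is a thin wrapper around scipy.sparse.save_npz; the toy corpus, vocabulary and output prefix below are purely illustrative:

import os
import tempfile

from scipy import sparse


def save_sparse(matrix, path):
    # assumed stand-in for the project helper: persist the matrix as .npz
    sparse.save_npz(path, matrix.tocsr())


items = ["the cat sat", "the dog barked loudly"]
parsed = [["the", "cat", "sat"], ["the", "dog", "barked", "loudly"]]
vocab = ["the", "cat", "dog", "sat", "barked"]

output_dir = tempfile.mkdtemp()
process_subset(items, parsed, vocab, output_dir, output_prefix="ref")

# each row holds one document's counts over the vocabulary terms
print(sparse.load_npz(os.path.join(output_dir, "ref.npz")).toarray())
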
Example #2

import argparse
import os

import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

# load_data, save_sparse, write_to_json and write_list_to_file are assumed to
# be helpers provided by the surrounding project (e.g. for reading jsonl text,
# saving sparse matrices and writing JSON / plain-text vocabulary files)


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path",
                        type=str,
                        required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path",
                        type=str,
                        required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--serialization-dir",
                        "-s",
                        type=str,
                        required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-size",
                        type=int,
                        required=False,
                        default=10000,
                        help="Maximum size of the vocabulary learned by the count vectorizer.")
    parser.add_argument("--tokenize",
                        action='store_true',
                        help="If set, tokenize the input text before vectorizing.")
    parser.add_argument("--tokenizer-type",
                        type=str,
                        default="just_spaces",
                        help="Tokenizer to use when --tokenize is set.")
    parser.add_argument("--reference-corpus-path",
                        type=str,
                        required=False,
                        help="Path to an optional reference corpus (jsonl); "
                        "if omitted, the dev data is used as the reference.")
    parser.add_argument("--tokenize-reference",
                        action='store_true',
                        help="If set, tokenize the reference corpus before vectorizing.")
    parser.add_argument("--reference-tokenizer-type",
                        type=str,
                        default="just_spaces",
                        help="Tokenizer to use for the reference corpus.")
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")

    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    tokenized_train_examples = load_data(args.train_path, args.tokenize,
                                         args.tokenizer_type)
    tokenized_dev_examples = load_data(args.dev_path, args.tokenize,
                                       args.tokenizer_type)

    print("fitting count vectorizer...")

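    # the token pattern keeps word tokens as well as the literal characters
    # : . [ ] as vocabulary entries; built-in English stop words are dropped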
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b\w+|:|\.|\[|\]\b')

    text = tokenized_train_examples + tokenized_dev_examples

    count_vectorizer.fit(tqdm(text))

    vectorized_train_examples = count_vectorizer.transform(
        tqdm(tokenized_train_examples))
    vectorized_dev_examples = count_vectorizer.transform(
        tqdm(tokenized_dev_examples))

    reference_vectorizer = CountVectorizer(stop_words='english',
                                           token_pattern=r'\b\w+|:|\.|\[|\]\b')
    if not args.reference_corpus_path:
        print("fitting reference corpus using development data...")
        reference_matrix = reference_vectorizer.fit_transform(
            tqdm(tokenized_dev_examples))
    else:
        print(f"loading reference corpus at {args.reference_corpus_path}...")
        reference_examples = load_data(args.reference_corpus_path,
                                       args.tokenize_reference,
                                       args.reference_tokenizer_type)
        print("fitting reference corpus...")
        reference_matrix = reference_vectorizer.fit_transform(
            tqdm(reference_examples))

    reference_vocabulary = reference_vectorizer.get_feature_names()

    # add @@unknown@@ token vector
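    # a zero column is prepended so that the @@UNKNOWN@@ token occupies index 0,
    # matching the vocabulary file written at the end of this function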
    vectorized_train_examples = sparse.hstack(
        (np.array([0] * len(tokenized_train_examples))[:, None],
         vectorized_train_examples))
    vectorized_dev_examples = sparse.hstack(
        (np.array([0] * len(tokenized_dev_examples))[:, None],
         vectorized_dev_examples))
    master = sparse.vstack(
        [vectorized_train_examples, vectorized_dev_examples])

    # generate background frequency
    print("generating background frequency...")
    bgfreq = dict(
        zip(count_vectorizer.get_feature_names(),
            (np.array(master.sum(0)) / args.vocab_size).squeeze()))

    print("saving data...")
    save_sparse(vectorized_train_examples,
                os.path.join(args.serialization_dir, "train.npz"))
    save_sparse(vectorized_dev_examples,
                os.path.join(args.serialization_dir, "dev.npz"))
    if not os.path.isdir(os.path.join(args.serialization_dir, "reference")):
        os.mkdir(os.path.join(args.serialization_dir, "reference"))
    save_sparse(reference_matrix,
                os.path.join(args.serialization_dir, "reference", "ref.npz"))
    write_to_json(
        reference_vocabulary,
        os.path.join(args.serialization_dir, "reference", "ref.vocab.json"))
    write_to_json(bgfreq, os.path.join(args.serialization_dir,
                                       "vampire.bgfreq"))

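    # the vocabulary directory mirrors AllenNLP's Vocabulary serialization
    # layout: one <namespace>.txt file plus non_padded_namespaces.txt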
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, "vampire.txt"))
    write_list_to_file(['*tags', '*labels', 'vampire'],
                       os.path.join(vocabulary_dir,
                                    "non_padded_namespaces.txt"))