def process_subset(items, parsed, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    X = np.zeros([n_items, vocab_size], dtype=int)
    counter = Counter()
    word_counter = Counter()
    print("Converting to count representations")
    for i, words in tqdm(enumerate(parsed), total=len(parsed)):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]
        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)
        if len(counter.keys()) > 0:
            # update the counts for row i: add each vocab index's frequency
            values = list(counter.values())
            X[i, list(counter.keys())] += values
    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    save_sparse(sparse_X, os.path.join(output_dir, 'ref.npz'))
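
# Illustrative sketch (not part of the original script): how process_subset might be
# called on a tiny tokenized corpus. The toy names below are hypothetical; `items` is
# only used for its length, so the parsed corpus can be passed for both arguments.
#
#   toy_parsed = [["the", "cat", "sat"], ["the", "dog"]]
#   toy_vocab = ["cat", "dog", "sat", "the"]
#   process_subset(toy_parsed, toy_parsed, toy_vocab,
#                  output_dir="/tmp/ref_out", output_prefix="ref")
#   # -> writes /tmp/ref_out/ref.npz, a 2 x 4 CSR matrix of token counts
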
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Maximum number of tokens to keep in the vocabulary.")
    parser.add_argument("--tokenize", action='store_true',
                        help="Tokenize the input text before vectorizing.")
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer to use when --tokenize is set.")
    parser.add_argument("--reference-corpus-path", type=str, required=False,
                        help="Path to a reference corpus (jsonl); the dev set is used if omitted.")
    parser.add_argument("--tokenize-reference", action='store_true',
                        help="Tokenize the reference corpus before vectorizing.")
    parser.add_argument("--reference-tokenizer-type", type=str, default="just_spaces",
                        help="Tokenizer to use for the reference corpus.")
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)

    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")
    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    tokenized_train_examples = load_data(args.train_path, args.tokenize, args.tokenizer_type)
    tokenized_dev_examples = load_data(args.dev_path, args.tokenize, args.tokenizer_type)

    print("fitting count vectorizer...")
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=args.vocab_size,
                                       token_pattern=r'\b\w+|:|\.|\[|\]\b')
    text = tokenized_train_examples + tokenized_dev_examples
    count_vectorizer.fit(tqdm(text))

    vectorized_train_examples = count_vectorizer.transform(tqdm(tokenized_train_examples))
    vectorized_dev_examples = count_vectorizer.transform(tqdm(tokenized_dev_examples))

    reference_vectorizer = CountVectorizer(stop_words='english',
                                           token_pattern=r'\b\w+|:|\.|\[|\]\b')
    if not args.reference_corpus_path:
        print("fitting reference corpus using development data...")
        reference_matrix = reference_vectorizer.fit_transform(tqdm(tokenized_dev_examples))
    else:
        print(f"loading reference corpus at {args.reference_corpus_path}...")
        reference_examples = load_data(args.reference_corpus_path,
                                       args.tokenize_reference,
                                       args.reference_tokenizer_type)
        print("fitting reference corpus...")
        reference_matrix = reference_vectorizer.fit_transform(tqdm(reference_examples))

    reference_vocabulary = reference_vectorizer.get_feature_names()

    # add @@unknown@@ token vector
    vectorized_train_examples = sparse.hstack(
        (np.array([0] * len(tokenized_train_examples))[:, None], vectorized_train_examples))
    vectorized_dev_examples = sparse.hstack(
        (np.array([0] * len(tokenized_dev_examples))[:, None], vectorized_dev_examples))
    master = sparse.vstack([vectorized_train_examples, vectorized_dev_examples])

    # generate background frequency
    print("generating background frequency...")
    bgfreq = dict(zip(count_vectorizer.get_feature_names(),
                      (np.array(master.sum(0)) / args.vocab_size).squeeze()))

    print("saving data...")
    save_sparse(vectorized_train_examples, os.path.join(args.serialization_dir, "train.npz"))
    save_sparse(vectorized_dev_examples, os.path.join(args.serialization_dir, "dev.npz"))
    if not os.path.isdir(os.path.join(args.serialization_dir, "reference")):
        os.mkdir(os.path.join(args.serialization_dir, "reference"))
    save_sparse(reference_matrix,
                os.path.join(args.serialization_dir, "reference", "ref.npz"))
    write_to_json(reference_vocabulary,
                  os.path.join(args.serialization_dir, "reference", "ref.vocab.json"))
    write_to_json(bgfreq, os.path.join(args.serialization_dir, "vampire.bgfreq"))
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(),
                       os.path.join(vocabulary_dir, "vampire.txt"))
    write_list_to_file(['*tags', '*labels', 'vampire'],
                       os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
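

# Standard script entry point (assumed; add only if main() is not already invoked
# elsewhere in this file). Example invocation with a hypothetical script name and
# hypothetical data paths, using the flags defined above:
#
#   python preprocess_data.py \
#       --train-path data/train.jsonl \
#       --dev-path data/dev.jsonl \
#       --serialization-dir serialization_dir \
#       --vocab-size 10000 \
#       --tokenize \
#       --tokenizer-type just_spaces
if __name__ == "__main__":
    main()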