parser.add_argument('--vocab-file', action='store', dest='vocab_file',
                    help='vocab directory path', required=True)

args = parser.parse_args()

#
# load data & create vocab
# -------------------------------
#

#_token_indexers = {"tokens": FastTextNGramIndexer(20)}
#_token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

loader = IrTripleDatasetReader(lazy=True,
                               #token_indexers=_token_indexers,
                               tokenizer=BlingFireTokenizer())
                               #tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter())
                               #max_doc_length=200, max_query_length=20, min_doc_length=200, min_query_length=20

instances = loader.read(args.dataset_file)

# batch by length so that the padding inside each batch stays small
_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

#vocab_map, vocab_data = FastTextVocab.load_ids(args.vocab_file, 20)
#vocab = FastTextVocab(vocab_map, vocab_data, 20)
_iterator.index_with(Vocabulary.from_files(args.vocab_file))

with Timer("iterate over all"):
    for i in _iterator(instances, num_epochs=1):
        exit()
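
# A sketch (assumed, not part of the original script): inspect one batch produced by the
# BucketIterator instead of exiting right away. With a single-id token indexer the text
# fields arrive as padded id tensors; "query_tokens" is an assumed field name, the two
# doc fields come from the sorting_keys above.
for batch in _iterator(instances, num_epochs=1):
    print(batch["query_tokens"]["tokens"].shape)    # (batch_size, max_query_length)
    print(batch["doc_pos_tokens"]["tokens"].shape)  # (batch_size, max_doc_pos_length)
    print(batch["doc_neg_tokens"]["tokens"].shape)  # (batch_size, max_doc_neg_length)
    break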
parser.add_argument('--lowercase', action='store', dest='lowercase', type=bool, default=True,
                    help='lowercase the tokens (note: argparse type=bool treats any non-empty string as True)', required=False)

parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()

#
# load data & create vocab
# -------------------------------
#

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

# count in how many documents each (lowercased) token appears
for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):
        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1
        total_documents += 1
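
# A sketch (assumed, not part of the original excerpt): persist the collected document
# frequencies, e.g. to later build a vocabulary with a frequency cutoff.
# `args.out_file` is a hypothetical argument that is not defined above.
with open(args.out_file, "w", encoding="utf8") as out_file:
    for token_text, doc_freq in sorted(all_tokens.items(), key=lambda kv: kv[1], reverse=True):
        out_file.write(token_text + "\t" + str(doc_freq) + "\n")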
parser.add_argument('--dataset-files', nargs='+',
                    action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()

#
# load data & create idfs
# -------------------------------
#

vocab_map, vocab_data = FastTextVocab.load_ids(args.fasttext_vocab, max_subwords=40)
fasttext_vocab = FastTextVocab(vocab_map, vocab_data, max_subwords=40)

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

# one idf slot per fasttext vocab entry, initialized to 1
idf = numpy.ones((args.fasttext_size, 1), dtype=numpy.float32)

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):
        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            # (the original excerpt stops at this loop; document-frequency counting
            #  completed below to mirror the vocab-creation script)
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1
        total_documents += 1
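
# A sketch (assumed, not part of the original script): turn the document frequencies into
# smoothed idf values per fasttext id and save them. `fasttext_vocab.get_id` and
# `args.out_file` are hypothetical names; the project's FastTextVocab lookup API is not
# shown in this excerpt.
for token_text, doc_freq in all_tokens.items():
    token_id = fasttext_vocab.get_id(token_text)  # hypothetical lookup
    if token_id is not None and 0 <= token_id < args.fasttext_size:
        idf[token_id] = numpy.log(total_documents / (1 + doc_freq))
numpy.save(args.out_file, idf)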
def multiprocess_single_sequence_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                        _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):
    """
    Worker process: builds a tokenizer, dataset reader and bucketed iterator for a single
    input file and streams the resulting batches into the shared queue; a final ``None``
    signals that this worker is done.
    """

    # make the workers deterministic
    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))

    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])

        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
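
# A minimal usage sketch (assumed, not from the original file): start one loader process
# per input file, drain batches from the shared queue until every worker has sent its
# ``None`` end marker, then release the workers. `config`, `files`, `handle_batch` and the
# queue size are illustrative, not defined in the original code.
def run_single_sequence_workers(config, files, fasttext_mapping, fasttext_data, handle_batch):
    queue = mp.Queue(120)  # bounded queue size is illustrative
    exit_event = mp.Event()

    workers = []
    for i, file in enumerate(files):
        p = mp.Process(target=multiprocess_single_sequence_loader,
                       args=(i, config, queue, exit_event, file, fasttext_mapping, fasttext_data))
        p.start()
        workers.append(p)

    finished = 0
    while finished < len(workers):
        batch = queue.get()
        if batch is None:          # one end-of-queue marker per worker
            finished += 1
            continue
        handle_batch(batch)        # e.g. run the model forward pass

    exit_event.set()               # shared memory no longer needed -> let the workers exit
    for p in workers:
        p.join()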
                    required=True)

args = parser.parse_args()

max_triples = 5_000_000
max_doc_char_length = 70_000
max_doc_token_length = 800

#
# load data
# -------------------------------
#

collection = {}
collection_length = {}
tokenizer = BlingFireTokenizer()

with open(args.collection_file, "r", encoding="utf8") as collection_file:
    for line in tqdm(collection_file):
        ls = line.split("\t")  # id<\t>text ....
        _id = ls[0]
        max_char_doc = ls[1].rstrip()[:max_doc_char_length]
        collection[_id] = max_char_doc
        collection_length[_id] = len(tokenizer.tokenize(max_char_doc))

queries = {}
with open(args.query_file, "r", encoding="utf8") as query_file:
    for line in tqdm(query_file):
        ls = line.split("\t")  # id<\t>text ....
        _id = ls[0]
        queries[_id] = ls[1].rstrip()
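
# A sketch (assumed continuation, not shown in the original excerpt) of how the loaded
# dictionaries could be used: read query / positive-doc / negative-doc id triples, skip
# documents whose token count exceeds max_doc_token_length, and emit at most max_triples
# text triples. `args.triple_id_file` and `args.out_file_train` are hypothetical argument
# names, and the tab-separated id-triple format is an assumption.
triples_written = 0
with open(args.triple_id_file, "r", encoding="utf8") as id_file, \
     open(args.out_file_train, "w", encoding="utf8") as out_file:
    for line in tqdm(id_file):
        if triples_written >= max_triples:
            break
        q_id, pos_id, neg_id = line.rstrip().split("\t")
        if q_id not in queries or pos_id not in collection or neg_id not in collection:
            continue
        if collection_length[pos_id] > max_doc_token_length or collection_length[neg_id] > max_doc_token_length:
            continue
        out_file.write(queries[q_id] + "\t" + collection[pos_id] + "\t" + collection[neg_id] + "\n")
        triples_written += 1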