parser.add_argument('--vocab-file', action='store', dest='vocab_file',
                    help='vocab directory path', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  
#_token_indexers = {"tokens": FastTextNGramIndexer(20)}
#_token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

loader = IrTripleDatasetReader(lazy=True,
                               # token_indexers=_token_indexers,
                               tokenizer=BlingFireTokenizer())
# alternative tokenizer: WordTokenizer(word_splitter=JustSpacesWordSplitter())
# optional length limits: max_doc_length=200, max_query_length=20, min_doc_length=200, min_query_length=20

instances = loader.read(args.dataset_file)
_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

#vocab_map,vocab_data = FastTextVocab.load_ids(args.vocab_file,20)

#vocab = FastTextVocab(vocab_map, vocab_data,20)

_iterator.index_with(Vocabulary.from_files(args.vocab_file))

with Timer("iterate over all"):
    for i in _iterator(instances, num_epochs=1):
        exit()
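
#
# sketch (not in the original script): the exit() above stops after the first batch,
# so the timer only measures startup plus one batch; a variant like the following would
# time a full pass over the lazily read triples and count the produced batches
#
with Timer("iterate over all batches"):
    batch_count = 0
    for batch in _iterator(instances, num_epochs=1):
        batch_count += 1
    print("total batches:", batch_count)
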
parser.add_argument('--lowercase', action='store', dest='lowercase',
                    # note: argparse's type=bool treats every non-empty string as True
                    type=lambda x: x.lower() == 'true', default=True,
                    help='lowercase the tokens (true/false)', required=False)

parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):

        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1

        total_documents += 1
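
#
# sketch (not in the original excerpt): turning the collected token counts into an
# AllenNLP Vocabulary (as used via Vocabulary.from_files above) and saving it to disk;
# the min_count of 5 and the output directory name are assumptions
#
vocab = Vocabulary(counter={"tokens": all_tokens}, min_count={"tokens": 5})
vocab.save_to_files("created_vocab")  # output directory name is an assumption
print("documents:", total_documents, "unique tokens:", len(all_tokens))
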
parser.add_argument('--dataset-files', nargs='+',
                    action='store',
                    dest='dataset_files',
                    help='file format <id>\t<sequence text>',
                    required=True)

args = parser.parse_args()

#
# load data & create idfs
# -------------------------------
#
vocab_map, vocab_data = FastTextVocab.load_ids(args.fasttext_vocab, max_subwords=40)
fasttext_vocab = FastTextVocab(vocab_map, vocab_data, max_subwords=40)

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

idf = numpy.ones((args.fasttext_size, 1), dtype=numpy.float32)

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):

        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1

        total_documents += 1

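#
# sketch (not in the original excerpt): turning the document-frequency counts into idf
# values; fasttext_vocab is assumed to expose a token -> id lookup (get_id() is a
# hypothetical name, adapt it to the real FastTextVocab API), and the output file name
# is an assumption
#
for token_text, doc_freq in all_tokens.items():
    token_id = fasttext_vocab.get_id(token_text)  # hypothetical lookup
    if token_id is not None and 0 <= token_id < args.fasttext_size:
        idf[token_id] = numpy.log(total_documents / doc_freq)

numpy.save("token_idfs.npy", idf)  # output file name is an assumption
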
def multiprocess_single_sequence_loader(process_number: int, _config,
                                        _queue: mp.Queue,
                                        _wait_for_exit: mp.Event, _local_file,
                                        _fasttext_vocab_cached_mapping,
                                        _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(
            pretrained_model=_config["bert_pretrained_model"],
            do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))

    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {
                "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
            }
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {
                "tokens":
                FastTextNGramIndexer(_config["fasttext_max_subwords"])
            }
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])

        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file),
                                    num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local worker is done
    _wait_for_exit.wait()  # keep this process alive until the shared memory is no longer needed
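
#
# usage sketch (not in the original excerpt): spawning the loader in a worker process and
# draining its queue, assuming mp is torch.multiprocessing and that _config and the local
# file path are defined as above; the fasttext vocab arguments are passed as None, which
# assumes a non-"fasttext" token_embedder_type
#
def run_single_sequence_loader(_config, _local_file):  # hypothetical helper name
    _queue = mp.Queue(maxsize=100)
    _wait_for_exit = mp.Event()
    worker = mp.Process(target=multiprocess_single_sequence_loader,
                        args=(0, _config, _queue, _wait_for_exit, _local_file, None, None))
    worker.start()
    while True:
        batch = _queue.get()
        if batch is None:  # the loader signals the end of its data with None
            break
        # ... evaluate the model on this batch here ...
    _wait_for_exit.set()  # let the worker exit once its shared-memory tensors are no longer needed
    worker.join()
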
                    required=True)

args = parser.parse_args()

max_triples = 5_000_000
max_doc_char_length = 70_000

max_doc_token_length = 800

#
# load data
# -------------------------------
#
collection = {}
collection_length = {}
tokenizer = BlingFireTokenizer()
with open(args.collection_file, "r", encoding="utf8") as collection_file:
    for line in tqdm(collection_file):
        ls = line.split("\t")  # id<\t>text ....
        _id = ls[0]
        max_char_doc = ls[1].rstrip()[:max_doc_char_length]
        collection[_id] = max_char_doc
        collection_length[_id] = len(tokenizer.tokenize(max_char_doc))

queries = {}
with open(args.query_file, "r", encoding="utf8") as query_file:
    for line in tqdm(query_file):
        ls = line.split("\t")  # id<\t>text ....
        _id = ls[0]
        queries[_id] = ls[1].rstrip()
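
#
# quick sanity check (not in the original excerpt) of the loaded collection and queries
#
print("documents loaded:", len(collection))
print("queries loaded:", len(queries))
if collection_length:
    print("avg. document length (tokens):",
          sum(collection_length.values()) / len(collection_length))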