def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer],
    human_prob: float = 1.0,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = WhitespaceTokenizer()
    self._token_indexers = token_indexers
    self._human_prob = human_prob
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer],
    max_sequence_length: int = None,
    human_prob: float = 1.0,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = WhitespaceTokenizer()
    self._max_sequence_length = max_sequence_length
    self._token_indexers = token_indexers
    self._human_prob = human_prob
    self._bert = "bert" in token_indexers
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer],
    add_rationale: bool = False,
    keep_prob: float = 1.0,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = WhitespaceTokenizer()
    self._token_indexers = token_indexers
    self._add_rationale = add_rationale
    self._keep_prob = keep_prob
    self._bert = "bert" in token_indexers
def __init__(
    self,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    source_add_start_token: bool = False,
    delimiter: str = "\t",
    source_max_tokens: Optional[int] = None,
    target_max_tokens: Optional[int] = None,
    source_to_target_len_max_ratio: Optional[float] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
    self._delimiter = delimiter
    self._source_max_tokens = source_max_tokens
    self._target_max_tokens = target_max_tokens
    self._source_to_target_len_max_ratio = source_to_target_len_max_ratio
    self._source_ignored = 0
    self._target_ignored = 0
    self._source_target_ratio_ignored = 0
def __init__(
    self,
    tokenizer: Optional[Tokenizer] = None,
    token_indexers: Optional[Dict[str, TokenIndexer]] = None,
    lazy: bool = False,
):
    super().__init__(lazy=lazy)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
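The `or` fallbacks above are the standard AllenNLP defaults: whitespace tokenization plus a single-id token indexer. A minimal sketch of what those defaults do (not part of the snippet above; assumes a recent AllenNLP release):

# Illustrative only: behaviour of the default tokenizer/indexer pair.
from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.fields import TextField

tokenizer = WhitespaceTokenizer()
tokens = tokenizer.tokenize("The movie was surprisingly good")
print([t.text for t in tokens])  # ['The', 'movie', 'was', 'surprisingly', 'good']

# The indexer only maps token text to vocabulary ids later, when the field
# is indexed against a Vocabulary.
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})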
def test_load_word_pairs(self):
    ids1, ids2 = load_word_pairs(self.pairs_fname, WhitespaceTokenizer(), self.pairs_vocab, "tokens")
    # first two token IDs reserved for [CLS] and [SEP]
    assert torch.equal(
        torch.tensor([i.item() for i in ids1]),
        torch.arange(2, self.num_pairs + 2, step=2),
    )
    assert torch.equal(
        torch.tensor([i.item() for i in ids2]),
        torch.arange(3, self.num_pairs + 3, step=2),
    )
def __init__(self, model_dir_path, cuda_device=-1):
    self._model_path = os.path.join(model_dir_path, 'segmenter_neural', 'model.tar.gz')
    self._cuda_device = cuda_device
    self.predictor = Predictor.from_path(self._model_path, cuda_device=self._cuda_device)
    self.predictor._tokenizer = WhitespaceTokenizer()
    self._separator = 'U-S'
    self._threshold = 0.5
    self._use_logits = False
    self._symbol_map = SYMBOL_MAP
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             sample: int = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._sample = sample
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
def test_load_words(self):
    ids = load_words(self.singles_fname, WhitespaceTokenizer(), self.singles_vocab, "tokens", all_cases=False)
    # first two token IDs reserved for [CLS] and [SEP]
    assert torch.equal(
        torch.tensor([i.item() for i in ids]),
        torch.arange(2, self.num_singles + 2),
    )
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._targets_tokenizer: Tokenizer
    if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
        self._targets_tokenizer = copy.copy(self._tokenizer)
        self._targets_tokenizer._add_special_tokens = False
    else:
        self._targets_tokenizer = self._tokenizer
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    combine_input_fields: Optional[bool] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
        assert not self._tokenizer._add_special_tokens

    if combine_input_fields is not None:
        self._combine_input_fields = combine_input_fields
    else:
        self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)
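The assertion above requires that a transformer tokenizer be built without special tokens, so the reader can add [CLS]/[SEP] itself when it combines the two input fields. A hedged sketch of a setup that satisfies that check, assuming AllenNLP >= 1.0 (the model name is only an example):

# Illustrative only: a tokenizer/indexer pair that passes the reader's assertion.
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased", add_special_tokens=False)
token_indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}

premise = tokenizer.tokenize("A soccer game with multiple males playing.")
hypothesis = tokenizer.tokenize("Some men are playing a sport.")
# With _combine_input_fields set, the reader joins both sequences and lets the
# tokenizer insert the special tokens in one place.
combined = tokenizer.add_special_tokens(premise, hypothesis)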
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    # temporary hack to avoid adding special tokens to the targets
    self._targets_tokenizer: Tokenizer
    if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
        self._targets_tokenizer = copy.copy(self._tokenizer)
        self._targets_tokenizer._add_special_tokens = False
    else:
        self._targets_tokenizer = self._tokenizer
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_sequence_length: int = None,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    super().__init__()
    self._tokenizer = tokenizer or WhitespaceTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if max_sequence_length is not None:
        self._max_sequence_length: Union[float, Optional[int]] = max_sequence_length
    else:
        self._max_sequence_length = math.inf
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]

    logger.info("Creating SimpleLanguageModelingDatasetReader")
    logger.info("max_sequence_length=%s", max_sequence_length)
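A small sketch of what the constructor above prepares for: each sentence is wrapped with the configured start/end tokens, and lines longer than max_sequence_length would typically be skipped during reading. The "<S>"/"</S>" marker strings and the length limit here are hypothetical examples, not values taken from the snippet:

# Illustrative only: how start_tokens/end_tokens wrap a whitespace-tokenized line.
from allennlp.data.tokenizers import Token, WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
start_tokens = [Token("<S>")]
end_tokens = [Token("</S>")]
max_sequence_length = 400

sentence = "the cat sat on the mat"
tokens = start_tokens + tokenizer.tokenize(sentence) + end_tokens
if len(tokens) <= max_sequence_length:
    pass  # the reader would build a language-modeling instance from `tokens` here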
class BaseReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._token_indexers = token_indexers
        self._human_prob = human_prob

    @overrides
    def _read(self, file_path):
        rs = RandomState(seed=1000)
        with open(cached_path(file_path), "r") as data_file:
            for _, line in enumerate(data_file.readlines()):
                items = json.loads(line)
                document = items["document"]
                annotation_id = items["annotation_id"]
                query = items.get("query", None)
                label = items.get("label", None)
                # keep the human rationale for a human_prob fraction of examples
                rationale = items.get("rationale", []) if rs.random_sample() < self._human_prob else []
                if label is not None:
                    label = str(label).replace(" ", "_")
                instance = self.text_to_instance(
                    annotation_id=annotation_id,
                    document=document,
                    query=query,
                    label=label,
                    rationale=rationale,
                )
                yield instance

    @overrides
    def text_to_instance(
        self,
        annotation_id: str,
        document: str,
        query: str = None,
        label: str = None,
        rationale: List[tuple] = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        document_tokens = [to_token(t.text) for t in self._tokenizer.tokenize(document)]
        human_rationale_labels = [0] * len(document_tokens)
        for s, e in rationale:
            for i in range(s, e):
                human_rationale_labels[i] = 1

        if query is not None:
            query_tokens = [to_token(t.text) for t in self._tokenizer.tokenize(query)]
        else:
            query_tokens = []

        for index_name, indexer in self._token_indexers.items():
            if hasattr(indexer, "add_token_info"):
                indexer.add_token_info(document_tokens, index_name)
                indexer.add_token_info(query_tokens, index_name)

        fields["document"] = MetadataField({"tokens": document_tokens, "reader_object": self})
        fields["query"] = MetadataField({"tokens": query_tokens})
        fields["rationale"] = ArrayField(np.array(human_rationale_labels))

        metadata = {
            "annotation_id": annotation_id,
            "human_rationale": rationale,
            "document": document,
            "label": label,
        }
        if query is not None:
            metadata["query"] = query
        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)

    def convert_tokens_to_instance(self, tokens: List[Token]):
        fields = {}
        # join document and query tokens with a [DQSEP] separator when a query is present
        tokens = tokens[0] + (([to_token("[DQSEP]")] + tokens[1]) if len(tokens[1]) > 0 else [])
        fields["document"] = TextField(tokens, self._token_indexers)
        return Instance(fields)

    def convert_documents_to_batch(
        self, documents: List[Tuple[List[Token], List[Token]]], vocabulary
    ) -> Dict[str, Any]:
        batch = Batch([self.convert_tokens_to_instance(tokens) for tokens in documents])
        batch.index_instances(vocabulary)
        batch = batch.as_tensor_dict()
        return batch["document"]

    def combine_document_query(self, document: List[MetadataField], query: List[MetadataField], vocabulary):
        document_tokens = [(x["tokens"], y["tokens"]) for x, y in zip(document, query)]
        return self.convert_documents_to_batch(document_tokens, vocabulary)
class RationaleReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        max_sequence_length: int = None,
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._max_sequence_length = max_sequence_length
        self._token_indexers = token_indexers
        self._human_prob = human_prob
        self._bert = "bert" in token_indexers

    @overrides
    def _read(self, file_path):
        rs = RandomState(seed=1000)
        with open(cached_path(file_path), "r") as data_file:
            for _, line in enumerate(data_file.readlines()):
                items = json.loads(line)
                document = items["document"]
                query = items.get("query", None)
                label = items.get("label", None)
                rationale = items.get("rationale", [])
                annotation_id = items["annotation_id"]
                if label is not None:
                    label = str(label).replace(' ', '_')
                # drop the human rationale for a (1 - human_prob) fraction of examples
                if rs.random_sample() > self._human_prob:
                    rationale = -1
                instance = self.text_to_instance(
                    annotation_id=annotation_id,
                    document=document,
                    query=query,
                    label=label,
                    rationale=rationale,
                )
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(
        self,
        annotation_id: str,
        document: str,
        query: str = None,
        label: str = None,
        rationale: List[tuple] = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        # prepend a sentence-start token that is always kept
        tokens = [Token("<S>")]
        keep_tokens = [1]

        word_tokens = self._tokenizer.tokenize(document)
        rationale_tokens = [0] * len(word_tokens)
        if rationale != -1:
            for s, e in rationale:
                for i in range(s, e):
                    rationale_tokens[i] = 1

        tokens.extend(word_tokens)
        keep_tokens.extend([0 for _ in range(len(word_tokens))])
        rationale_tokens = [0] + rationale_tokens

        if query is not None:
            if self._bert:
                # for BERT, append the query after a [SEP] and always keep it
                query_tokens = self._tokenizer.tokenize(query)
                tokens += [Token('[SEP]')] + query_tokens
                keep_tokens += [1 for _ in range(len(query_tokens) + 1)]
                rationale_tokens += [1] * (len(query_tokens) + 1)
            else:
                fields["query"] = TextField(self._tokenizer.tokenize(query), self._token_indexers)

        fields["document"] = TextField(tokens, self._token_indexers)

        # rationale labels must line up one-to-one with the document tokens
        assert len(rationale_tokens) == len(tokens)
        fields['rationale'] = SequenceLabelField(rationale_tokens, fields['document'], 'rationale_labels')

        metadata = {
            "annotation_id": annotation_id,
            "tokens": tokens,
            "keep_tokens": keep_tokens,
            "document": document,
            "query": query,
            "convert_tokens_to_instance": self.convert_tokens_to_instance,
            "label": label,
        }
        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)

    def convert_tokens_to_instance(self, tokens):
        fields = {}
        fields["document"] = TextField(tokens, self._token_indexers)
        return Instance(fields)