def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             unrestricted_tokenizer: Tokenizer = None,
             segment_sentences: bool = False,
             sequence_length: int = None,
             ignore_labels: bool = False,
             skip_label_indexing: bool = False,
             sample: int = None,
             unlabeled_data_path: str = None,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._unrestricted_tokenizer = unrestricted_tokenizer
    self._sample = sample
    self._segment_sentences = segment_sentences
    self._sequence_length = sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._unlabeled_data_path = unlabeled_data_path
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
        self, lazy, max_sequence_length):
    reader = MultiLabelTextClassificationJsonReader(
        lazy=lazy, segment_sentences=True, max_sequence_length=max_sequence_length)
    reuters_path = Path("tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
    instances = reader.read(reuters_path)
    instances = ensure_list(instances)

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    text1 = (
        "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
        " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
        " soon have new powers against countries not offering reciprocal access to their"
        " markets.")
    instance1 = {"text": text1, "labels": ["acq", "trade"]}
    text2 = (
        "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
        " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
        " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
        " from the previous year at 91.09 mln cubic feet.")
    instance2 = {"text": text2, "labels": ["nat-gas", "crude"]}
    text3 = (
        "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
        " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
        " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
        " chairman since 1985, will remain a board member.")
    instance3 = {"text": text3, "labels": ["sugar", "cocoa", "coffee"]}

    for instance in [instance1, instance2, instance3]:
        sentences = splitter.split_sentences(instance["text"])
        tokenized_sentences: List[List[str]] = []
        for sentence in sentences:
            tokens = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                tokens = tokens[:max_sequence_length]
            tokenized_sentences.append(tokens)
        instance["tokens"] = tokenized_sentences

    assert len(instances) == 3

    fields = instances[0].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance1["tokens"]
    assert fields["labels"].labels == instance1["labels"]

    fields = instances[1].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance2["tokens"]
    assert fields["labels"].labels == instance2["labels"]

    fields = instances[2].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance3["tokens"]
    assert fields["labels"].labels == instance3["labels"]
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    text_key: str = "text",
    label_key: str = "label",
    **kwargs,
) -> None:
    super().__init__(
        manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._text_key = text_key
    self._label_key = label_key
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
        self, lazy: bool, label_name: str, max_sequence_length: Optional[int]):
    reader = TextSentimentReader(lazy=lazy,
                                 segment_sentences=True,
                                 label_name=label_name,
                                 max_sequence_length=max_sequence_length)
    ag_path = Path(DATA_DIR, 'ag_news_corpus_original.jsonl')
    if label_name == 'text_sentiment':
        ag_path = Path(DATA_DIR, 'ag_news_corpus.jsonl')
    instances = reader.read(ag_path)
    instances = ensure_list(instances)

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
             "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
             "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
             "at home to Memphis last season is still a painful memory "
             "for the Cardinals.")
    instance1 = {"text": text1, "label": "2"}
    text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
             " Giants' starting quarterback.")
    instance2 = {"text": text2, "label": "2"}
    text3 = ("A conference dedicated to online journalism explores the "
             "effect blogs have on news reporting. Some say they draw "
             "attention to under-reported stories. Others struggle to "
             "establish the credibility enjoyed by professionals.")
    instance3 = {"text": text3, "label": "4"}

    for instance in [instance1, instance2, instance3]:
        sentences = splitter.split_sentences(instance['text'])
        tokenized_sentences: List[List[str]] = []
        for sentence in sentences:
            tokens = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                tokens = tokens[:max_sequence_length]
            tokenized_sentences.append(tokens)
        instance["tokens"] = tokenized_sentences

    assert len(instances) == 3

    fields = instances[0].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance1["tokens"]
    assert fields["label"].label == instance1["label"]

    fields = instances[1].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance2["tokens"]
    assert fields["label"].label == instance2["label"]

    fields = instances[2].fields
    text = [[token.text for token in sentence.tokens] for sentence in fields["tokens"]]
    assert text == instance3["tokens"]
    assert fields["label"].label == instance3["label"]
def __init__(self,
             lazy: bool = True,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             split_sentence_in_doc: bool = False):
    super().__init__(lazy)
    self.tokenizer = tokenizer or WordTokenizer()
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    if split_sentence_in_doc:
        self.sentence_splitter = SpacySentenceSplitter()
    else:
        self.sentence_splitter = None
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sentence_splitter = SpacySentenceSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             passage_length_limit: int = None,
             question_length_limit: int = None,
             skip_invalid_examples: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sentence_splitter = SpacySentenceSplitter()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.passage_length_limit = passage_length_limit
    self.question_length_limit = question_length_limit
    self.skip_invalid_examples = skip_invalid_examples
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             segment_sentences: bool = True,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        word_splitter=SpacyWordSplitter(pos_tags=True),
        word_stemmer=PorterStemmer())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if segment_sentences:
        self._segment_sentences = SpacySentenceSplitter()
    self._class_cnt = defaultdict(int)
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             ignore_labels: bool = False,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy,
                     token_indexers=token_indexers,
                     tokenizer=tokenizer,
                     max_sequence_length=max_sequence_length,
                     skip_label_indexing=skip_label_indexing)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
    self.label_order = [
        'External', 'Cardio', 'Cancer', 'Stroke', 'TB/AIDS', 'Other NCD',
        'Other Comm', 'Pneumonia', 'Renal', 'Maternal', 'Diabetes', 'Liver'
    ]
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
class DoGDatasetReader(DatasetReader):
    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 split_sentence_in_doc: bool = False):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WordTokenizer()
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        if split_sentence_in_doc:
            self.sentence_splitter = SpacySentenceSplitter()
        else:
            self.sentence_splitter = None

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(os.path.join(os.path.split(file_path)[0], 'documents.json'), 'r') as doc_file:
            doc_json = json.load(doc_file)
            doc_field_dict = self.get_doc_field_dict(doc_json)
        with open(file_path, 'r') as data_file:
            for line in data_file:
                line = line.strip()
                dialog_json = json.loads(line)
                yield self.text_to_instance(dialog_json['dialogue'],
                                            doc_field_dict[dialog_json['docId']],
                                            dialog_json['whoSawDoc'])

    @overrides
    def text_to_instance(self, dialogs: List[str], doc_field: Field, who_saw_doc: int):
        tokenized_dialogs = [self.tokenizer.tokenize(dialog) for dialog in dialogs]
        for tokenized_dialog in tokenized_dialogs:
            tokenized_dialog.insert(0, Token(START_SYMBOL))
            tokenized_dialog.append(Token(END_SYMBOL))
        dialogue_field = ListField([TextField(tokenized_dialog, self.token_indexers)
                                    for tokenized_dialog in tokenized_dialogs])
        # who_saw_doc_field = MetadataField(who_saw_doc)
        # return Instance({'dialogue': dialogue_field, 'document': doc_field, 'who_saw_doc': who_saw_doc_field})
        return Instance({'dialogue': dialogue_field, 'document': doc_field})

    def get_doc_field_dict(self, doc_json: Dict) -> Dict[int, Field]:
        doc_field_dict = {}
        for idx, doc in doc_json.items():
            if self.sentence_splitter is not None:
                doc_sentence_list: List[str] = []
                for i in ('0', '1', '2', '3'):
                    doc_sentence_list.extend(self.sentence_splitter.split_sentences(doc[i]))
                tokenized_doc_sentence_list = [self.tokenizer.tokenize(doc_sequence)
                                               for doc_sequence in doc_sentence_list]
                doc_field = ListField([TextField(tokenized_doc_sentence, self.token_indexers)
                                       for tokenized_doc_sentence in tokenized_doc_sentence_list])
            else:
                doc_sequence = ' '.join(doc[i] for i in ('0', '1', '2', '3'))
                tokenized_doc = self.tokenizer.tokenize(doc_sequence)
                doc_field = TextField(tokenized_doc, self.token_indexers)
            doc_field_dict[int(idx)] = doc_field
        return doc_field_dict
class TWTCDatasetReader(DatasetReader):
    """
    Reads a JSON file from the TWTC dataset.

    Expected format for each input line: {"report": "text", "label": "int"}

    The output of ``read`` is a list of ``Instance`` s with the fields:
        text: ``TextField``
        label: ``LabelField``

    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to
        fit in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the title and abstract into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to
        ``{"tokens": SingleIdTokenIndexer()}``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))

    @overrides
    def _read(self, file_path):
        file_path = cached_path(file_path)
        data = pd.read_json(file_path, lines=True, orient='records')[['text', 'label']].values
        for text, label in data:
            assert isinstance(label, int)
            inst = self.text_to_instance(text, str(label))
            yield inst

    @overrides
    def text_to_instance(self, document: str, label: str = None) -> Instance:
        sentences: List[str] = self._sentence_splitter.split_sentences(document)
        # Materialize the per-sentence token lists (the original generator expression did
        # not match its ``List`` annotation and could only be consumed once).
        tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sentences]
        fields = {
            'tokens': ListField(
                [TextField(s, self._token_indexers) for s in tokenized_sents])
        }
        if label:
            fields['label'] = LabelField(int(label), skip_indexing=True)
        return Instance(fields)
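A minimal usage sketch of the reader above. This is hedged: the sample report text and label are hypothetical, and it assumes the legacy AllenNLP environment the class targets (with ``WordTokenizer``, ``SpacySentenceSplitter``, and the ``en_core_web_sm`` spaCy model installed).

# Hedged usage sketch for TWTCDatasetReader; the report text and label are made up.
reader = TWTCDatasetReader()
instance = reader.text_to_instance(
    "The outfielder shows plus raw power. His hit tool still lags behind.",
    label="1")

# Because the reader sentence-splits before tokenizing, 'tokens' is a ListField
# holding one TextField per sentence.
sentence_fields = instance.fields["tokens"]
print(len(sentence_fields.field_list))   # expected: 2 sentences
print(instance.fields["label"].label)    # expected: 1 (skip_indexing keeps the int)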
def entity_extraction_wikihop(args):
    predictor_conll = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
    predictor_onto_note = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz")
    sentence_splitter = SpacySentenceSplitter(rule_based=True)

    with open(args.path, 'r') as f:
        data = json.load(f)

    for d in tqdm(data):
        golden_ners = []
        passage = []
        question = d['query'].strip().replace("\n", "")
        question_entity = " ".join(question.split()[1:])
        question = " ".join(question.split("_"))
        for para in d['supports']:
            sentences = sentence_splitter.split_sentences(para)
            para_ners = []
            outputs_conll = predictor_conll.predict_batch_raw(sentences)
            outputs_onto_note = predictor_onto_note.predict_batch_raw(sentences)
            for out1, out2 in zip(outputs_conll, outputs_onto_note):
                entities1 = entity_extraction_(out1['words'], out1['tags'])
                entities2 = entity_extraction_(out2['words'], out2['tags'])
                entities = set(entities1).union(set(entities2))
                # print(entities)
                para_ners.append(list(entities))
            golden_ners.append(para_ners)
            passage.append(sentences)
            # parsing_info.append([title, outputs_conll])
        # print(question)
        # print(question_entity)
        # input()
        d['supports'] = passage
        d['question_entities'] = [question_entity]
        d['ners'] = golden_ners
        d['query'] = question
        # input()

    with open(args.output, 'w') as f:
        json.dump(data, f)
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             max_pieces: int = 512,
             max_count: int = 10,
             max_spans: int = 10,
             max_numbers_expression: int = 2,
             answer_type: List[str] = None,
             use_validated: bool = True,
             wordpiece_numbers: bool = True,
             number_tokenizer: Tokenizer = None,
             sentence_tokenizer: Tokenizer = None,
             custom_word_to_num: bool = True,
             exp_search: str = 'add_sub',
             max_depth: int = 3,
             extra_numbers: List[float] = []):
    super(BertDropReader, self).__init__(lazy)
    self.tokenizer = tokenizer
    self.token_indexers = token_indexers
    self.max_pieces = max_pieces
    self.max_count = max_count
    self.max_spans = max_spans
    self.max_numbers_expression = max_numbers_expression
    self.answer_type = answer_type
    self.use_validated = use_validated
    self.wordpiece_numbers = wordpiece_numbers
    self.number_tokenizer = number_tokenizer or WordTokenizer()
    self.sentence_tokenizer = sentence_tokenizer or SpacySentenceSplitter()
    self.exp_search = exp_search
    self.max_depth = max_depth
    self.extra_numbers = extra_numbers
    self.op_dict = {'+': operator.add, '-': operator.sub,
                    '*': operator.mul, '/': operator.truediv}
    self.operations = list(enumerate(self.op_dict.keys()))
    self.templates = [lambda x, y, z: (x + y) * z,
                      lambda x, y, z: (x - y) * z,
                      lambda x, y, z: (x + y) / z,
                      lambda x, y, z: (x - y) / z,
                      lambda x, y, z: x * y / z]
    self.template_strings = ['(%s + %s) * %s',
                             '(%s - %s) * %s',
                             '(%s + %s) / %s',
                             '(%s - %s) / %s',
                             '%s * %s / %s']
    if custom_word_to_num:
        self.word_to_num = get_number_from_word
    else:
        self.word_to_num = DropReader.convert_word_to_number
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    @pytest.mark.skipif(spacy.__version__ < "2.1",
                        reason="this model changed from 2.0 to 2.1")
    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
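For context on what these tests exercise, here is a short stand-alone sketch of the SpacySentenceSplitter API. The import path shown is the usual AllenNLP one, though it has moved between releases, so treat it as an assumption.

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

# rule_based=True uses spaCy's sentencizer component and needs no dependency parser.
splitter = SpacySentenceSplitter(rule_based=True)

sentences = splitter.split_sentences("This is one sentence. This is another!")
# ['This is one sentence.', 'This is another!']

# batch_split_sentences pipes documents through spaCy in a batch and returns
# one list of sentences per input document.
docs = ["First doc. It has two sentences.", "Second doc has one."]
per_doc_sentences = splitter.batch_split_sentences(docs)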
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy,
                     token_indexers=token_indexers,
                     tokenizer=tokenizer,
                     max_sequence_length=max_sequence_length,
                     skip_label_indexing=skip_label_indexing)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSentenceSplitter, self).setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence -",
                           "yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
class ExampleLoader(object): def __init__(self): self.label_list = None self.sentence_splitter = SpacySentenceSplitter() def get_loss_weights(): # Calculate loss weights as the inverse of label occurrence. loss_weights = {} for label in self.label_list: loss_weights[label] = 0 for ex in train_examples: loss_weights[ex.str_label] += 1 num_examples = len(train_examples) for key in loss_weights: loss_weights[key] = num_examples / loss_weights[key] weights_list = [ float("%3.f" % loss_weights[key]) for key in loader.label_list ] return weights_list def get_text_from_element(self, node): if node.nodeType == node.TEXT_NODE: if node.data.isspace(): return "" else: return node.data.replace("\n", " ") else: text = "" for child in node.childNodes: text += " " + self.get_text_from_element(child) + " " return text def process_node(self, node, events, times, full_text): if node.nodeName == "EVENT": eid = node.attributes['eid'].value cls = node.attributes['class'].value event = Event(eid=eid, cls=cls, sentence=None, pos_in_sentence=None) event.idx_in_doc = len(full_text) events[eid] = event return event if node.nodeName == "TIMEX3": tid = node.attributes['tid'].value type = node.attributes['type'].value time = TimeX3(tid=tid, sentence=None, pos_in_sentence=None) time.idx_in_doc = len(full_text) times[tid] = time return time def get_instances(self, instance_elts, event_instances, events, input_file): for instance in instance_elts: eiid = instance.attributes["eiid"].value eventID = instance.attributes["eventID"].value tense = instance.attributes["tense"].value aspect = instance.attributes["aspect"].value polarity = instance.attributes["polarity"].value pos = instance.attributes["pos"].value if eventID not in events: print(eventID, input_file) continue event = events[eventID] sentence = event.sentence pos_in_sentence = event.pos_in_sentence instance = EventInstance(eiid, event, tense, aspect, polarity, pos, sentence, pos_in_sentence) event_instances[eiid] = instance def parse_node(self, root, events, times, full_text): # print(full_text) for node in root.childNodes: if node.nodeType == node.TEXT_NODE and not node.data.isspace(): text = re.sub(r"\n+", " ", node.data) text = re.sub(r"_", "", node.data) text = re.sub(r"&UR;", "", node.data) text = re.sub(r"&LR;", "", node.data) split_space = text.split() full_text += split_space elif node.nodeName == "TEXT": self.parse_node(node, events, times, full_text) else: el = self.process_node(node, events, times, full_text) text = self.get_text_from_element(node) if el: el.text = text.strip() full_text += text.split() def get_full_text_to_sentences(self, full_text, sentences): split_sentences = [s.split() for s in sentences] def next_position(split_sentences, sent_num, sent_idx): cur_sent = split_sentences[sent_num] if sent_idx < len(cur_sent) - 1: sent_idx += 1 else: sent_idx = 0 sent_num += 1 if sent_num < len(split_sentences): cur_sent = split_sentences[sent_num] return sent_num, sent_idx split_sentences = [s.split() for s in sentences] full_text_to_sentences = [] sent_num = 0 sent_idx = 0 for i, tok in enumerate(full_text): sent_tok = split_sentences[sent_num][sent_idx] # print(tok, sent_tok) assert tok.startswith( sent_tok), str(i) + " " + tok + " " + sent_tok + "\n" + str( split_sentences[sent_num]) full_text_to_sentences.append(tuple([sent_num, sent_idx])) while len(tok) > len(sent_tok): tok = tok[len(sent_tok):] sent_num, sent_idx = next_position(split_sentences, sent_num, sent_idx) sent_tok = split_sentences[sent_num][sent_idx] # print("WHILE", tok, sent_tok) 
assert tok.startswith(sent_tok), str( i) + " " + tok + " " + sent_tok + "\n" + str( split_sentences[sent_num]) # print(tok) sent_num, sent_idx = next_position(split_sentences, sent_num, sent_idx) return full_text_to_sentences def convert_doc_idx_to_sentences(self, sentences, full_text_to_sentences, its): for key, obj in its.items(): idx = obj.idx_in_doc sentence, pos_in_sentence = full_text_to_sentences[idx] # print(idx, sentence, pos_in_sentence) text = sentences[sentence].split()[pos_in_sentence] assert text == obj.text.split()[0], text + " " + obj.text obj.sentence = sentence obj.pos_in_sentence = pos_in_sentence def read_file(self, input_file): """ Parameters ---------- input_file: str, path to input file Returns ------- TimeMLFile containing sentences, events, eventInstances, times, and tlinks. """ doc = dom.parse(input_file) root = doc.childNodes[0] events = {} times = {} full_text = [] self.parse_node(root, events, times, full_text) # print(full_text) sentences = self.sentence_splitter.split_sentences(" ".join(full_text)) full_text_to_sentences = self.get_full_text_to_sentences( full_text, sentences) self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences, events) self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences, times) event_instances = {} instanceElts = root.getElementsByTagName("MAKEINSTANCE") self.get_instances(instanceElts, event_instances, events, input_file) tlinks = [] tlinkElts = root.getElementsByTagName("TLINK") for tlinkElt in tlinkElts: if tlinkElt.hasAttribute("relatedToEventInstance") and \ tlinkElt.hasAttribute("eventInstanceID"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value eiid = tlinkElt.attributes["eventInstanceID"].value relatedToEventInstance = tlinkElt.attributes[ "relatedToEventInstance"].value if eiid not in event_instances or relatedToEventInstance not in event_instances: continue tlink = Tlink(lid, relType, event_instances[eiid], event_instances[relatedToEventInstance]) tlinks.append(tlink) if tlinkElt.hasAttribute("eventInstanceID") and \ tlinkElt.hasAttribute("relatedToTime"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value eiid = tlinkElt.attributes["eventInstanceID"].value relatedToTime = tlinkElt.attributes["relatedToTime"].value if eiid not in event_instances or relatedToTime not in times: continue tlink = Tlink(lid, relType, event_instances[eiid], times[relatedToTime]) tlinks.append(tlink) if tlinkElt.hasAttribute("timeID") and \ tlinkElt.hasAttribute("relatedToEventInstance"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value tid = tlinkElt.attributes["timeID"].value eiid = tlinkElt.attributes["relatedToEventInstance"].value if tid not in times or eiid not in event_instances: continue tlink = Tlink(lid, relType, times[tid], event_instances[eiid]) tlinks.append(tlink) if tlinkElt.hasAttribute("timeID") and \ tlinkElt.hasAttribute("relatedToTime"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value tid = tlinkElt.attributes["timeID"].value relatedToTime = tlinkElt.attributes["relatedToTime"].value if tid not in times or relatedToTime not in times: continue tlink = Tlink(lid, relType, times[tid], times[relatedToTime]) tlinks.append(tlink) return TimeMLFile(sentences, events, event_instances, times, tlinks, input_file) def read_examples(self, input_file): file_data = self.read_file(input_file) examples = [] for tlink in file_data.tlinks: #print(tlink.lid, tlink.relType, 
tlink.e1, tlink.e2) sent1 = tlink.e1.sentence sent2 = tlink.e2.sentence #print(sent1, sent2) example = None if sent1 >= len(file_data.sentences) or sent2 >= len( file_data.sentences): continue if sent1 == sent2: text = file_data.sentences[sent1] example = TimeMLExample(text, tlink.e1.pos_in_sentence, tlink.e2.pos_in_sentence, tlink.relType) elif sent1 < sent2: sents = file_data.sentences[sent1:sent2 + 1] text = " [SEP] ".join(sents) e1_pos = tlink.e1.pos_in_sentence e2_pos = sum([len(s.split()) + 1 for s in sents[:-1]]) + tlink.e2.pos_in_sentence example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType) elif sent1 > sent2: sents = file_data.sentences[sent2:sent1 + 1] text = " [SEP] ".join(sents) e1_pos = sum([len(s.split()) + 1 for s in sents[:-1]]) + tlink.e1.pos_in_sentence e2_pos = tlink.e2.pos_in_sentence example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType) if example: examples.append(example) #print(example) return examples def antithetics(self, all_examples): new_exs = [] for ex in all_examples: new_ex = None if ex.str_label == "AFTER": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "BEFORE") new_ex.int_label = self.label_list.index("BEFORE") new_exs.append(new_ex) if ex.str_label == "BEFORE": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "AFTER") new_ex.int_label = self.label_list.index("AFTER") new_exs.append(new_ex) if ex.str_label == "DURING": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "DURING") new_ex.int_label = self.label_list.index("DURING") new_exs.append(new_ex) if new_ex != None: new_ex.sentences = ex.sentences new_ex.e1_sentence_num = ex.e2_sentence_num new_ex.e1_sentence_pos = ex.e2_sentence_pos new_ex.e2_sentence_num = ex.e1_sentence_num new_ex.e2_sentence_pos = ex.e1_sentence_pos all_examples.extend(new_exs) def assign_num_labels(self, all_examples): if not self.label_list: labels = set() for ex in all_examples: labels.add(ex.str_label) labels = list(labels) labels.sort() print(labels) print(len(labels)) self.label_list = labels for ex in all_examples: ex.int_label = self.label_list.index(ex.str_label) def read_examples_from_directory(self, dir_path): #os.chdir(dir_path) examples_list = [] for file in glob.glob(dir_path + "*.tml"): #file_path = dir_path + file examples = self.read_examples(file) examples_list.append(examples) all_examples = list(itertools.chain.from_iterable(examples_list)) #antithetics(all_examples) print(len(all_examples)) self.assign_num_labels(all_examples) return all_examples def read_example_files(self, dir_path): all_files = glob.glob(dir_path + "*.tml") train_files = all_files[:-4] dev_files = all_files[-4:] train_examples_list = [] for file in train_files: examples = self.read_examples(file) train_examples_list.append(examples) train = list(itertools.chain.from_iterable(train_examples_list)) dev_examples_list = [] for file in dev_files: examples = self.read_examples(file) dev_examples_list.append(examples) dev = list(itertools.chain.from_iterable(dev_examples_list)) self.assign_num_labels(train + dev) return train, dev def read_dense_examples(self, td_path, extra=False, window_size=None): class DenseExample(object): def __init__(self, file_name, e1, e2, label): self.file_name = file_name self.e1 = e1 self.e2 = e2 self.label = self.parse_label(label) def parse_label(self, label): labels = { "a": "AFTER", "b": "BEFORE", "i": "INCLUDES", "ii": "IS_INCLUDED", "s": "SIMULTANEOUS", "v": "VAGUE" } return labels[label] DEV_DOCS = { "APW19980227.0487", "CNN19980223.1130.0960", "NYT19980212.0019", 
"PRI19980216.2000.0170", "ed980111.1130.0089" } TEST_DOCS = { "APW19980227.0489", "APW19980227.0494", "APW19980308.0201", "APW19980418.0210", "CNN19980126.1600.1104", "CNN19980213.2130.0155", "NYT19980402.0453", "PRI19980115.2000.0186", "PRI19980306.2000.1675" } files_to_exs = {} f = open(td_path, "r") for line in f.readlines(): split = line.split() ex = DenseExample(split[0], split[1], split[2], split[3]) if ex.file_name not in files_to_exs: files_to_exs[ex.file_name] = [ex] else: files_to_exs[ex.file_name].append(ex) files = set(files_to_exs.keys()) train_files = files - DEV_DOCS - TEST_DOCS dev_files = DEV_DOCS train_examples = [] for file_name in train_files: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name + ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: train_examples.append(example) self.assign_num_labels(train_examples) dev_examples = [] for file_name in dev_files: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name + ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: dev_examples.append(example) self.assign_num_labels(dev_examples) return train_examples, dev_examples def read_dense_test_examples(self, td_path, extra=False, window_size=None): class DenseExample(object): def __init__(self, file_name, e1, e2, label): self.file_name = file_name self.e1 = e1 self.e2 = e2 self.label = self.parse_label(label) def parse_label(self, label): labels = { "a": "AFTER", "b": "BEFORE", "i": "INCLUDES", "ii": "IS_INCLUDED", "s": "SIMULTANEOUS", "v": "VAGUE" } return labels[label] TEST_DOCS = { "APW19980227.0489", "APW19980227.0494", "APW19980308.0201", "APW19980418.0210", "CNN19980126.1600.1104", "CNN19980213.2130.0155", "NYT19980402.0453", "PRI19980115.2000.0186", "PRI19980306.2000.1675" } files_to_exs = {} f = open(td_path, "r") for line in f.readlines(): split = line.split() ex = DenseExample(split[0], split[1], split[2], split[3]) if ex.file_name not in files_to_exs: files_to_exs[ex.file_name] = [ex] else: files_to_exs[ex.file_name].append(ex) test_examples = [] for file_name in TEST_DOCS: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name + ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: test_examples.append(example) self.assign_num_labels(test_examples) return test_examples def read_tempeval3_examples(): return None, None
def setUp(self):
    super(TestSentenceSplitter, self).setUp()
    self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
    self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
for named_entity_category, words in aggregated_named_entities.items():
    word_counts = Counter(words)
    total_sum = sum(word_counts.values())
    word_frequencies = [(word, float(frequency) / total_sum)
                        for word, frequency in word_counts.items()]
    named_entities_frequency_table[named_entity_category] = word_frequencies

with open(filename, 'w') as f:
    json.dump(named_entities_frequency_table, f)

instances = create_nabert_reader(
    data_path='../../data/drop_dataset/drop_dataset_train.json')
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
sentences_splitter = SpacySentenceSplitter()
named_entities = defaultdict(list)

with torch.no_grad():
    for instance_idx, instance in enumerate(instances):
        original_question = instance.fields['metadata'].metadata['original_question']
        original_passage = instance.fields['metadata'].metadata['original_passage']
        aggregate_named_entities(original_question, named_entities)
        # NER tagger is more accurate when single sentences are fed as input
        passage_sentences = sentences_splitter.split_sentences(original_passage)
        for passage_sentence in passage_sentences:
class TestSentenceSplitter(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_to_params(self):
        params = self.dep_parse_splitter.to_params()
        assert isinstance(params, Params)
        assert params.params == {
            "type": "spacy",
            "language": self.dep_parse_splitter._language,
            "rule_based": self.dep_parse_splitter._rule_based,
        }
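Since test_to_params above reports the registered name "spacy", the splitter can presumably also be built from configuration through AllenNLP's registrable/from_params machinery. A hedged sketch (the exact import paths and from_params behavior are assumptions based on that test):

from allennlp.common import Params
from allennlp.data.tokenizers.sentence_splitter import SentenceSplitter

# Construct the same splitter from a config dict instead of calling the class directly.
splitter = SentenceSplitter.from_params(
    Params({"type": "spacy", "rule_based": True}))
print(splitter.split_sentences("One sentence here. Another one there."))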
        number = word_to_num(word)
    except ValueError:
        try:
            number = int(word)
        except ValueError:
            try:
                number = float(word)
            except ValueError:
                number = None
    return number


tokenizer = BertDropTokenizer(pretrained_model="bert-base-uncased")
number_tokenizer = WordTokenizer()
words_splitter = WordTokenizer()
sentences_splitter = SpacySentenceSplitter()
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
pos_tagger = span_based_constituency_parsing_with_elmo_joshi_2018()


def extract_letters_frequency(passage, sentence_idx=None):
    """
    :param passage:
    :param sentence_idx: None for whole passage, else per sentence (index 0)..
    :return:
    """
    if sentence_idx is None:
        return dict(filter(lambda k: k[0].isalpha(), Counter(passage).items()))
    else:
        sentences = extract_sentences(passage)
        sen = sentences[sentence_idx]
async def create_dataset_db(dataset_path: str,
                            db_discriminator: str,
                            file_path: str,
                            use_existing_database=True,
                            sentence_splitter: SentenceSplitter = SpacySentenceSplitter(),
                            should_save_sentiment: bool = True,
                            ner_model: str = True,
                            coreference_model: str = True,
                            batch_size: int = 100,
                            max_workers: int = 16,
                            marked_sentences=False,
                            cuda_device: Union[List[int], int] = None) -> str:
    file_name = os.path.basename(file_path)
    database_file = f"{dataset_path}/{file_name}_{db_discriminator}.db"
    dataset_db = f"sqlite:///{database_file}"
    logging.info(f"Cached dataset path: {dataset_db}")

    # Create dir
    try:
        os.makedirs(dataset_path)
    except OSError:
        pass

    # Remove database if it shouldn't be reused.
    if not use_existing_database:
        try:
            os.remove(database_file)
        except OSError:
            pass

    if not Path(database_file).is_file():
        loop = asyncio.get_event_loop()

        with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:
            # Create the main tables and columns that need indexing.
            story_table = db.create_table('story')
            story_table.create_column('story_num', db.types.integer)
            story_table.create_index(['story_num'])

            sentence_table = db.create_table('sentence')
            sentence_table.create_column('story_id', db.types.bigint)
            sentence_table.create_column('sentence_num', db.types.integer)
            sentence_table.create_column('sentence_len', db.types.integer)
            sentence_table.create_column('start_span', db.types.integer)
            sentence_table.create_column('end_span', db.types.integer)

            # Indices created at the beginning, as creating them later causes other
            # processes to fail when a large index is locking the database.
            sentence_table.create_index(['story_id'])
            sentence_table.create_index(['start_span'])
            sentence_table.create_index(['end_span'])

            db.query("PRAGMA journal_mode=WAL;")

            create_story_tasks = []
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                async for lines, story_nums in chunk_stories_from_file(file_path, batch_size=batch_size):
                    story_ids = [id for id in list(db['story'].insert(dict(story_num=story_num))
                                                   for story_num in story_nums)]
                    if marked_sentences:
                        sentence_splitter = MarkerSentenceSplitter()
                    create_story_tasks.append(
                        loop.run_in_executor(executor, ProcessStory(sentence_splitter),
                                             lines, story_ids))

                for i, t in enumerate(asyncio.as_completed(create_story_tasks)):
                    story_ids, sentences_to_save, story_metrics = await t
                    db["sentence"].insert_many(sentences_to_save)
                    for m in story_metrics:
                        db["story"].update(m, ['id'])
                    print(f"Batch {i} - stories text saved: {story_ids}")
                    logger.info(f"Saved stories to db with ids: {story_ids}")

                await save_language_features(batch_size, dataset_db, executor, loop)

                if should_save_sentiment:
                    await save_sentiment(batch_size, dataset_db, executor, loop)

                if ner_model:
                    await save_ner(ner_model, batch_size, dataset_db, cuda_device=cuda_device)

                if coreference_model:
                    await save_coreferences(coreference_model, dataset_db, cuda_device=cuda_device)

    return dataset_db
class TextCatReader(DatasetReader): """ Reads tokens and their labels from a labeled text classification dataset. Expects a "tokens" field and a "category" field in JSON format. The output of ``read`` is a list of ``Instance`` s with the fields: tokens: ``TextField`` and label: ``LabelField`` Parameters ---------- token_indexers : ``Dict[str, TokenIndexer]``, optional optional (default=``{"tokens": SingleIdTokenIndexer()}``) We use this to define the input representation for the text. See :class:`TokenIndexer`. tokenizer : ``Tokenizer``, optional (default = ``{"tokens": WordTokenizer()}``) Tokenizer to use to split the input text into words or other kinds of tokens. segment_sentences: ``bool``, optional (default = ``False``) If True, we will first segment the text into sentences using SpaCy and then tokenize words. Necessary for some models that require pre-segmentation of sentences, like the Hierarchical Attention Network. sequence_length: ``int``, optional (default = ``None``) If specified, will truncate tokens to specified maximum length. ignore_labels: ``bool``, optional (default = ``False``) If specified, will ignore labels when reading data, useful for semi-supervised textcat skip_label_indexing: ``bool``, optional (default = ``False``) Whether or not to skip label indexing. You might want to skip label indexing if your labels are numbers, so the dataset reader doesn't re-number them starting from 0. lazy : ``bool``, optional, (default = ``False``) Whether or not instances can be read lazily. """ def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, tokenizer: Tokenizer = None, unrestricted_tokenizer: Tokenizer = None, segment_sentences: bool = False, sequence_length: int = None, ignore_labels: bool = False, skip_label_indexing: bool = False, sample: int = None, unlabeled_data_path: str = None, lazy: bool = False) -> None: super().__init__(lazy=lazy) self._tokenizer = tokenizer or WordTokenizer() self._unrestricted_tokenizer = unrestricted_tokenizer self._sample = sample self._segment_sentences = segment_sentences self._sequence_length = sequence_length self._ignore_labels = ignore_labels self._skip_label_indexing = skip_label_indexing self._token_indexers = token_indexers or { 'tokens': SingleIdTokenIndexer() } self._unlabeled_data_path = unlabeled_data_path if self._segment_sentences: self._sentence_segmenter = SpacySentenceSplitter() def _reservoir_sampling(self, file_): """ reservoir sampling for reading random lines from file without loading entire file into memory See here for explanation of algorithm: https://stackoverflow.com/questions/35680236/select-100-random-lines-from-a-file-with-a-1-million-which-cant-be-read-into-me Parameters ---------- file : `str` - file path sample_size : `int` - size of random sample you want Returns ------- result : `List[str]` - sample lines of file """ file_iterator = iter(file_) try: result = [next(file_iterator) for _ in range(self._sample)] except StopIteration: raise ValueError("Sample larger than population") for index, item in enumerate(file_iterator, start=self._sample): sample_index = np.random.randint(0, index) if sample_index < self._sample: result[sample_index] = item np.random.shuffle(result) return result @overrides def _read(self, file_path): with open(cached_path(file_path), "r") as data_file: if self._sample is not None: lines = [(item, False) for item in self._reservoir_sampling(data_file)] else: lines = [(item, True) for item in data_file.readlines()] if self._unlabeled_data_path: with 
open(cached_path(self._unlabeled_data_path)) as data_file: lines += [(item, False) for item in data_file.readlines()] for line, is_labeled in lines: items = json.loads(line) text = items["tokens"] label = str(items['category']) instance = self.text_to_instance(text=text, label=label, is_labeled=is_labeled) if instance is not None: yield instance def _truncate(self, tokens): """ truncate a set of tokens using the provided sequence length """ if len(tokens) > self._sequence_length: tokens = tokens[:self._sequence_length] return tokens @overrides def text_to_instance(self, text: str, label: str = None, is_labeled: bool = False) -> Instance: # type: ignore """ Parameters ---------- text : ``str``, required. The text to classify label ``str``, optional, (default = None). The label for this text. Returns ------- An ``Instance`` containing the following fields: tokens : ``TextField`` The tokens in the sentence or phrase. label : ``LabelField`` The label label of the sentence or phrase. """ # pylint: disable=arguments-differ fields: Dict[str, Field] = {} if self._segment_sentences: sentences: List[Field] = [] sentence_splits = self._sentence_segmenter.split_sentences(text) for sentence in sentence_splits: word_tokens = self._tokenizer.tokenize(sentence) if self._sequence_length is not None: word_tokens = self._truncate(word_tokens) sentences.append(TextField(word_tokens, self._token_indexers)) fields['tokens'] = ListField(sentences) else: tokens = self._tokenizer.tokenize(text) if self._sequence_length is not None: tokens = self._truncate(tokens) fields['tokens'] = TextField(tokens, self._token_indexers) if self._unrestricted_tokenizer: unrestricted_tokens = self._unrestricted_tokenizer.tokenize( text) if self._sequence_length is not None: unrestricted_tokens = self._truncate(unrestricted_tokens) fields['filtered_tokens'] = TextField(unrestricted_tokens, self._token_indexers) # TODO: Document 'default' unsupervised label as pre-condition. if label is not None: fields['label'] = LabelField( label, skip_indexing=self._skip_label_indexing) fields['metadata'] = MetadataField({"is_labeled": is_labeled}) return Instance(fields)
class ICCDatasetReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = DummyTokenizer()  # assumes our tokens unchanged
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path)) as data_file:
            for line in data_file:
                try:
                    text, label = line.strip().split("\t")
                except ValueError as e:
                    continue
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(self, text: str, label: Union[str, int] = None) -> Instance:
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
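The _read method above expects one tab-separated `text<TAB>label` pair per line. A small hedged sketch of a matching file and call follows; the fixture path and labels are made up, and DummyTokenizer comes from wherever the project above defines it.

# Hypothetical fixture in the tab-separated format ICCDatasetReader._read expects.
with open("/tmp/icc_sample.tsv", "w") as fixture:
    fixture.write("token_a token_b token_c\tpositive\n")
    fixture.write("token_d token_e\tnegative\n")

reader = ICCDatasetReader()                      # DummyTokenizer passes tokens through
instances = list(reader.read("/tmp/icc_sample.tsv"))
print(instances[0].fields["label"].label)        # expected: 'positive'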
def setup_method(self):
    super().setup_method()
    self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
    self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
class IssueReaderSiamese(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but
        will take longer per batch.  This also allows training with datasets that are too
        large to fit in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the sentence into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations.
        Defaults to ``{"tokens": SingleIdTokenIndexer()}``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 segment_sentences: bool = True,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(pos_tags=True),
            word_stemmer=PorterStemmer())
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if segment_sentences:
            self._segment_sentences = SpacySentenceSplitter()
        self._class_cnt = defaultdict(int)

    def read_dataset(self, file_path):
        features = []
        others = []
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                if not line or len(line) == 0:
                    continue
                line = json.loads(line)
                if "id" not in line.keys():
                    d_id = ""
                else:
                    d_id = line['id']
                report = split_issue_template(line['body'])
                report = self._segment_sentences.split_sentences(report)
                cmts = line['comments']
                comments = []
                for comment in cmts:
                    user_name = comment['user']
                    comment = replace_tokens(comment['body'])
                    if len(comment) == 0:
                        continue
                    comments.append((user_name, comment))
                dialog = report + comments
                if len(dialog) == 0:
                    continue
                labels = line['label']
                if len(labels) == 0:
                    label = None
                else:
                    label = "feature" if "feature" in labels or "type: feature" in labels else "other"
                if "feature" == label:
                    features.append((d_id, dialog, label))
                else:
                    others.append((d_id, dialog, label))
        return features, others

    @overrides
    def _read(self, file_path):
        features, others = self.read_dataset(file_path)
        all_data = features + others
        random.shuffle(all_data)
        same_num = 0
        diff_num = 0
        if "unlabel" in file_path:
            logger.info("Begin predict------")
            features, others = self.read_dataset(
                "frmodel/data/{}_target_train.txt")
            for sample in features + others:
                yield self.text_to_instance((sample, sample), is_gold=True)
            for sample in all_data:
                yield self.text_to_instance((sample, sample))
            logger.info(f"Predict sample num is {len(all_data)}")
        else:
            logger.info("Begin training-------")
            iter_num = 1
            if "test" in file_path:
                features, others = self.read_dataset(
                    re.sub("test", "train", file_path))
                iter_num = 1
            for _ in range(iter_num):
                # plain balance data
                if "train" in file_path:
                    for k in range(len(others) - len(features)):
                        all_data.append(random.choice(features))
                for sample in all_data:
                    positive = random.choice(features)
                    negative = random.choice(others)
                    yield self.text_to_instance((sample, positive))
                    yield self.text_to_instance((sample, negative))
                    same_num += 1
                    diff_num += 1
            logger.info(
                f"Dataset Count: Same : {same_num} / Diff : {diff_num}")

    @overrides
    def text_to_instance(self, p, is_gold=False) -> Instance:  # type: ignore
        fields: Dict[str, Field] = {}
        ins1, ins2 = p
        dialog = ListField([
            TextField([word for word in self._tokenizer.tokenize(line[1])],
                      self._token_indexers) for line in ins1[1]
        ])
        fields['dialog1'] = dialog
        fields["pos_tags1"] = ListField([
            SequenceLabelField(
                [word.tag_ for word in self._tokenizer.tokenize(line[1])],
                tokens,
                label_namespace="pos") for line, tokens in zip(ins1[1], dialog)
        ])
        if ins1[-1] is not None and ins2[-1] is not None:
            if ins1[-1] == ins2[-1]:
                fields['label'] = LabelField("same")
            else:
                fields['label'] = LabelField("diff")
            fields['label_tags'] = LabelField("@".join([ins1[-1], ins2[-1]]),
                                              label_namespace="label_tags")
            fields['label'] = LabelField(ins1[-1])
        fields['metadata'] = MetadataField({
            "is_gold": is_gold,
            "pair_instance": p
        })
        return Instance(fields)
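A rough sketch of driving this siamese reader end to end, assuming the pre-1.0 AllenNLP API used above; the file path is a placeholder, and the helpers `split_issue_template` and `replace_tokens` are assumed to be importable from the surrounding project.

reader = IssueReaderSiamese(lazy=False)
# Hypothetical JSON-lines file of GitHub issues with "body", "comments" and "label" keys.
instances = reader.read("frmodel/data/example_train.txt")
for instance in instances[:4]:
    # Each instance pairs a sample with a randomly chosen positive or negative partner;
    # the "label" field is absent when either member of the pair is unlabeled.
    label_field = instance.fields.get("label")
    print(label_field.label if label_field else None,
          instance.fields["metadata"]["is_gold"])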
def setUp(self):
    super(TestSentenceSplitter, self).setUp()
    self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
    self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
def __init__(self):
    self.label_list = None
    self.sentence_splitter = SpacySentenceSplitter()
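A small illustration of the splitter held by this helper. The texts are invented; `batch_split_sentences` is part of AllenNLP's `SentenceSplitter` API in the versions I am aware of, but treat its availability as an assumption for older releases.

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

splitter = SpacySentenceSplitter()
docs = [
    "The build failed. Reverting the last commit fixed it.",
    "Please add a dark mode. It would help at night.",
]
# batch_split_sentences pushes all documents through spaCy's pipe at once,
# which is usually faster than calling split_sentences once per document.
for sentences in splitter.batch_split_sentences(docs):
    print(sentences)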
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "text" field and a "label" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens : ``TextField`` and
        label : ``LabelField``

    # Parameters

    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``SpacyTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then
        tokenize words.  Necessary for some models that require pre-segmentation of
        sentences, like the Hierarchical Attention Network
        (https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf).
    max_sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to the specified maximum length.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing.  You might want to skip label indexing if
        your labels are numbers, so the dataset reader doesn't re-number them starting
        from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self, text: str, label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : ``str``, required.
            The text to classify.
        label : ``str``, optional, (default = None).
            The label for this text.

        # Returns

        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.

    The output of `read` is a list of `Instance` s with the fields:
        tokens : `TextField` and
        label : `LabelField`

    Registered as a `DatasetReader` with name "text_classification_json".

    [0]: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    # Parameters

    token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : `Tokenizer`, optional (default = `SpacyTokenizer()`)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : `bool`, optional (default = `False`)
        If True, we will first segment the text into sentences using SpaCy and then
        tokenize words.  Necessary for some models that require pre-segmentation of
        sentences, like [the Hierarchical Attention Network][0].
    max_sequence_length : `int`, optional (default = `None`)
        If specified, will truncate tokens to the specified maximum length.
    skip_label_indexing : `bool`, optional (default = `False`)
        Whether or not to skip label indexing.  You might want to skip label indexing if
        your labels are numbers, so the dataset reader doesn't re-number them starting
        from 0.
    text_key : `str`, optional (default=`"text"`)
        The key name of the source field in the JSON data file.
    label_key : `str`, optional (default=`"label"`)
        The key name of the target field in the JSON data file.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        text_key: str = "text",
        label_key: str = "label",
        **kwargs,
    ) -> None:
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._text_key = text_key
        self._label_key = label_key
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in self.shard_iterable(data_file.readlines()):
                if not line:
                    continue
                items = json.loads(line)
                text = items[self._text_key]
                label = items.get(self._label_key)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                yield self.text_to_instance(text=text, label=label)

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self, text: str, label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify.
        label : `str`, optional, (default = `None`).
            The label for this text.

        # Returns

        An `Instance` containing the following fields:

        - tokens (`TextField`) :
          The tokens in the sentence or phrase.
        - label (`LabelField`) :
          The label of the sentence or phrase.
        """
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        if self._segment_sentences:
            for text_field in instance.fields["tokens"]:  # type: ignore
                text_field._token_indexers = self._token_indexers
        else:
            instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore