def _parse_token(self, line: str) -> Token:
    """Turn one data line of a column-formatted file into a Token.

    The line is split on ``self.column_delimiter``; the text column becomes
    the token surface form, every other mapped column becomes a label, and
    the special SpaceAfter column ('-') clears ``whitespace_after``.

    :param line: one non-empty line of the column file
    :return: the parsed Token
    """
    fields: List[str] = re.split(self.column_delimiter, line)
    token = Token(fields[self.text_column])

    for column, column_name in self.column_name_map.items():
        # ignore mapped columns that this (possibly short) line does not have
        if len(fields) <= column:
            continue
        if column_name == self.SPACE_AFTER_KEY:
            # '-' in the SpaceAfter column means: no whitespace after this token
            if fields[column] == '-':
                token.whitespace_after = False
        elif column != self.text_column:
            token.add_label(column_name, fields[column])

    return token
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    """Convert a parsed ``conllu.TokenList`` into a flair Sentence.

    Copies the "ner", "ner-2" and "lemma" annotations onto each token where
    present, honours ``SpaceAfter=No`` from the MISC column, and attaches
    sentence-level metadata: the sentence id, relation labels, and one
    "entity" span label per NER span found in any "ner*" layer.

    :param token_list: one sentence as parsed by the conllu library
    :return: the equivalent flair Sentence
    """
    sentence: Sentence = Sentence()

    # NOTE: removed an unused `token_idx` counter and dead commented-out code
    for conllu_token in token_list:
        token = Token(conllu_token["form"])

        if "ner" in conllu_token:
            token.add_label("ner", conllu_token["ner"])

        if "ner-2" in conllu_token:
            token.add_label("ner-2", conllu_token["ner-2"])

        if "lemma" in conllu_token:
            token.add_label("lemma", conllu_token["lemma"])

        # the MISC column may carry "SpaceAfter=No" to signal glued tokens
        if "misc" in conllu_token and conllu_token["misc"] is not None:
            space_after = conllu_token["misc"].get("SpaceAfter")
            if space_after == "No":
                token.whitespace_after = False

        sentence.add_token(token)

    if "sentence_id" in token_list.metadata:
        sentence.add_label("sentence_id", token_list.metadata["sentence_id"])

    if "relations" in token_list.metadata:
        for head_start, head_end, tail_start, tail_end, label in token_list.metadata["relations"]:
            # head and tail span indices are 1-indexed and end index is inclusive
            head = Span(sentence.tokens[head_start - 1 : head_end])
            tail = Span(sentence.tokens[tail_start - 1 : tail_end])
            sentence.add_complex_label("relation", RelationLabel(value=label, head=head, tail=tail))

    # determine all NER label types in sentence and add all NER spans as sentence-level labels
    ner_label_types = []
    for token in sentence.tokens:
        for annotation in token.annotation_layers.keys():
            if annotation.startswith("ner") and annotation not in ner_label_types:
                ner_label_types.append(annotation)

    for label_type in ner_label_types:
        spans = sentence.get_spans(label_type)
        for span in spans:
            sentence.add_complex_label("entity", label=SpanLabel(span=span, value=span.tag, score=span.score))

    return sentence
def __getitem__(self, index: int = 0) -> Sentence:
    """Return the sentence at position ``index``.

    In-memory mode returns the pre-parsed Sentence directly; otherwise the
    CoNLL-U file is re-opened, seeked to the byte offset recorded for this
    sentence (``self.indices``, built in ``__init__``), and the sentence is
    parsed on the fly.
    """
    if self.in_memory:
        sentence = self.sentences[index]
    else:
        with open(str(self.path_to_conll_file), encoding="utf-8") as file:
            # jump straight to the byte offset of this sentence's first line
            file.seek(self.indices[index])
            line = file.readline()

            sentence: Sentence = Sentence()
            while line:
                line = line.strip()
                fields: List[str] = re.split("\t+", line)
                if line == "":
                    # blank line terminates the sentence (once tokens were read)
                    if len(sentence) > 0:
                        break
                elif line.startswith("#"):
                    # comment / metadata line — skip
                    line = file.readline()
                    continue
                elif "." in fields[0]:
                    # decimal ID: CoNLL-U "empty node" — skip
                    line = file.readline()
                    continue
                elif "-" in fields[0]:
                    # range ID (e.g. "1-2"): multiword token line — skip
                    line = file.readline()
                    continue
                else:
                    # regular token line: FORM (col 1) with HEAD (col 6) and
                    # LEMMA/UPOS/XPOS/DEPREL in the standard CoNLL-U columns
                    token = Token(fields[1], head_id=int(fields[6]))
                    token.add_label("lemma", str(fields[2]))
                    token.add_label("upos", str(fields[3]))
                    token.add_label("pos", str(fields[4]))
                    token.add_label("dependency", str(fields[7]))

                    # MISC column may carry "SpaceAfter=No" for glued tokens
                    if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                        token.whitespace_after = False

                    # FEATS column: pipe-separated key=value morphological features
                    for morph in str(fields[5]).split("|"):
                        if "=" not in morph:
                            continue
                        token.add_label(
                            morph.split("=")[0].lower(), morph.split("=")[1]
                        )

                    # NOTE(review): columns 10/11 appear to be a frame-annotation
                    # extension ("Y" flag + frame name) — confirm against the data
                    if len(fields) > 10 and str(fields[10]) == "Y":
                        token.add_label("frame", str(fields[11]))

                    sentence.add_token(token)

                line = file.readline()
    return sentence
def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True):
    """
    Instantiates a column dataset in CoNLL-U format.

    :param path_to_conll_file: Path to the CoNLL-U formatted file
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    :raises FileNotFoundError: if the given path does not exist
    """
    if isinstance(path_to_conll_file, str):
        path_to_conll_file = Path(path_to_conll_file)
    # validate explicitly instead of `assert` (asserts are stripped under -O)
    if not path_to_conll_file.exists():
        raise FileNotFoundError(path_to_conll_file)

    self.in_memory = in_memory
    self.path_to_conll_file = path_to_conll_file
    self.total_sentence_count: int = 0

    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        # byte offsets of each sentence start, for lazy reads in __getitem__
        self.indices: List[int] = []

    with open(str(self.path_to_conll_file), encoding="utf-8") as file:
        line = file.readline()
        position = 0
        sentence: Sentence = Sentence()
        while line:
            line = line.strip()
            fields: List[str] = re.split("\t+", line)
            if line == "":
                # blank line: close off the current sentence (if non-empty)
                if len(sentence) > 0:
                    self.total_sentence_count += 1
                    if self.in_memory:
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        # next sentence starts right after this blank line
                        position = file.tell()
                    sentence = Sentence()
            elif line.startswith("#"):
                # comment / metadata line — skip
                line = file.readline()
                continue
            elif "." in fields[0]:
                # decimal ID: CoNLL-U "empty node" — skip
                line = file.readline()
                continue
            elif "-" in fields[0]:
                # range ID (e.g. "1-2"): multiword token line — skip
                line = file.readline()
                continue
            else:
                # regular token line: FORM (col 1) with HEAD (col 6) and
                # LEMMA/UPOS/XPOS/DEPREL in the standard CoNLL-U columns
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", str(fields[2]))
                token.add_label("upos", str(fields[3]))
                token.add_label("pos", str(fields[4]))
                token.add_label("dependency", str(fields[7]))

                # MISC column may carry "SpaceAfter=No" for glued tokens
                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False

                # FEATS column: pipe-separated key=value morphological features
                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    token.add_label(morph.split("=")[0].lower(), morph.split("=")[1])

                if len(fields) > 10 and str(fields[10]) == "Y":
                    token.add_label("frame", str(fields[11]))

                sentence.add_token(token)

            line = file.readline()

        # flush a trailing sentence not followed by a blank line
        if len(sentence.tokens) > 0:
            self.total_sentence_count += 1
            if self.in_memory:
                self.sentences.append(sentence)
            else:
                self.indices.append(position)
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    """Construct a flair Sentence from a parsed ``conllu.TokenList``.

    Each configured annotation field is copied onto its token (dict-valued
    fields expand into one label per key/value pair), ``SpaceAfter=No`` in
    MISC clears the whitespace flag, and sentence-level metadata (sentence
    id, relations, NER spans) is attached afterwards.

    :param token_list: one sentence as parsed by the conllu library
    :return: the equivalent flair Sentence
    """
    sentence: Sentence = Sentence()

    # Build the sentence tokens and add the annotations.
    for raw_token in token_list:
        token = Token(raw_token["form"])

        for field in self.token_annotation_fields:
            field_value: Any = raw_token[field]
            if isinstance(field_value, dict):
                # For fields that contain key-value annotations,
                # we add the key as label type-name and the value as the label value.
                for key, value in field_value.items():
                    token.add_label(typename=key, value=str(value))
            else:
                token.add_label(typename=field, value=str(field_value))

        misc = raw_token.get("misc")
        if misc is not None:
            space_after: Optional[str] = misc.get("SpaceAfter")
            if space_after == "No":
                token.whitespace_after = False

        sentence.add_token(token)

    metadata = token_list.metadata

    if "sentence_id" in metadata:
        sentence.add_label("sentence_id", metadata["sentence_id"])

    if "relations" in metadata:
        for head_start, head_end, tail_start, tail_end, label in metadata["relations"]:
            # head and tail span indices are 1-indexed and end index is inclusive
            head_span = Span(sentence.tokens[head_start - 1:head_end])
            tail_span = Span(sentence.tokens[tail_start - 1:tail_end])
            sentence.add_complex_label(
                "relation",
                RelationLabel(value=label, head=head_span, tail=tail_span))

    # determine all NER label types in sentence and add all NER spans as sentence-level labels
    seen_ner_types = []
    for tok in sentence.tokens:
        for layer_name in tok.annotation_layers.keys():
            if layer_name.startswith("ner") and layer_name not in seen_ner_types:
                seen_ner_types.append(layer_name)

    for ner_type in seen_ner_types:
        for ner_span in sentence.get_spans(ner_type):
            sentence.add_complex_label(
                "entity",
                label=SpanLabel(span=ner_span, value=ner_span.tag, score=ner_span.score),
            )

    return sentence