def test_parse_CoNLL2009_2(self):
    data = dedent("""\
        #\tid='1'-document_id='36:1047'-span='1'
        1\t+\t+\tPunc\tPunc\t_\t0\tROOT\t_\t_
        2\tIn\tin\tr\tr\tr|-|-|-|-|-|-|-|-\t5\tAuxP\t_\t_
        3\tDei\tDeus\tn\tPropn\tn|-|s|-|-|-|m|g|-\t4\tATR\t_\t_
        4\tnomine\tnomen\tn\tn\tn|-|s|-|-|-|n|b|-\t2\tADV\t_\t_
        5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_
    """)
    sentences = parse(
        data,
        fields=(
            'id', 'form', 'lemma', 'upostag', 'xpostag',
            'feats', 'head', 'deprel', 'deps', 'misc'
        ),
        field_parsers={
            "feats": lambda line, i: [feat for feat in line[i].split("|")]
        }
    )
    self.assertEqual(
        sentences[0][4],
        OrderedDict([
            ('id', 5),
            ('form', 'regnante'),
            ('lemma', 'regno'),
            ('upostag', 't'),
            ('xpostag', 't'),
            ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
            ('head', 0),
            ('deprel', 'ADV'),
            ('deps', None),
            ('misc', None),
        ])
    )
    self.assertEqual(
        sentences[0].metadata,
        OrderedDict([
            ('id', "'1'-document_id='36:1047'-span='1'")
        ])
    )
def __parse_post_lev(files: List[str]):
    result_docs = []
    for fname in files:
        with open(fname, 'r') as content_file:
            content = content_file.read()
        chunks = conllu.parse(text=content)
        assert len(chunks) == 1
        token_list, bio_list, tag_list = [], [], []
        for td in chunks[0]:
            bio_tags = td['xpostag'].split('-')
            tag = DataClass.ConllTag.NONE
            biostr = bio_tags[0]
            if biostr != 'O':
                if 'Fact' in bio_tags[1]:
                    tag = DataClass.ConllTag.Fact
                elif 'Testimony' in bio_tags[1]:
                    tag = DataClass.ConllTag.Testimony
                elif 'Value' in bio_tags[1]:
                    tag = DataClass.ConllTag.Value
                elif 'Rhetorical' in bio_tags[1]:
                    tag = DataClass.ConllTag.Rhetorical
                elif 'Policy' in bio_tags[1]:
                    tag = DataClass.ConllTag.Policy
                else:
                    assert False, 'Invalid proposition type: {}'.format(bio_tags[1])
            assert biostr in ['B', 'I', 'O']
            token = td['form']
            token_list.append(token)
            bio_list.append(biostr)
            tag_list.append(tag)
        assert len(token_list) == len(bio_list) == len(tag_list)
        conll_doc = DataClass.ConllDocument(sentence_id=fname,
                                            token_list=token_list,
                                            bio_list=bio_list,
                                            tag_list=tag_list)
        result_docs.append(conll_doc)
        if len(result_docs) % 100 == 0:
            util.print_info('loaded chunks: {}'.format(len(result_docs)))
    return result_docs
def parse_tacred_json_to_ud(tacred_json_file_path: str, output_path: str, lang: str):
    nlp = SampleBARTAnnotator.get_ud_parser(lang)
    conllu_parse_list = []
    with open(tacred_json_file_path, 'r', encoding='utf-8') as input_file:
        tacred_json = json.load(input_file)
    for example_json in tqdm(tacred_json):
        doc = nlp([example_json["token"]])
        conllu_token_list = conllu.parse(doc._.conll_str)
        assert len(conllu_token_list) == 1
        conllu_token_list = conllu_token_list[0]
        relation = example_json['relation']
        tokens = [node["form"] for node in conllu_token_list]
        conllu_token_list.metadata["id"] = example_json['id']
        conllu_token_list.metadata["docid"] = example_json['docid']
        conllu_token_list.metadata["relation"] = relation
        conllu_token_list.metadata["token"] = json.dumps(tokens)
        # Add 1 because TACRED indices start at 0, while CoNLL-U token ids start at 1
        conllu_token_list.metadata["subj_start"] = json.dumps(example_json['subj_start'] + 1)
        conllu_token_list.metadata["subj_end"] = json.dumps(example_json['subj_end'] + 1)
        conllu_token_list.metadata["obj_start"] = json.dumps(example_json['obj_start'] + 1)
        conllu_token_list.metadata["obj_end"] = json.dumps(example_json['obj_end'] + 1)
        conllu_token_list.metadata["subj_type"] = example_json['subj_type']
        conllu_token_list.metadata["obj_type"] = example_json['obj_type']
        trigger_tokens = search_triggers(example_json['subj_start'], example_json['subj_end'],
                                         example_json['obj_start'], example_json['obj_end'],
                                         relation, tokens, lang)
        # Sorted for ease of reading
        trigger_tokens_sorted = sorted(trigger_tokens, key=lambda x: x[0])
        conllu_token_list.metadata["trigger_tokens"] = json.dumps(trigger_tokens_sorted)
        conllu_parse_list.append(conllu_token_list)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        for conllu_token_list in conllu_parse_list:
            output_file.write(conllu_token_list.serialize())
def generate_hypothesis_reference(input_dir=CONLLU_TEST_DIR):
    """
    Build a reference and a hypothesis list from a .conllu file.

    Args:
        input_dir: path to the input .conllu file.

    Returns:
        `reference` contains the gold-segmented sentences (tokens joined by
        spaces) used for WER computation.
        `hypothesis` contains the raw, unsegmented Japanese text that will be
        tokenized by the maxmatch algorithm and checked against the `reference`.
    """
    with open(input_dir, "r") as f:
        source = conllu.parse(f.read())
    # Hypothesis: tokens concatenated into raw text; reference: tokens joined by spaces.
    hypothesis = ["".join(token["form"] for token in tokenlist) for tokenlist in source]
    reference = [" ".join(token["form"] for token in tokenlist) for tokenlist in source]
    return reference, hypothesis
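# A minimal usage sketch for generate_hypothesis_reference (not from the original
# project): the two-token Japanese fragment below is invented and written to a
# temporary file so the function can read it.
import tempfile

sample = (
    "1\t猫\t猫\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\t眠る\t眠る\tVERB\t_\t_\t0\troot\t_\t_\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".conllu", delete=False) as tmp:
    tmp.write(sample)
reference, hypothesis = generate_hypothesis_reference(tmp.name)
print(reference)   # ['猫 眠る']  -- tokens joined by spaces
print(hypothesis)  # ['猫眠る']   -- raw text for the maxmatch tokenizer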
def get_terms(conllu_data):
    """Get candidate terms from parsed text.

    Args:
        conllu_data (str): Parses in CoNLL-U format

    Returns:
        Dict of { term: weight } where term is the normalized term text and
        weight an estimate of the term's relevance to the data.
    """
    sentences = conllu.parse(conllu_data)
    # Naive example implementation: take lemma counts for nouns and return
    # counts normalized to [0, 1] by dividing by the maximum count.
    count = Counter()
    for sentence in sentences:
        for token in sentence:
            if token['upostag'] in ('NOUN', 'PROPN'):
                count[token['lemma']] += 1
    max_count = max(count.values())
    return {k: v / max_count for k, v in count.items()}
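# A minimal usage sketch for get_terms (not part of the original snippet): the
# two-sentence CoNLL-U fragment below is invented for illustration and uses
# tab-separated columns, as conllu.parse expects.
sample = (
    "1\tDogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_\n"
    "\n"
    "1\tdogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tsleep\tsleep\tVERB\t_\t_\t0\troot\t_\t_\n"
)
print(get_terms(sample))  # {'dog': 1.0} -- only nouns are counted, normalized by the max count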
def testComplex2(self):
    content = [
        u'# sent_id = 2011Interviyu_Mariny_Astvatsaturyan.xml_11',
        u'# text = Тогда, как и сейчас, в качестве внештатного сотрудника.',
        u'0.1 _ _ _ _ _ _ _ 0:exroot _',
        u'1 Тогда тогда ADV _ Degree=Pos 10 orphan 0.1:advmod SpaceAfter=No',
        u'2 , , PUNCT _ _ 5 punct 5:punct _',
        u'3 как как SCONJ _ _ 5 mark 5:mark _',
        u'4 и и PART _ _ 5 advmod 5:advmod _',
        u'5 сейчас сейчас ADV _ Degree=Pos 1 advcl 1:advcl SpaceAfter=No',
        u'6 , , PUNCT _ _ 5 punct 5:punct _',
        u'7 в в ADP _ _ 10 case 10:case _',
        u'8 качестве качество NOUN _ Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing 7 fixed 7:fixed _',
        u'9 внештатного внештатный ADJ _ Case=Gen|Degree=Pos|Gender=Masc|Number=Sing 10 amod 10:amod _',
        u'10 сотрудника сотрудник NOUN _ Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing 0 root 0:root SpaceAfter=No',
        u'11 . . PUNCT _ _ 10 punct 10:punct _'
    ]
    sentences = parse('\n'.join(content))
    tokens = extract_text(sentences[0])
    expected = sentences[0].metadata['text']
    self.assertEqual(expected, ''.join(tokens))
def load_ud_corpus(ud_source_dir: str):
    """
    Extracts the underlying UD corpus data that is stored in CoNLL-U format.

    Returns a dictionary keyed by (split, sentence index) whose values are the
    whitespace-joined sentence texts.
    """
    data_path = os.path.join(ud_source_dir, "UD_English-EWT-r1.2")
    sent_id_to_text = {}
    for split in ["train", "dev", "test"]:
        split_path = os.path.join(data_path, f"en-ud-{split}.conllu")
        log.info("Loading UD data from %s", split_path)
        with open(split_path) as fd:
            data = fd.read()
        data = conllu.parse(data)
        for sent_count, sent in enumerate(data):
            sent_id_to_text[(split, sent_count)] = " ".join(
                item["form"] for item in sent)
    return sent_id_to_text
def score(self, sentence):
    sentence_relations = []
    if self.udpipe == False:
        # spaCy path: collect dependency relations straight from the parsed doc.
        doc = self.nlp(sentence)
        for token in doc:
            sentence_relations.append(str(token.dep_))
        BOR = self.count_vec.transform([' '.join(sentence_relations)]).toarray().tolist()
    else:
        # UDPipe path: tokenize, tag and parse, then read relations from the CoNLL-U output.
        sentences = self.udpipe_model.tokenize(sentence)
        for s in sentences:
            self.udpipe_model.tag(s)
            self.udpipe_model.parse(s)
        conllu_txt = self.udpipe_model.write(sentences, "conllu")  # conllu|horizontal|vertical
        conllu_obj = conllu.parse(conllu_txt)
        for item in conllu_obj[0]:
            sentence_relations.append(str(item['deprel']))
        BOR = self.count_vec.transform([' '.join(sentence_relations)]).toarray().tolist()
    return BOR[0]
def get_tree(files, out):
    for file in files:
        with open(args.fp + file, "r", encoding="utf-8") as infile, \
                open(out, "a", encoding="utf-8") as outfile:
            data = infile.read()
            items = [item for item in data.split("\n\n")]
            # Cap the number of sentences taken from each file at 5000.
            threshold = min(len(items), 5000)
            for item in items[:threshold]:
                try:
                    parsed = parse(item)
                    item_postags = []
                    if parsed:
                        # Use `token` here so the outer `item` is not shadowed.
                        for token in parsed[0]:
                            item_postags.append(token["upostag"])
                        outfile.write(" ".join(item_postags) + ", " +
                                      file.split("-")[0] + "\n")
                except conllu.parser.ParseException:
                    print(file.split("-")[0])
                    print(item)
def testNormal(self):
    content = [
        u'# newdoc id = doc1',
        u'# newpar id = par1',
        u'# sent_id = 1',
        u'# text = Результаты \xa0(\xa0 нет',
        u'1 Результаты _ _ _ _ _ _ _ SpacesAfter=\\s\\xa0',
        u'2 ( _ _ _ _ _ _ _ SpacesAfter=\\xa0\\s',
        u'3 нет _ _ _ _ _ _ _ SpaceAfter=No',
        u'',
        u'',
    ]
    sentences = parse('\n'.join(content))
    sentences = split_sent(sentences[0])
    result = [extract_text(s, validate=False) for s in sentences]
    expected = [
        [u'Результаты', u' \xa0', u'(', u'\xa0 ', u'нет'],
    ]
    self.assertEqual(len(expected), len(result))
    for e, r in zip(expected, result):
        self.assertListEqual(e, r)
def return_Sidorov(self, text, key):
    sentences = self.tokenize(text)
    # Then, we perform tagging and parsing for each sentence
    for s in sentences:
        self.tag(s)    # in-place tagging
        self.parse(s)  # in-place parsing
    conllu_txt = self.write(sentences, "conllu")  # conllu|horizontal|vertical
    output = "FLAGSidorov" + key + " "
    conllu_obj = conllu.parse(conllu_txt)
    for i in range(0, len(sentences)):
        for item in conllu_obj[i]:
            if item["head"] is not None:
                # Append the head token's value for `key` followed by the dependent's.
                for item_loop in conllu_obj[i]:
                    if item_loop["id"] == item["head"]:
                        output += " " + item_loop[key] + item[key]
    return output.replace(".", "PUNCT").replace(":", "")
def score_verbs(self, sentence):
    sentences = self.udpipe_model.tokenize(sentence)
    for s in sentences:
        self.udpipe_model.tag(s)
        self.udpipe_model.parse(s)
    conllu_txt = self.udpipe_model.write(sentences, "conllu")  # conllu|horizontal|vertical
    conllu_obj = conllu.parse(conllu_txt)
    words_id = []
    relations = []
    # Collect the ids of all verbs, then keep the relations of tokens that are
    # verbs themselves or are attached to a verb.
    for word in conllu_obj[0]:
        if str(word['upostag']).lower() == 'verb':
            words_id.append(word['id'])
    if len(words_id):
        for word in conllu_obj[0]:
            if word['head'] in words_id or word['id'] in words_id:
                relations.append(word['deprel'])
    BOR = self.count_vec.transform([' '.join(relations)]).toarray().tolist()
    return BOR[0]
def generate_wordlist(input_dir, output_dir):
    """
    Generate a word list in JSON format by taking in a .conllu file and
    extracting the unique surface forms from it.

    Args:
        input_dir: path to the input .conllu file.
        output_dir: path to the output wordlist.json.

    Returns:
        None. The list of unique tokens is written to `output_dir` as JSON.
    """
    with open(input_dir, "r") as f:
        source = conllu.parse(f.read())
    wordlist = []
    for item in source:
        wordlist.extend([a["form"] for a in item])
    wordlist = list(set(wordlist))
    with open(output_dir, "w") as f:
        json.dump(wordlist, f)
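# Hypothetical invocation of generate_wordlist; the file names are placeholders.
generate_wordlist("corpus.conllu", "wordlist.json")
with open("wordlist.json") as f:
    print(json.load(f))  # a deduplicated list of surface forms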
def rule_main(args, data):
    file_name = 'rule_based/example.conllu'
    if args.dataset == 'CoQA':
        score = 0
        data = data[:5]
        length = len(data)
        with codecs.open(file_name, 'r', encoding='utf-8') as f:
            conllu_file = parse(f.read())
        # Creating dict: sentences alternate between questions (even index)
        # and their answers (odd index, stored under '<id>_answer').
        ids = range(int(len(conllu_file) / 2))
        examples = {}
        count = 0
        for i, s in enumerate(conllu_file):
            if i % 2 == 0:
                examples[ids[count]] = s
            else:
                examples[str(ids[count]) + '_answer'] = s
                count += 1
        current_pos = 0
        for data_ in data:
            summary = data_['summary']
            generate_summary = ''
            for i in range(len(data_['questions'])):
                generate_summary += qa2d(current_pos, examples) + ' '
                current_pos += 1
            score += get_score(generate_summary, summary, args.score)
    elif args.dataset == 'QuAC':
        score = 0
        length = len(data)
    else:
        score = 0
        length = len(data)
    score = score / length
    print('summary score: ', score)
def test_parse_CoNLL2009(self):
    field_parsers = DEFAULT_FIELD_PARSERS.copy()
    field_parsers.update({
        "pfeats": lambda line, i: parse_dict_value(line[i]),
        "phead": lambda line, i: parse_int_value(line[i]),
        "apreds": lambda line, i: TestParseCoNLL2009.parse_apreds(line[i:len(line)]),
    })
    from tests.fixtures import TESTCASES_CONLL2009
    sentences = parse(
        TESTCASES_CONLL2009[0],
        fields=('id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats', 'pfeats',
                'head', 'phead', 'deprel', 'pdeprel', 'fillpred', 'pred', 'apreds'),
        field_parsers=field_parsers,
    )
    self.assertEqual(
        sentences[0][2],
        OrderedDict([
            ('id', 3),
            ('form', 'knihy'),
            ('lemma', 'kniha'),
            ('plemma', 'kniha'),
            ('pos', 'N'),
            ('ppos', 'N'),
            ('feats', OrderedDict([
                ('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'), ('Cas', '2'), ('Neg', 'A')
            ])),
            ('pfeats', OrderedDict([
                ('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'), ('Cas', '2'), ('Neg', 'A')
            ])),
            ('head', 1),
            ('phead', 1),
            ('deprel', 'Adv'),
            ('pdeprel', 'Adv'),
            ('fillpred', 'Y'),
            ('pred', 'kniha'),
            ('apreds', [
                None, None, None, None, None, None, None, 'DIR1',
                None, None, None, None, None, None, None, None
            ]),
        ])
    )
def _generate_examples(self, data_dir, subarchive_path, files):
    counter = 0
    for path, f in files:
        if path == subarchive_path:
            stream = tarfile.open(fileobj=f, mode="r|*")
            for tarinfo in stream:
                file_path = tarinfo.name
                if file_path.startswith(data_dir) and file_path.endswith(".conllu"):
                    data = stream.extractfile(tarinfo).read().decode("utf-8")
                    for sent in conllu.parse(data):
                        res = {
                            "idx": sent.metadata["sent_id"],
                            "text": sent.metadata["text"],
                            "tokens": [str(token["form"]) for token in sent],
                            "lemmas": [str(token["lemma"]) for token in sent],
                            "pos_tags": [str(token["upostag"]) for token in sent],
                            "xpos_tags": [str(token["xpostag"]) for token in sent],
                            "feats": [str(token["feats"]) for token in sent],
                            "head": [str(token["head"]) for token in sent],
                            "deprel": [str(token["deprel"]) for token in sent],
                            "deps": [str(token["deps"]) for token in sent],
                            "misc": [str(token["misc"]) for token in sent],
                        }
                        yield counter, res
                        counter += 1
                stream.members = []
            del stream
            break
def return_deprelnegations(self, text):
    sentences = self.tokenize(text)
    # Then, we perform tagging and parsing for each sentence
    for s in sentences:
        self.tag(s)    # in-place tagging
        self.parse(s)  # in-place parsing
    conllu_txt = self.write(sentences, "conllu")  # conllu|horizontal|vertical
    output = "FLAGdeprelnegations "
    conllu_obj = conllu.parse(conllu_txt)
    for i in range(0, len(sentences)):
        for item in conllu_obj[i]:
            if item['feats'] is not None:
                # Collect the dependency relation of every negated token.
                for key, value in item['feats'].items():
                    if value == "Neg":
                        output += " " + item['deprel']
    return output
def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        sentences = parse(open(data_path, 'r').read())
        for sentence in sentences:
            for word in sentence:
                form = word['form']
                pos = word['upostag']
                deprel = word['deprel']
                real_word = form.split('_BERT_')[0]
                for char in real_word:
                    char_alphabet.add(char)
                form = DIGIT_RE.sub("0", form) if normalize_digits else form
                pos_alphabet.add(pos)
                type_alphabet.add(deprel)
                if form not in vocab_set and (form in embedd_dict or form.lower() in embedd_dict):
                    vocab_set.add(form)
                    vocab_list.append(form)
def case_spacy_german():
    data = """
# sent_id = testtext.1
1 Dies Dies PRON PDS _ 2 sb _ _
2 ist sein AUX VAFIN _ 0 ROOT _ _
3 ein einen DET ART _ 4 nk _ _
4 Test Test NOUN NN _ 2 pd _ _
5 . . PUNCT $. _ 2 punct _ _
"""
    result = conllu.parse(data)
    return (
        wikiannotator.Annotator.createAnnotator('spacy', {'model_name': 'de_core_news_sm'}),
        wikiannotator.SpacyAnnotator,
        {
            'text': 'Dies ist ein Test.',
            'textname': 'testtext',
            'parse': result
        }
    )
def split(path, out_path=None, n_parts=5, rng=None):
    """Split a CoNLL-U file into parts."""
    if out_path is None:
        out_path = Path("output")
    if rng is None:
        rng = random.Random()
    out_path.mkdir()
    print(f"Reading {path}", file=sys.stderr)
    with open(path, encoding="utf-8") as f:
        sents = parse(f.read())
    rng.shuffle(sents)
    count = [0] * n_parts
    for i, sent in enumerate(tqdm(sents)):
        count[i % n_parts] += 1
        with open(out_path / f"{i % n_parts:02}.conllu", "a", encoding="utf-8") as f:
            print(sent.serialize(), file=f, end="")
    for i, cnt in enumerate(count):
        print(f"Part {i:02}: {cnt}", file=sys.stderr)
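# Hypothetical invocation of split(); the paths are placeholders, and the RNG is
# seeded only so the shuffled assignment of sentences to parts is reproducible.
split(Path("corpus.conllu"), out_path=Path("parts"), n_parts=5, rng=random.Random(42))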
def transduce(corpus, trans, control='normal', pos='upos'):
    """
    args:
        corpus: TextIOWrapper, the corpus to process
        trans: tbtk.TransitionSystemBase object
        control: str, format of the transitions, 'backbone' or 'normal'
        pos: str, which POS column to use, 'upos' or 'xpos'
    """
    sentences = [tbtk.ConllSent.from_conllu(sent) for sent in conllu.parse(corpus.read())]
    for s in sentences:
        transitions = []
        state = tbtk.State.init_from_sent(s)
        # Run the oracle until the final state, recording each gold action.
        while not state.is_final():
            g = trans.gold_action(state)
            transitions.append(trans.action_to_str(g))
            state = trans.step(state, g)
        s.transitions = transitions
        print("form:", ' '.join(s.form))
        print("pos:", ' '.join(s.upos if pos == 'upos' else s.xpos))
        print("head:", ' '.join(str(x) for x in s.head))
        print("deprel:", ' '.join(s.deprel))
        print("transitions:", ' '.join(s.transitions))
        print()
def testComplex(self):
    content = [
        u'# newdoc id = doc1',
        u'# newpar id = par1',
        u'# sent_id = 1',
        u'# text = Результаты.Выводы',
        u'1 Результаты _ _ _ _ _ _ _ SpaceAfter=No',
        u'2 . _ _ _ _ _ _ _ SentenceBreak=Yes|SpaceAfter=No',
        u'3 Выводы _ _ _ _ _ _ _ SpaceAfter=No',
        u'',
        u'',
    ]
    sentences = parse('\n'.join(content))
    sentences = split_sent(sentences[0])
    result = [extract_text(s, validate=False) for s in sentences]
    expected = [
        [u'Результаты', u'.'],
        [u'Выводы'],
    ]
    self.assertEqual(len(expected), len(result))
    for e, r in zip(expected, result):
        self.assertListEqual(e, r)
def load_conllu(conllu_file):
    conllu_data = []
    with open(conllu_file, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
    sentences = parse(content)
    for idx, sentence in enumerate(sentences):
        tokens, upos, head, deprel, offset = [], [], [], [], []
        reserved_offsets = []
        for widx, word in enumerate(sentence):
            if isinstance(word['id'], tuple):
                # multi-word token, e.g., word['id'] = (4, '-', 5)
                assert len(word['id']) == 3
                indices = word['misc']['TokenRange'].split(':')
                reserved_offsets.append([int(indices[0]), int(indices[1])])
            else:
                tokens.append(word['form'])
                upos.append(word['upostag'])
                head.append(word['head'])
                deprel.append(word['deprel'])
                if word['misc'] is not None:
                    # single-word token
                    indices = word['misc']['TokenRange'].split(':')
                    offset.append([int(indices[0]), int(indices[1])])
                elif len(reserved_offsets) > 0:
                    offset.append(reserved_offsets.pop())
                else:
                    offset.append([-1, -1])
        assert len(tokens) == len(offset)
        sent_obj = OrderedDict([('id', sentence.metadata['sent_id']),
                                ('text', sentence.metadata['text']),
                                ('word', tokens),
                                ('upos', upos),
                                ('head', head),
                                ('deprel', deprel),
                                ('offset', offset)])
        conllu_data.append(sent_obj)
    return conllu_data
def joinParse(data_file):
    data = data_file.read()
    sentences = parse(data)
    l = len(sentences)
    entireList = [[] for i in range(l)]
    indexes = []
    for i in range(l):
        sent = [(word['form'], word['upostag'], sentences[i].metadata)
                for word in sentences[i]]
        sent2 = [(word['form'], '<pad>', None) for word in sentences[i]]
        # Window of neighbouring sentences around sentence i.
        m = max(0, i - args.sngram // 2)
        M = min(l, i + args.sngram // 2 + args.sngram % 2)
        for j in range(m, M):
            if len(entireList[j]) == 0:
                entireList[j].append(('[CLS]', '<pad>', None))
            if i == j:
                indexes.append(
                    [len(entireList[j]), len(entireList[j]) + len(sent)])
                entireList[j] += sent + [('[SEP]', '<pad>', None)]
            else:
                entireList[j] += sent2 + [('[SEP]', '<pad>', None)]
    return entireList, indexes
def create_from_ud(cls, data_file_list):
    """Initialize a corpus from a list of paths to files in CoNLL-U format"""
    corpus = POSCorpus()
    corpus.sentences = []
    for data_file_path in data_file_list:
        with open(data_file_path, "r", encoding="utf-8") as data_file:
            data = data_file.read()
        data = conllu.parse(data)
        for token_list in data:
            sentence = []
            for token in token_list:
                pos = token['upostag']
                lemma = token['lemma']
                word = token['form']
                # Sometimes the corpus doesn't have words, only underscores
                if word == '_' or lemma == '_':
                    continue
                sentence.append({'word': word, 'lemma': lemma, 'pos': pos})
            if len(sentence) > 0:
                corpus.sentences.append(sentence)
    return corpus
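# Hypothetical usage of create_from_ud (assuming it is bound as a classmethod on
# POSCorpus); the UD file name is a placeholder.
corpus = POSCorpus.create_from_ud(["en_ewt-ud-train.conllu"])
print(len(corpus.sentences))
print(corpus.sentences[0][:3])  # first three {'word', 'lemma', 'pos'} dicts of the first sentence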
def count_frequency(self, files):
    """
    Count frequencies for all lemmas in the corpus.

    :param files: list with paths to all conllu files in the corpus
    :return: list with all types in the corpus, sorted descending by frequency
    """
    list_of_words = []
    for file in files:
        with open(file, "r") as fin:
            data = fin.read()
        for sentence in parse(data):
            for word in sentence:
                list_of_words.append(word["lemma"])
    counted = Counter(list_of_words)
    frequency_list = sorted(
        counted,
        key=counted.__getitem__,
        reverse=True,
    )
    return frequency_list
def extract_sentences_from_conllu_to_csv(src_dir, name, dst_dir):
    """Extract sentences from Universal Dependency structures in a .conllu file
    and store them in a .csv file.

    Parameters
    ----------
    src_dir : str
        directory of the .conllu file
    name : str
        name of the .conllu file, excluding the '.conllu' extension
    dst_dir : str
        directory where the .csv file should be saved
    """
    filename = src_dir + name + '.conllu'
    print('Processing ' + filename + ' ... ', end='', flush=True)
    with open(filename, "r", encoding="utf-8") as fp:
        raw_data = fp.read()
    ud_dataset = parse(raw_data)
    sentences = []
    for tokenlist in ud_dataset:
        sentence = []
        for token in tokenlist:
            word = token['form']
            sentence.append(word)
        sentences.append(sentence)
    filename = dst_dir + name + '.csv'
    with open(filename, mode="w", encoding="utf-8", newline='') as fp:
        csv_writer = csv.writer(fp, delimiter=' ')
        csv_writer.writerows(sentences)
    print('DONE ', end='')
    print((len(ud_dataset), len(sentences)))
def load_conllu(conllu_file):
    conllu_data = dict()
    with open(conllu_file, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
    sentences = parse(content)
    for idx, sentence in enumerate(sentences):
        tokens, upos, head, deprel = [], [], [], []
        for widx, word in enumerate(sentence):
            if isinstance(word['id'], tuple):
                # multi-word token, e.g., word['id'] = (4, '-', 5)
                assert len(word['id']) == 3
            else:
                tokens.append(word['form'])
                upos.append(word['upostag'])
                head.append(word['head'])
                deprel.append(word['deprel'])
        sent_obj = OrderedDict([('id', sentence.metadata['sent_id']),
                                ('text', sentence.metadata['text']),
                                ('word', tokens),
                                ('upos', upos),
                                ('head', head),
                                ('deprel', deprel)])
        conllu_data[sent_obj['id']] = sent_obj
    return conllu_data
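# Hypothetical usage sketch for the load_conllu variant above; "sample.conllu" is a
# placeholder and must contain '# sent_id' and '# text' metadata lines, since the
# function reads sentence.metadata['sent_id'] and sentence.metadata['text'].
parsed = load_conllu("sample.conllu")
for sent_id, sent in parsed.items():
    print(sent_id, sent['word'], sent['upos'], sent['head'], sent['deprel'])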
def load_as_conllu(self, predefined_splits: bool = False):
    """
    :param bool predefined_splits: Boolean
    :return: a single parsed conllu list, or a list of train, dev and test
        parsed conllu lists, depending on predefined_splits
    """
    with open('{}/CDT_coref.conllu'.format(self.dataset_dir)) as f:
        conlist = conllu.parse(f.read(), fields=[
            "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel",
            "deps", "misc", "coref_id", "coref_rel", "doc_id", "qid"])

    if predefined_splits == False:
        return conlist

    parts = [None, None, None]
    sent_parts = [[], [], []]
    for i, part in enumerate(['train', 'dev', 'test']):
        with open('{}/CDT_{}_ids.json'.format(self.dataset_dir, part)) as f:
            parts[i] = json.load(f)
        for sentence in conlist:
            if sentence[0]["doc_id"] in parts[i]:
                sent_parts[i].append(sentence)
    return sent_parts
def text_to_json(text, model, sep="|", parse_system="udpipe"):
    """
    Parameters
    ----------
    text: str
    model: Model
    sep: str
    parse_system: str

    Returns
    -------
    l_sentences: list
    """
    l_sentences = []
    if parse_system == "spacy":
        import spacy
        # Needs the English model installed: python -m spacy download en
        nlp = spacy.load('en')
        tokens = nlp(text)
        tmp_text = str(text)
        posStart_prev = 0
        for sent in tokens.sents:
            l_sent = []
            for word in sent:
                posStart = str.find(tmp_text, word.text, posStart_prev)
                posStart_prev = posStart
                d_word = {
                    "id": word.orth,
                    "forma": word.text,
                    "lemma": word.lemma_,
                    "norm": word.norm_,
                    "pos": word.pos_,
                    "tag": word.tag_,
                    "grm": "",
                    "len": len(word),
                    "posStart": posStart,
                    "dom": word.head.text,
                    "link": word.dep_
                }
                l_sent.append(d_word)
            l_sentences.append(l_sent)
    elif parse_system == "udpipe":
        segmented = ufal.udpipe.Pipeline(model.model, "tokenize",
                                         ufal.udpipe.Pipeline.NONE,
                                         ufal.udpipe.Pipeline.NONE,
                                         "").process(text)
        sentences = model.read(segmented, "conllu")
        for sent in sentences:
            model.tag(sent)
            model.parse(sent)
        res_conllu = model.write(sentences, "conllu")
        tmp_text = str(text)
        posStart_prev = 0
        for sent in conllu.parse(res_conllu):
            l_sent = []
            for word in sent:
                posStart = str.find(tmp_text, word["form"], posStart_prev)
                posStart_prev = posStart
                d_word = {
                    "id": word["id"],
                    "forma": word["form"],
                    "lemma": word["lemma"],
                    "pos": word["upostag"],
                    "grm": get_feats_string(word["feats"], sep=sep),
                    "len": len(word["form"]),
                    "posStart": posStart,
                    "dom": word["head"],
                    "link": word["deprel"]
                }
                l_sent.append(d_word)
            l_sentences.append(l_sent)
    else:
        print("Error. Unsupported parsing system. Use 'udpipe' or 'spacy'.")
    return l_sentences