def test_find_head_long_phrases(self):
    token_ids = list(range(1, 6))
    doc = OrderedDict({
        'dependencies': [{
            'style': 'universal',
            'arcs': {
                1: [{'governor': 2}],
                2: [{'governor': 3}],
                3: [{'governor': 4}],
                4: [{'governor': 0}],
                5: [{'governor': 4}],
            }
        }]
    })
    actual = pyjsonnlp.find_head(doc, token_ids, 'universal')
    assert 4 == actual, actual
def test_find_head(self):
    token_ids = [1]
    doc = OrderedDict({
        'dependencies': [{
            'style': 'universal',
            'arcs': {
                1: [{'governor': 2}],
                2: [{'governor': 3}],
                3: [{'governor': 4}],
                4: [{'governor': 0}],
                5: [{'governor': 4}],
            }
        }]
    })
    actual = pyjsonnlp.find_head(doc, token_ids, 'universal')
    assert 1 == actual, actual
def test_find_head_no_deps(self):
    no_deps = OrderedDict(j['documents'][0])
    no_deps['dependencies'] = []
    with pytest.raises(ValueError):
        pyjsonnlp.find_head(no_deps, [], 'universal')
def test_find_head_no_enhanced(self):
    with pytest.raises(ValueError):
        pyjsonnlp.find_head(OrderedDict(), [], 'Enhanced++')
def test_find_head_style_not_found(self):
    no_deps = OrderedDict(j['documents'][0])
    no_deps['dependencies'] = []
    with pytest.raises(ValueError):
        pyjsonnlp.find_head(no_deps, [], 'no such style')
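# A minimal sketch of the head-finding behavior the tests above exercise,
# assuming the same doc shape they build ('dependencies' -> 'arcs' ->
# 'governor'). This is an illustration, not pyjsonnlp's actual implementation:
# the head of a phrase is the token whose governor lies outside the phrase,
# or whose governor is the root (0).
def example_find_head(doc, token_ids, style='universal'):
    arcs = next((deps['arcs'] for deps in doc.get('dependencies', [])
                 if deps.get('style') == style), None)
    if arcs is None:
        raise ValueError(f'No dependency parse of style {style} found!')
    for token_id in token_ids:
        governor = arcs[token_id][0]['governor']
        if governor == 0 or governor not in token_ids:
            return token_id  # e.g. 4 for token_ids 1..5 in the long-phrase test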
def process(text: str = '', spacy_model='en_core_web_sm', coreferences=False,
            constituents=False, dependencies=True, expressions=True) -> OrderedDict:
    """Process the provided text with spaCy and return a JSON-NLP document."""
    nlp = get_model(spacy_model, coreferences, constituents)
    nlp.tokenizer = SyntokTokenizer(nlp.vocab)
    doc = nlp(text)
    j: OrderedDict = get_base()
    d: OrderedDict = get_base_document(1)
    j['documents'].append(d)
    d['meta']['DC.source'] = 'SpaCy {}'.format(spacy.__version__)
    d['text'] = text
    model_lang = spacy_model[0:2]
    lang = Counter()  # track the frequency of each language
    sent_lookup: Dict[int, int] = {}  # map sentence end_char to our sentence index
    token_lookup: Dict[Tuple[int, int], int] = {}  # map (sent_id, spaCy token index) to our token index

    # tokens and sentences
    token_id = 1
    sent_num = 1
    for sent in doc.sents:
        current_sent = {
            'id': sent_num,
            'tokenFrom': token_id,
            'tokenTo': token_id + len(sent),  # begin inclusive, end exclusive
            'tokens': []
        }
        if constituents:
            try:
                d['constituents'].append(build_constituents(sent_num, sent._.parse_string))
            except Exception:
                pass
        sent_lookup[sent.end_char] = sent_num
        d['sentences'][current_sent['id']] = current_sent
        #d['sentences'].append(current_sent)
        last_char_index = 0
        for token in sent:
            t = {
                'id': token_id,
                'sentence_id': sent_num,
                'text': token.text,
                'lemma': token.lemma_,
                'xpos': token.tag_,
                'upos': token.pos_,
                'entity_iob': token.ent_iob_,
                'characterOffsetBegin': token.idx,
                'characterOffsetEnd': token.idx + len(token),
                'lang': token.lang_,
                'features': {
                    'Overt': True,
                    'Stop': bool(token.is_stop),
                    'Alpha': bool(token.is_alpha),
                },
                'misc': {
                    'SpaceAfter': False
                }
            }
            # shape
            if WORD_REGEX.findall(token.text):
                t['shape'] = token.shape_
            # space after? We only know there was a space after the previous token
            # once we see where this one starts relative to where the last one ended.
            if token.idx != 0 and token.idx != last_char_index:
                d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True
            last_char_index = t['characterOffsetEnd']
            # morphology: skip the first entry, a numeric key/value pair
            for i, kv in enumerate(nlp.vocab.morphology.tag_map.get(token.tag_, {}).items()):
                if i > 0:
                    t['features'][kv[0]] = str(kv[1]).title()
            # entities
            if token.ent_type_:
                t['entity'] = token.ent_type_
            # flag tokens in a language other than the model's (skip multilingual models)
            if model_lang != 'xx':
                t['features']['Foreign'] = model_lang != token.lang_
            # bookkeeping
            lang[token.lang_] += 1
            token_lookup[(sent_num, token.i)] = token_id
            current_sent['tokens'].append(token_id)
            d['tokenList'].append(t)
            token_id += 1
        d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True  # EOS tokens have spaces after them
        sent_num += 1
    d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = False  # EOD tokens do not

    # noun phrases
    if expressions:
        chunk_id = 1
        for chunk in doc.noun_chunks:
            if len(chunk) > 1:
                sent_id = sent_lookup[chunk.sent.end_char]
                d['expressions'].append({
                    'id': chunk_id,
                    'type': 'NP',
                    'head': token_lookup[(sent_id, chunk.root.i)],
                    'dependency': chunk.root.dep_.lower(),
                    'tokens': [token_lookup[(sent_id, token.i)] for token in chunk]
                })
                chunk_id += 1

    # dependencies
    if dependencies:
        d['dependencies'] = []
        for sent_num, sent in enumerate(doc.sents):
            deps = {'style': 'universal', 'trees': []}
            for token in sent:
                dependent = token_lookup[(sent_num + 1, token.i)]
                deps['trees'].append({
                    #'sentenceId': sent_num+1,
                    'lab': token.dep_ if token.dep_ != 'ROOT' else 'root',
                    'gov': token_lookup[(sent_num + 1, token.head.i)] if token.dep_ != 'ROOT' else 0,
                    'dep': dependent
                })
            d['dependencies'].append(deps)

    # coref
    # noinspection PyProtectedMember
    if coreferences and doc._.coref_clusters is not None:
        # noinspection PyProtectedMember
        for cluster in doc._.coref_clusters:
            r = build_coreference(cluster.i)
            r['representative']['tokens'] = [t.i + 1 for t in cluster.main]
            # token ids are 1-based but tokenList is 0-based, hence the -1
            head_sent = d['tokenList'][max(r['representative']['tokens']) - 1]['sentence_id']
            r['representative']['head'] = find_head(d, r['representative']['tokens'],
                                                    head_sent, 'universal')
            for m in cluster.mentions:
                if m[0].i + 1 in r['representative']['tokens']:
                    continue  # don't include the representative in the mention list
                ref = {'tokens': [t.i + 1 for t in m]}
                # look up the mention's own sentence id rather than reusing sent_num,
                # which at this point still holds the last sentence index
                ref['head'] = find_head(d, ref['tokens'],
                                        d['tokenList'][max(ref['tokens']) - 1]['sentence_id'],
                                        'universal')
                r['referents'].append(ref)
            d['coreferences'].append(r)

    d['meta']['DC.language'] = lang.most_common(1)[0][0]  # the most frequent token language

    return remove_empty_fields(j)
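# Hypothetical usage sketch for process() above. It assumes en_core_web_sm is
# installed (python -m spacy download en_core_web_sm), and the sample sentence
# is made up; demo_process is an illustrative helper, not part of the pipeline API.
def demo_process():
    import json
    result = process('The quick brown fox jumps over the lazy dog.',
                     spacy_model='en_core_web_sm',
                     dependencies=True, expressions=True)
    print(json.dumps(result, indent=2))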
def process_conll(conll='', lang='en', coreferences=False, constituents=False,
                  dependencies=False, expressions=False, **kwargs) -> OrderedDict:
    if conll == '':
        raise ValueError('You must pass something in the conll parameter!')
    x = load_xrenner()
    x.load(XrennerPipeline.iso2xrenner(lang))
    x.set_doc_name('not-used')  # xrenner raises an error if no doc name is set
    sgml_result = x.analyze(conll, 'sgml')

    j = parse_conllu(conll)
    #d = list(j['documents'].values())[0]
    d = j['documents'][0]
    d['meta']['DC.source'] = 'Xrenner 2.0'

    if coreferences:
        # wrap tokens with their token id so that xml parsing works
        token_num = 1
        tokenized = []
        for line in sgml_result.split('\n'):
            if line[0:9] != '<referent' and line[0:10] != '</referent':
                line = f'<token id="{token_num}">{line}</token>'
                token_num += 1
            tokenized.append(line)

        representatives = {}
        coref_id = 0
        soup = BeautifulSoup('\n'.join(tokenized), 'html.parser')
        for tag in soup.find_all('referent'):
            # new representative
            if 'antecedent' not in tag.attrs or tag['type'] == 'none':
                r = build_coreference(coref_id)
                coref_id += 1
                r['representative'] = {
                    'entity': tag['entity'],
                    'tokens': [int(t['id']) for t in tag.find_all('token')]
                }
                r['representative']['head'] = find_head(d, r['representative']['tokens'])
                representatives[(tag['id'], tag['group'])] = r
                d['coreferences'].append(r)
                # might be a multi-word expression too!
                if expressions and tag['entity'] != 'event' and len(r['representative']['tokens']) > 1:
                    d['expressions'].append({
                        # deduce the phrase type from the pos tag of the head token
                        'type': 'VP' if 'V' in d['tokenList'][r['representative']['head']]['upos'] else 'NP',
                        'head': r['representative']['head'],
                        'tokens': r['representative']['tokens']
                    })
            # new referent
            else:
                r = representatives[(tag['antecedent'], tag['group'])]
                ids = [int(t['id']) for t in tag.find_all('token')]
                r['referents'].append({
                    'type': tag['type'],
                    'tokens': ids,
                    'head': find_head(d, ids)
                })

    return remove_empty_fields(j)
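# Hypothetical usage sketch for process_conll() above. The CoNLL-U fragment is
# a made-up one-sentence parse, and xrenner needs a model for the given
# language installed for the coreference pass to succeed; demo_process_conll
# is an illustrative helper, not part of the pipeline API.
def demo_process_conll():
    example_conll = (
        '1\tJohn\tJohn\tPROPN\tNNP\t_\t2\tnsubj\t_\t_\n'
        '2\tsleeps\tsleep\tVERB\tVBZ\t_\t0\troot\t_\t_\n'
        '\n'
    )
    return process_conll(conll=example_conll, lang='en',
                         coreferences=True, expressions=True)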