def extract_full_name(nlp_doc):
    """Return the text of the first two-proper-noun span found in *nlp_doc*.

    The 'FULL_NAME' rule (two consecutive tokens whose POS is 'PROPN') is
    registered on the module-level ``matcher`` every time this function is
    called, and the matcher is then run over the document.

    Returns the matched span's text, or ``None`` when nothing matches.
    """
    proper_noun = {'POS': 'PROPN'}
    matcher.add('FULL_NAME', None, [proper_noun, dict(proper_noun)])

    # Only the first hit matters; later matches are ignored.
    first_hit = next(iter(matcher(nlp_doc)), None)
    if first_hit is None:
        return None

    _rule_id, begin, finish = first_hit
    return nlp_doc[begin:finish].text
def emojis(self):
    """
    Emojis found in the document, each paired with its unicode name and
    sentiment score.

    One single-token ORTH rule per known emoji is registered on a fresh
    SpaCy matcher, keyed by the emoji's unicode name; the lookup itself is
    delegated to ``self.match``.

    >>> Doc('Test with emoji 😀 😋 ').emojis
    [('😀', 'GRINNING FACE', 0.571753986332574), ('😋', 'FACE SAVOURING DELICIOUS FOOD', 0.6335149863760218)]
    """
    emoji_matcher = spacy.matcher.Matcher(self._spacy_doc.vocab)
    for symbol, name in emoji2unicode_name.items():
        emoji_matcher.add(name, None, ({'ORTH': symbol}, ))

    scored = []
    for symbol, name in self.match(emoji_matcher):
        scored.append((symbol, name, emoji2sentiment[symbol]))
    return scored
def emojis(self):
    """
    List the emojis present in the document as
    ``(emoji, unicode name, sentiment score)`` tuples.

    A new SpaCy matcher receives one exact-text (ORTH) rule per entry of
    ``EMOJI_TO_UNICODE_NAME``; ``self.match`` runs it and the sentiment is
    looked up in ``EMOJI_TO_SENTIMENT``.

    >>> from pprint import pprint
    >>> from textpipe.doc import Doc
    >>> pprint(Doc('Test with emoji 😀 😋 ').emojis)
    [('😀', 'GRINNING FACE', 0.571753986332574),
     ('😋', 'FACE SAVOURING DELICIOUS FOOD', 0.6335149863760218)]
    """
    vocab = self._spacy_doc.vocab
    matcher = spacy.matcher.Matcher(vocab)

    for glyph, glyph_name in EMOJI_TO_UNICODE_NAME.items():
        single_token_rule = ({'ORTH': glyph}, )
        matcher.add(glyph_name, None, single_token_rule)

    hits = self.match(matcher)
    return [
        (glyph, glyph_name, EMOJI_TO_SENTIMENT[glyph])
        for glyph, glyph_name in hits
    ]
def extract_phone_number(nlp_doc):
    """Return the text of the first phone-number-like span in *nlp_doc*.

    The 'PHONE_NUMBER' rule is registered on the module-level ``matcher``
    on every call. It describes the token sequence
    ``( ddd ) ddd [-] ddd`` where the dash is optional.

    Returns the matched span's text, or ``None`` when nothing matches.
    """
    three_digits = 'ddd'
    token_rules = [
        {'ORTH': '('},
        {'SHAPE': three_digits},
        {'ORTH': ')'},
        {'SHAPE': three_digits},
        {'ORTH': '-', 'OP': '?'},  # separator may be absent
        {'SHAPE': three_digits},
    ]
    matcher.add('PHONE_NUMBER', None, token_rules)

    # Stop at the first hit; any further matches are not inspected.
    for _rule_id, begin, finish in matcher(nlp_doc):
        return nlp_doc[begin:finish].text
    return None
def test_matcher_segfault():
    """Add three progressively longer patterns, matching after each one.

    There are no assertions: the test passes as long as none of the
    add/match calls raises or crashes the interpreter.
    """
    nlp = spacy.load('en', parser=False, entity=False)
    matcher = spacy.matcher.Matcher(nlp.vocab)
    content = u'''a b; c'''

    # (entity key, token spec) pairs, registered in this order.
    cases = (
        ('1', [{ORTH: 'a'}, {ORTH: 'b'}]),
        ('2', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]),
        ('3', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]),
    )
    for key, token_spec in cases:
        matcher.add(entity_key=key, label='TEST', attrs={}, specs=[token_spec])
        # Re-parse and re-match after every registration.
        matcher(nlp(content))
def test_matcher_segfault():
    """Register three patterns one at a time and run the matcher after each.

    Nothing is asserted; completing without an exception or crash is the
    success criterion.
    """
    nlp = spacy.load('en', parser=False, entity=False)
    matcher = spacy.matcher.Matcher(nlp.vocab)
    content = u'''a b; c'''

    def register_then_match(key, token_spec):
        # One registration followed immediately by a fresh parse + match.
        matcher.add(entity_key=key, label='TEST', attrs={},
                    specs=[token_spec])
        matcher(nlp(content))

    register_then_match('1', [{ORTH: 'a'}, {ORTH: 'b'}])
    register_then_match(
        '2', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
    register_then_match(
        '3', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
def main():
    """Extract family relationships from the Mercier text into a CSV file.

    Steps:
      1. Read 'Mercier_1600-1837.txt'.
      2. Put the hyphenated-name merger first in the ``nlp`` pipeline.
      3. Register every relationship rule from ``MATCHERS`` on the
         module-level ``matcher`` with its ``RelationshipHandler`` callback.
      4. Parse the text and run the matcher; the callbacks populate
         ``relationship_set.relationships``.
      5. Write 'relations.csv' (UTF-8) and echo each relationship to stdout.

    Relationship objects of any type other than father/son,
    father/daughter or gendre are skipped.
    """
    # NOTE(review): no encoding is passed for the input, so the platform
    # default applies -- confirm the source file's encoding.
    with codecs.open('Mercier_1600-1837.txt') as source:
        in_file = source.read()

    relationship_set = RelationshipHandler()
    merger = quotemerger.HyphenatedNameMerger(nlp.vocab)
    nlp.add_pipe(merger.merger, first=True)

    # Rule name -> callback; the pattern itself lives in MATCHERS[name].
    rule_callbacks = (
        ('FATHER_SON_1', relationship_set.handle_fs_1),
        ('FATHER_SON_2', relationship_set.handle_fs_2),
        ('FATHER_SON_3', relationship_set.handle_fs_3),
        ('FATHER_DAUGHTER_1', relationship_set.handle_fd_1),
        ('FATHER_DAUGHTER_2', relationship_set.handle_fd_2),
        ('FATHER_DAUGHTER_3', relationship_set.handle_fd_3),
        ('FATHER_DAUGHTER_4', relationship_set.handle_fd_4),
        ('MARIAGE_1', relationship_set.handle_mariage_1_and_2),
        ('MARIAGE_2', relationship_set.handle_mariage_1_and_2),
        ('MARIAGE_3', relationship_set.handle_mariage_3_and_4),
        ('MARIAGE_4', relationship_set.handle_mariage_3_and_4),
        ('MARIAGE_5', relationship_set.handle_mariage_5),
        ('MARIAGE_6', relationship_set.handle_mariage_6),
        ('GENDRE_1', relationship_set.handle_gendre_1),
        ('GENDRE_2', relationship_set.handle_gendre_2),
        ('GENDRE_3', relationship_set.handle_gendre_3),
        ('PERE', relationship_set.handle_pere_1),
    )
    for rule_name, callback in rule_callbacks:
        matcher.add(rule_name, callback, MATCHERS[rule_name])

    parsed_doc = nlp(in_file)
    # The return value is not needed: the callbacks registered above record
    # the relationships as a side effect of matching.
    matcher(parsed_doc)

    # 'with' guarantees the CSV is flushed and closed even if a row fails.
    with codecs.open('relations.csv', 'w', encoding='utf8') as out_file:
        out_file.write('parent,child,relation type\n')
        for rel in relationship_set.relationships:
            if isinstance(rel, FatherSonRelationship):
                out_file.write('{},{},son\n'.format(rel.father, rel.son))
                print('{} is the father of {}'.format(rel.father, rel.son))
            elif isinstance(rel, FatherDaughterRelationship):
                out_file.write(
                    '{},{},daughter\n'.format(rel.father, rel.daughter))
                print('{} is the father of {}'.format(
                    rel.father, rel.daughter))
            elif isinstance(rel, GendreRelationship):
                # Each row needs its own line terminator; without it the
                # two rows run together and corrupt the CSV.
                out_file.write(
                    '{},{},daughter\n'.format(rel.father, rel.name))
                out_file.write(
                    '{},{},spouse\n'.format(rel.husband, rel.name))
                print('An unnamed woman is the daughter of {} and husband of {}'.
                      format(rel.father, rel.husband))

    print('Matchers done')