class TaggerTestCase(unittest.TestCase): def setUp(self): self.tagger = BioEntityTagger() def testTaggerNLP(self): for i, text in enumerate(file('resources/test_abstract_nlp.txt')): print i for tag in self.tagger.tag(text.lower()): print tag, text[tag['start']:tag['end']] def testTaggerLexebi(self): for i, text in enumerate(file('resources/test_abstract_lexebi.txt')): print i # for tag in tagger.tag(text.lower()): # print tag, text[tag['start']:tag['end']] old_tags = set() lexebi_tags = set() tags = self.tagger.tag(text.lower()) for tag in tags: matched_text = text[tag['start']:tag['end']] print tag, matched_text if tag['reference_db'] == 'LEXEBI': lexebi_tags.add(matched_text) else: old_tags.add(matched_text) new_tags = lexebi_tags.difference(old_tags) print 'New tags identified : {}'.format(new_tags)
def init_models(self):
    """Initialise the NLP pipeline: spaCy language model, bio-entity
    tagger, and the document analyzers built on top of them.

    Degrades gracefully on failure: ``self.analyzers`` is left empty and
    the error is logged (with traceback) instead of being propagated.
    The ``steps_done`` trail is logged at the end so partial progress is
    visible when initialisation fails midway.
    """
    steps_done = []
    try:
        # NOTE(review): nltk/textblob corpora downloads (brown, punkt,
        # wordnet, averaged_perceptron_tagger) used to run here; re-enable
        # if the deployment environment does not pre-install them.
        steps_done.append('STARTING NLPAnalysis')
        self.nlp = NLPAnalysis._init_spacy_english_language()
        steps_done.append('STARTING TAGGER')
        self._tagger = BioEntityTagger(partial_match=False)
        self.analyzers = [
            DocumentAnalysisSpacy(self.nlp, tagger=self._tagger),
            NounChuncker()
        ]
        steps_done.append('NLP MODEL INITIALIZED')
    except Exception:
        # 'except Exception' (not bare 'except:') so KeyboardInterrupt and
        # SystemExit still propagate; logging.exception keeps the traceback.
        logging.exception('NLP MODEL INIT FAILED MISERABLY')
        steps_done.append('NLP MODEL INIT FAILED MISERABLY')
        self.analyzers = []
    logging.info(steps_done)
def setUpClass(cls):
    """One-time fixture setup shared by every test in the class: the
    spaCy English pipeline and a non-partial-match bio-entity tagger."""
    # The two components are independent; build the tagger first.
    cls.tagger = BioEntityTagger(partial_match=False)
    cls.nlp = init_spacy_english_language()
def init_tagger(self):
    """Attach a default-configured BioEntityTagger to this instance."""
    tagger = BioEntityTagger()
    self.tagger = tagger
def setUp(self):
    """Build a fresh default BioEntityTagger before each test runs."""
    fresh_tagger = BioEntityTagger()
    self.tagger = fresh_tagger