def test_extract_(self):
    """Extract features from two corenlp-annotated articles into a single
    accumulator and write the merged result to disc at
    'test/merged-extracted'.

    NOTE(review): the trailing underscore in the name looks accidental —
    confirm it is not shadowing/being shadowed by another test.
    """
    feature_accumulator = extract_features.make_feature_accumulator()
    # Accumulate features from both fixture articles.  Use context
    # managers so each corenlp file handle is closed promptly instead of
    # being leaked (the original used bare open(...).read()).
    for corenlp_path in ('test/corenlp1.xml', 'test/corenlp2.xml'):
        with open(corenlp_path) as corenlp_file:
            article = corenlp_xml_reader.AnnotatedText(corenlp_file.read())
        feature_accumulator.extract(article)
    feature_accumulator.write('test/merged-extracted')
def corenlp(self):
    """Return the CoreNLP annotation for this article as an
    ``AnnotatedText`` parsed from the article's corenlp XML file.

    Raises:
        ArticleError: if this is an annotator-training or
            parc3-replication file, for which no corenlp file exists.
    """
    if self.is_parc_or_annotator_training():
        raise ArticleError(
            'Sorry, no corenlp file available for annotator-training files '
            'and parc3-replication files.\n\nTry this instead:\n\n'
            '>>> polnear.data.train()[0].corenlp()')
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via bare open(...).read()).
    with open(self.path('corenlp')) as corenlp_file:
        return corenlp_xml_reader.AnnotatedText(corenlp_file.read())
def test_extract(self):
    """Extract features from a single corenlp-annotated article and check
    them against the expected features stored on disc at
    'test/extracted1'.
    """
    # Close the fixture file promptly rather than leaking the handle.
    with open('test/corenlp1.xml') as corenlp_file:
        article = corenlp_xml_reader.AnnotatedText(corenlp_file.read())
    feature_accumulator = extract_features.make_feature_accumulator()
    feature_accumulator.extract(article)
    # Test that the features extracted are the ones expected
    self.assert_feature_like_on_disc(feature_accumulator, 'test/extracted1')
def test_merge_load(self):
    """Extract features from one article, merge in previously-extracted
    features loaded from 'test/extracted2', and check that the merged
    result matches the expected features at 'test/merged-extracted'.
    """
    # Close the fixture file promptly rather than leaking the handle.
    with open('test/corenlp1.xml') as corenlp_file:
        article = corenlp_xml_reader.AnnotatedText(corenlp_file.read())
    feature_accumulator = extract_features.make_feature_accumulator()
    feature_accumulator.extract(article)
    feature_accumulator.merge_load('test/extracted2')
    self.assert_feature_like_on_disc(
        feature_accumulator, 'test/merged-extracted')
def test_normalized(self):
    """Extract features from one article, compute the normalized
    'dependency' features for every token in the accumulator's
    dictionary, and dump them as JSON to 'test/normalized1.json'.
    """
    feature_accumulator = extract_features.make_feature_accumulator()
    # Close the fixture file promptly rather than leaking the handle.
    with open('test/corenlp1.xml') as corenlp_file:
        article = corenlp_xml_reader.AnnotatedText(corenlp_file.read())
    feature_accumulator.extract(article)
    normalized_features = {
        token: feature_accumulator.get_features(token, ['dependency'])
        for token in feature_accumulator.dictionary.get_token_list()
    }
    # Write through a context manager: the original relied on refcount
    # GC to close (and thus flush) the output file, which is not
    # guaranteed on all Python implementations.
    with open('test/normalized1.json', 'w') as out_file:
        out_file.write(json.dumps(normalized_features))
def corenlp(self):
    """Return the CoreNLP annotation for this article as an
    ``AnnotatedText`` parsed from the article's corenlp XML file.

    Raises:
        ArticleError: if this is an annotator-training or
            parc3-replication file, for which no corenlp file exists.
    """
    if self.is_parc_or_annotator_training():
        raise ArticleError('No corenlp file available.')
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via bare open(...).read()).
    with open(self.path('corenlp')) as corenlp_file:
        return corenlp_xml_reader.AnnotatedText(corenlp_file.read())