def __init__(self, ontology):
    self.en_core_web_lg = holmes.Manager('en_core_web_lg')
    self.en_core_web_lg_ontology = holmes.Manager(
        model='en_core_web_lg', ontology=ontology)
    self.de_core_news_sm = holmes.Manager('de_core_news_sm')
    self.en_coref_lg = holmes.Manager('en_coref_lg')
    self.en_coref_lg_ontology = holmes.Manager(model='en_coref_lg', ontology=ontology)
def test_deserialized_document_registration_multithreaded(self):

    def add_document(counter):
        m.deserialize_and_register_documents(
            {' '.join(('Irrelevant', str(counter))): irrelevant_doc})

    normal_m = holmes.Manager('en_core_web_sm', perform_coreference_resolution=False)
    normal_m.parse_and_register_document("People discuss irrelevancies", 'irrelevant')
    irrelevant_doc = normal_m.serialize_document('irrelevant')
    m = holmes.MultiprocessingManager('en_core_web_sm', number_of_workers=4,
        perform_coreference_resolution=False)
    for i in range(NUMBER_OF_THREADS):
        t = Thread(target=add_document, args=(i,))
        t.start()
    last_number_of_matches = 0
    for counter in range(50):
        document_labels = m.document_labels()
        for label in document_labels:
            self.assertTrue(label.startswith("Irrelevant"))
        if len(document_labels) == NUMBER_OF_THREADS:
            break
        self.assertFalse(counter == 49)
        sleep(0.5)
def test_initial_question_word_embedding_match_threshold_out_of_range(self):
    with self.assertRaises(ValueError) as context:
        m = holmes.Manager('en_core_web_sm', number_of_workers=1)
        m.parse_and_register_document("a")
        coref_holmes_manager.topic_match_documents_against(
            "b", initial_question_word_embedding_match_threshold=-1.2)
def test_relation_threshold_too_low(self):
    with self.assertRaises(ValueError) as context:
        m = holmes.Manager('en_core_web_sm', number_of_workers=1)
        m.parse_and_register_document("a")
        coref_holmes_manager.topic_match_documents_against(
            "b", relation_matching_frequency_threshold=-0.75,
            embedding_matching_frequency_threshold=-0.5)
def test_embedding_threshold_less_than_relation_threshold(self):
    with self.assertRaises(
            EmbeddingThresholdLessThanRelationThresholdError) as context:
        m = holmes.Manager('en_core_web_sm', number_of_workers=1)
        m.parse_and_register_document("a")
        coref_holmes_manager.topic_match_documents_against(
            "b", relation_matching_frequency_threshold=0.75,
            embedding_matching_frequency_threshold=0.5)
def test_serialization_not_supported_on_serialization_multiprocessing(self):
    with self.assertRaises(SerializationNotSupportedError) as context:
        m_normal = holmes.Manager('en_core_web_sm', perform_coreference_resolution=False)
        m_normal.remove_all_documents()
        m_normal.parse_and_register_document("A", '')
        deserialized_doc = m_normal.serialize_document('')
        m = holmes.MultiprocessingManager('en_core_web_sm', number_of_workers=2)
        m.deserialize_and_register_documents({'A': deserialized_doc})
def test_embedding_threshold_higher_than_relation_threshold_normal_manager(self):
    with self.assertRaises(
            EmbeddingThresholdGreaterThanRelationThresholdError) as context:
        m = holmes.Manager('en_core_web_sm')
        m.parse_and_register_document("a")
        coref_holmes_manager.topic_match_documents_returning_dictionaries_against(
            "b", maximum_number_of_single_word_matches_for_relation_matching=1,
            maximum_number_of_single_word_matches_for_embedding_matching=2)
def __init__(
        self, model, overall_similarity_threshold=1.0,
        embedding_based_matching_on_root_words=False,
        analyze_derivational_morphology=True,
        perform_coreference_resolution=None, debug=False):
    self.manager = holmes.Manager(
        model=model, ontology=None,
        overall_similarity_threshold=overall_similarity_threshold,
        embedding_based_matching_on_root_words=embedding_based_matching_on_root_words,
        analyze_derivational_morphology=analyze_derivational_morphology,
        perform_coreference_resolution=perform_coreference_resolution,
        debug=debug)
def test_model_does_not_support_embeddings(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='en_core_web_sm', overall_similarity_threshold=0.85)
import unittest
import holmes_extractor as holmes
from holmes_extractor.extensive_matching import TopicMatcher
import os

script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')), symmetric_matching=True)
holmes_manager_coref = holmes.Manager(model='en_core_web_lg', ontology=ontology,
    overall_similarity_threshold=0.65, perform_coreference_resolution=True)
holmes_manager_coref_embedding_on_root = holmes.Manager(
    model='en_core_web_lg', ontology=ontology, overall_similarity_threshold=0.65,
    embedding_based_matching_on_root_words=True)
holmes_manager_coref_no_embeddings = holmes.Manager(
    model='en_core_web_lg', ontology=ontology, overall_similarity_threshold=1,
    perform_coreference_resolution=True)

class EnglishTopicMatchingTest(unittest.TestCase):

    def _check_equals(self, text_to_match, document_text, highest_score, manager):
        manager.remove_all_documents()
        manager.parse_and_register_document(document_text)
        topic_matches = manager.topic_match_documents_against(
def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.0,
            embedding_based_matching_on_root_words=True)
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

nocoref_holmes_manager = holmes.Manager('en_core_web_lg',
    analyze_derivational_morphology=False, perform_coreference_resolution=False)
coref_holmes_manager = holmes.Manager('en_core_web_lg',
    perform_coreference_resolution=True)
german_holmes_manager = holmes.Manager('de_core_news_md')

class ErrorsTest(unittest.TestCase):

    def test_overall_similarity_threshold_out_of_range(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.0,
                embedding_based_matching_on_root_words=True)

    def test_model_does_not_support_embeddings(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_sm', overall_similarity_threshold=0.85)
def test_overall_similarity_threshold_out_of_range(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.2)
def test_coreference_resolution_not_supported_error(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='de_core_news_md', perform_coreference_resolution=True)
import unittest
import holmes_extractor as holmes
import os
from holmes_extractor.tests.testing_utils import HolmesInstanceManager

script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
ontology_holmes_manager = HolmesInstanceManager(ontology).en_core_web_lg_ontology
symmetric_ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')), symmetric_matching=True)
symmetric_ontology_holmes_manager = holmes.Manager(model='en_core_web_lg',
    ontology=symmetric_ontology)
no_ontology_coref_holmes_manager = holmes.Manager(model='en_coref_lg')

class EnglishPhraseletProductionTest(unittest.TestCase):

    def _check_equals(self, manager, text_to_match, phraselet_labels,
            replace_with_hypernym_ancestors=True, match_all_words=False):
        manager.remove_all_search_phrases()
        doc = manager.semantic_analyzer.parse(text_to_match)
        manager.structural_matcher.register_phraselets(
            doc,
            replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
            match_all_words=match_all_words,
import urllib.request
from bs4 import BeautifulSoup
import holmes_extractor as holmes

def download_and_register(url, label):
    print('Downloading', label)
    # Download the content
    page = urllib.request.urlopen(url)
    # Extract the raw text from the HTML document
    soup = BeautifulSoup(page, 'html.parser')
    # Register the document with Holmes
    print('Parsing and registering', label)
    holmes_manager.parse_and_register_document(soup.get_text(), label)

# Start the Holmes Manager with the German model
if __name__ in ('__main__', 'example_search_DE_law'):
    holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=2)
    download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html',
        'VVG_2008')
    download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html',
        'VAG')
    holmes_manager.start_topic_matching_search_mode_console(
        initial_question_word_embedding_match_threshold=0.7)

# Example queries (English glosses in parentheses):
#
# Der Versicherer darf den Vertrag fristlos kündigen, wenn der Versicherungsnehmer beim
# Abschluss des Vertrags die vorvertragliche Anzeigepflicht verletzt hat.
# (The insurer may terminate the contract without notice if the policyholder breached the
# pre-contractual duty of disclosure when concluding the contract.)
# Der Versicherer darf Leistungen verweigern.
# (The insurer may refuse benefits.)
# Der Versicherer darf die Prämie anpassen.
# (The insurer may adjust the premium.)
# Eine Richtlinie einer ENTITYORG
# (A directive of an ENTITYORG)
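# A minimal sketch, not part of the original example: the documents registered above can
# also be queried programmatically rather than through the interactive console, using
# topic_match_documents_returning_dictionaries_against(). It assumes holmes_manager from
# the script above is still available; the query string is purely illustrative
# ('The insurer may terminate the contract').
if __name__ in ('__main__', 'example_search_DE_law'):
    topic_match_dicts = holmes_manager.topic_match_documents_returning_dictionaries_against(
        'Der Versicherer darf den Vertrag kündigen')
    for topic_match_dict in topic_match_dicts:
        # Each dictionary describes one ranked topic match within a registered document
        print(topic_match_dict['rank'], topic_match_dict['document_label'],
              topic_match_dict['text'])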
def test_unrecognized_initial_question_word_behaviour(self):
    with self.assertRaises(ValueError) as context:
        m = holmes.Manager('en_core_web_sm', number_of_workers=1)
        m.parse_and_register_document("a")
        coref_holmes_manager.topic_match_documents_against(
            "b", initial_question_word_behaviour='r')
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

nocoref_holmes_manager = holmes.Manager('en_core_web_lg',
    perform_coreference_resolution=False)
coref_holmes_manager = holmes.Manager('en_core_web_lg',
    perform_coreference_resolution=True)
german_holmes_manager = holmes.Manager('de_core_news_md')

class ErrorsTest(unittest.TestCase):

    def test_overall_similarity_threshold_out_of_range(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.0,
                embedding_based_matching_on_root_words=True)

    def test_model_does_not_support_embeddings(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_sm', overall_similarity_threshold=0.85)

    def test_language_not_supported(self):
        filename for filename in bbc_zipfile.namelist()
        if filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
    category, document_number = get_document_filename_info(filename)
    if is_training_data(document_number):
        with bbc_zipfile.open(filename, 'r') as training_doc:
            training_contents = str(training_doc.read())
        training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
        training_basis.parse_and_register_training_document(
            training_contents, category, filename)
training_basis.prepare()
classifier = training_basis.train().classifier()
output_filename = os.sep.join((working_directory, 'model.json'))
with open(output_filename, "w") as file:
    file.write(classifier.serialize_model())
evaluate_classifier(zip_filename, classifier)

holmes_manager = holmes.Manager('en_core_web_lg', number_of_workers=1)
if os.path.exists(working_directory):
    if not os.path.isdir(working_directory):
        raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
    os.mkdir(working_directory)
zip_filename = os.sep.join((working_directory, 'bbc-fulltext.zip'))
if not os.path.exists(zip_filename):
    url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
    with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
model_filename = os.sep.join((working_directory, 'model.json'))
if not os.path.exists(model_filename):
    train_model(working_directory, zip_filename)
else:
import unittest
import holmes_extractor as holmes
import os

script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
holmes_manager = holmes.Manager('de_core_news_md', ontology=ontology)

class GermanPhraseletProductionTest(unittest.TestCase):

    def _check_equals(self, text_to_match, phraselet_labels, match_all_words=False,
            include_reverse_only=False, replace_with_hypernym_ancestors=False):
        doc = holmes_manager.semantic_analyzer.parse(text_to_match)
        phraselet_labels_to_phraselet_infos = {}
        holmes_manager.structural_matcher.add_phraselets_to_dict(
            doc,
            phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
            replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
            match_all_words=match_all_words,
            ignore_relation_phraselets=False,
            include_reverse_only=include_reverse_only,
            stop_lemmas=holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
            reverse_only_parent_lemmas=holmes_manager.semantic_analyzer.
                topic_matching_reverse_only_parent_lemmas)
import unittest
import holmes_extractor as holmes
from holmes_extractor.errors import *
import jsonpickle

nocoref_holmes_manager = holmes.Manager('en_core_web_trf',
    analyze_derivational_morphology=False, perform_coreference_resolution=False,
    number_of_workers=2)
coref_holmes_manager = holmes.Manager('en_core_web_trf',
    perform_coreference_resolution=True, number_of_workers=1)
german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=1)

class ErrorsTest(unittest.TestCase):

    def test_overall_similarity_threshold_out_of_range(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.2)

    def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.0,
                embedding_based_matching_on_root_words=True)

    def test_number_of_workers_out_of_range(self):
        with self.assertRaises(ValueError) as context:
            holmes.Manager(model='en_core_web_sm', number_of_workers=0)
        filename for filename in bbc_zipfile.namelist()
        if filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
    category, document_number = get_document_filename_info(filename)
    if is_training_data(document_number):
        with bbc_zipfile.open(filename, 'r') as training_doc:
            training_contents = str(training_doc.read())
        training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
        training_basis.parse_and_register_training_document(
            training_contents, category, filename)
training_basis.prepare()
classifier = training_basis.train().classifier()
output_filename = os.sep.join((working_directory, 'model.json'))
with open(output_filename, "w") as file:
    file.write(classifier.serialize_model())
evaluate_classifier(zip_filename, classifier)

holmes_manager = holmes.Manager('en_core_web_lg')
if os.path.exists(working_directory):
    if not os.path.isdir(working_directory):
        raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
    os.mkdir(working_directory)
zip_filename = os.sep.join((working_directory, 'bbc-fulltext.zip'))
if not os.path.exists(zip_filename):
    url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
    with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
model_filename = os.sep.join((working_directory, 'model.json'))
if not os.path.exists(model_filename):
    train_model(working_directory, zip_filename)
else:
import unittest
import holmes_extractor as holmes

holmes_manager = holmes.Manager(model='de_core_news_sm')
holmes_manager.register_search_phrase("Ein Hund jagt eine Katze")
holmes_manager.register_search_phrase("Ein Hund jagt einen Bären")
holmes_manager.register_search_phrase("Ein Hund frisst einen Knochen")
holmes_manager.register_search_phrase("Ein Mann ist schlau")
holmes_manager.register_search_phrase("Der reiche Mann")
holmes_manager.register_search_phrase("Jemand hat einen Berg gesehen")
holmes_manager.register_search_phrase("Ein Student geht aus", "excursion")
holmes_manager.register_search_phrase("Der Abschluss einer Versicherung")
holmes_manager.register_search_phrase("Die Kündigung von einer Versicherung")
holmes_manager.register_search_phrase("Jemand schließt eine Versicherung ab")
holmes_manager.register_search_phrase("Wer war traurig?")
holmes_manager.register_search_phrase("Das Fahrzeug hat einen Fehler")
holmes_manager.register_search_phrase("Jemand braucht eine Versicherung für fünf Jahre")
holmes_manager.register_search_phrase("Jemand braucht etwas für fünf Jahre")
holmes_manager.register_search_phrase("Jemand braucht für fünf Jahre")
holmes_manager_with_variable_search_phrases = holmes.Manager(model='de_core_news_sm')

class GermanStructuralMatchingTest(unittest.TestCase):

    def _get_matches(self, holmes_manager, text):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document(document_text=text)
        return holmes_manager.match()

    def test_direct_matching(self):
import unittest
import holmes_extractor as holmes
import os

script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
    (script_directory, 'test_ontology.owl')))
nocoref_holmes_manager = holmes.Manager(model='en_core_web_lg', ontology=ontology,
    perform_coreference_resolution=False)
nocoref_holmes_manager.register_search_phrase("A dog chases a cat")
nocoref_holmes_manager.register_search_phrase("The man was poor")
nocoref_holmes_manager.register_search_phrase("The rich man")
nocoref_holmes_manager.register_search_phrase("Someone eats a sandwich")
nocoref_holmes_manager.register_search_phrase("A colleague's computer")
nocoref_holmes_manager.register_search_phrase("An ENTITYPERSON opens an account")
nocoref_holmes_manager.register_search_phrase("A dog eats a bone")
nocoref_holmes_manager.register_search_phrase("Who fell asleep?")
nocoref_holmes_manager.register_search_phrase("Who is sad?")
nocoref_holmes_manager.register_search_phrase("Insurance for years")
nocoref_holmes_manager.register_search_phrase(
    "An employee needs insurance for the next five years")
nocoref_holmes_manager.register_search_phrase("Somebody gives a file to an employee")
nocoref_holmes_manager.register_search_phrase("Somebody gives a boss a file")
nocoref_holmes_manager.register_search_phrase("Serendipity")
nocoref_holmes_manager.register_search_phrase("Somebody eats at an office")
nocoref_holmes_manager.register_search_phrase("A holiday is hard to book")
nocoref_holmes_manager.register_search_phrase("A man sings")
nocoref_holmes_manager.register_search_phrase("Somebody finds insurance")
def test_language_not_supported(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='fr_core_news_sm')
import urllib.request
from bs4 import BeautifulSoup
import holmes_extractor as holmes

def download_and_register(url, label):
    print('Downloading', label)
    # Download the content
    page = urllib.request.urlopen(url)
    # Extract the raw text from the HTML document
    soup = BeautifulSoup(page, 'html.parser')
    # Register the document with Holmes
    print('Parsing and registering', label)
    holmes_manager.parse_and_register_document(soup.get_text(), label)

# Start the Holmes Manager with the German model
holmes_manager = holmes.Manager(model='de_core_news_sm')
download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html',
    'VVG_2008')
download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html',
    'VAG')
holmes_manager.start_search_mode_console()
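# A minimal sketch, not part of the original example: instead of starting the interactive
# console, a search phrase can be registered on the same manager and matched against the
# downloaded documents with match(). The search phrase is illustrative
# ('Someone terminates an insurance policy').
holmes_manager.register_search_phrase('Jemand kündigt eine Versicherung')
for match in holmes_manager.match():
    for word_match in match.word_matches:
        # Print the index of each matched document token and the word that was extracted
        print(word_match.document_token.i, word_match.extracted_word)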
    (script_directory, 'test_ontology.owl')))
coref_holmes_manager = HolmesInstanceManager(ontology).en_coref_lg_ontology
coref_holmes_manager.register_search_phrase("A dog chases a cat")
coref_holmes_manager.register_search_phrase("A big horse chases a cat")
coref_holmes_manager.register_search_phrase("A tiger chases a little cat")
coref_holmes_manager.register_search_phrase("A big lion chases a cat")
coref_holmes_manager.register_search_phrase("An ENTITYPERSON needs insurance")
coref_holmes_manager.register_search_phrase("University for four years")
coref_holmes_manager.register_search_phrase("A big company makes a loss")
coref_holmes_manager.register_search_phrase("A dog who chases rats chases mice")
coref_holmes_manager.register_search_phrase("A tired dog")
coref_holmes_manager.register_search_phrase("A panther chases a panther")
coref_holmes_manager.register_search_phrase("A leopard chases a leopard")
no_coref_holmes_manager = holmes.Manager(model='en_coref_lg', ontology=ontology,
    perform_coreference_resolution=False)
no_coref_holmes_manager.register_search_phrase("A dog chases a cat")
embeddings_coref_holmes_manager = holmes.Manager(
    model='en_coref_lg', overall_similarity_threshold=0.85)
embeddings_coref_holmes_manager.register_search_phrase('A man loves a woman')

class CoreferenceEnglishMatchingTest(unittest.TestCase):

    def _check_word_match(self, match, word_match_index, document_token_index,
            extracted_word):
        word_match = match.word_matches[word_match_index]
        self.assertEqual(word_match.document_token.i, document_token_index)
        self.assertEqual(word_match.extracted_word, extracted_word)

    def test_simple_pronoun_coreference_same_sentence(self):
def test_deserialized_documents(self):
    normal_manager = holmes.Manager('en_core_web_sm',
        perform_coreference_resolution=False)
    normal_manager.parse_and_register_document(
        "I saw a dog. It was chasing a cat", 'specific')
    normal_manager.parse_and_register_document("The dog chased the animal", 'exact')
    normal_manager.parse_and_register_document("The cat chased the dog",
        'specific-reversed')
    normal_manager.parse_and_register_document("The animal chased the dog",
        'exact-reversed')
    specific = normal_manager.serialize_document('specific')
    exact = normal_manager.serialize_document('exact')
    specific_reversed = normal_manager.serialize_document('specific-reversed')
    exact_reversed = normal_manager.serialize_document('exact-reversed')
    m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology,
        number_of_workers=2, verbose=False, perform_coreference_resolution=False)
    m.deserialize_and_register_documents({
        'specific': specific, 'exact': exact,
        'specific-reversed': specific_reversed, 'exact-reversed': exact_reversed})
    self.assertEqual(
        m.document_labels(),
        ['exact', 'exact-reversed', 'specific', 'specific-reversed'])
    self.assertEqual(
        m.topic_match_documents_returning_dictionaries_against(
            "A dog chases an animal"),
        [{
            'document_label': 'exact', 'text': 'The dog chased the animal',
            'text_to_match': 'A dog chases an animal', 'rank': '1',
            'sentences_character_start_index_in_document': 0,
            'sentences_character_end_index_in_document': 25,
            'score': 99.34666666666668,
            'word_infos': [
                [4, 7, 'overlapping_relation', False, "Matches DOG directly."],
                [8, 14, 'overlapping_relation', False, "Matches CHASE directly."],
                [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]
        }, {
            'document_label': 'exact-reversed', 'text': 'The animal chased the dog',
            'text_to_match': 'A dog chases an animal', 'rank': '2=',
            'sentences_character_start_index_in_document': 0,
            'sentences_character_end_index_in_document': 25,
            'score': 35.39866666666667,
            'word_infos': [
                [4, 10, 'single', False, "Matches ANIMAL directly."],
                [11, 17, 'relation', False, "Matches CHASE directly."],
                [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]
        }, {
            'document_label': 'specific-reversed', 'text': 'The cat chased the dog',
            'text_to_match': 'A dog chases an animal', 'rank': '2=',
            'sentences_character_start_index_in_document': 0,
            'sentences_character_end_index_in_document': 22,
            'score': 34.486666666666665,
            'word_infos': [
                [4, 7, 'single', False, "Is a child of ANIMAL in the ontology."],
                [8, 14, 'relation', False, "Matches CHASE directly."],
                [19, 22, 'relation', True, "Is a child of ANIMAL in the ontology."]]
        }, {
            'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat',
            'text_to_match': 'A dog chases an animal', 'rank': '2=',
            'sentences_character_start_index_in_document': 0,
            'sentences_character_end_index_in_document': 33,
            'score': 31.88346666666667,
            'word_infos': [
                [8, 11, 'single', False, "Matches DOG directly."],
                [20, 27, 'relation', False, "Is a synonym of CHASE in the ontology."],
                [30, 33, 'relation', True, "Is a child of ANIMAL in the ontology."]]
        }])
    m.close()
        with open(long_filename, "r") as f:
            contents = f.read()
        serialized_documents[label] = contents
    holmes_manager.deserialize_and_register_documents(serialized_documents)

if os.path.exists(working_directory):
    if not os.path.isdir(working_directory):
        raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
    os.mkdir(working_directory)
labels_to_documents = {}
if os.path.isfile(flag_filename):
    load_documents_from_working_directory(labels_to_documents)
else:
    normal_holmes_manager = holmes.Manager(model='de_core_news_md')
    process_documents_from_front_page(normal_holmes_manager, "https://maerchen.com/grimm/",
        'Gebrüder Grimm', labels_to_documents)
    process_documents_from_front_page(normal_holmes_manager, "https://maerchen.com/grimm2/",
        'Gebrüder Grimm', labels_to_documents)
    process_documents_from_front_page(normal_holmes_manager, "https://maerchen.com/andersen/",
        'Hans Christian Andersen', labels_to_documents)
    process_documents_from_front_page(normal_holmes_manager, "https://maerchen.com/bechstein/",
        'Ludwig Bechstein', labels_to_documents)
    process_documents_from_front_page(normal_holmes_manager, "https://maerchen.com/wolf/",
        'Johann Wilhelm Wolf', labels_to_documents)
    # Generate flag file to indicate files can be reloaded on next run
    open(flag_filename, 'a').close()
    load_documents_from_working_directory(labels_to_documents)
# Comment following line in to activate interactive console
def test_number_of_workers_out_of_range(self):
    with self.assertRaises(ValueError) as context:
        holmes.Manager(model='en_core_web_sm', number_of_workers=0)