class TestHtmlRequirementProcessor(TestCase):
    # Specs for HtmlRequirementProcessor, written with the should-dsl
    # ``actual |should| equal_to(expected)`` infix syntax.  Every spec works
    # on the fixture values exposed by ``Variables`` (``self.v``) and the
    # processor built in ``setUp`` (``self.hrp``).
    #
    # NOTE: ``equal_to(...)`` is deliberately always called inline, to the
    # right of ``|should|``, exactly as in the infix idiom.

    def setUp(self):
        # Fresh fixtures and a fresh processor for every spec.
        self.v = Variables()
        self.hrp = HtmlRequirementProcessor(self.v.url, self.v.n_cat)

    def it_receives_a_url(self):
        # The processor exposes the URL it was constructed with.
        stored_url = self.hrp.html_file
        stored_url |should| equal_to(self.v.url)

    def it_opens_and_cleans_html_file(self):
        actual = self.hrp.open_html_file(self.v.url)
        expected = self.v.raw_text_and_space_spec
        actual |should| equal_to(expected)

    def it_splits_html_character_entities(self):
        actual = self.hrp.split_html_entities(self.v.raw_text_and_space_spec)
        expected = self.v.raw_text_free_from_html_entity_spec
        actual |should| equal_to(expected)

    def it_lists_raw_html_file(self):
        # The call is compared against a (text list, stopword list) pair.
        actual = self.hrp.list_raw_html_file(
            self.v.raw_text_free_from_html_entity_spec,
            self.v.concise_stopwords,
        )
        expected = (
            self.v.text_no_punct_list_spec,
            self.v.concise_stopwords_list,
        )
        actual |should| equal_to(expected)

    def it_splits_stopwords_from_html(self):
        actual = self.hrp.split_stopwords(
            self.v.text_no_punct_list_spec,
            self.v.concise_stopwords_list,
        )
        expected = self.v.text_alpha_no_punct_stopword_list_spec
        actual |should| equal_to(expected)

    def it_lists_lemmatized_verb_noun_and_adjective(self):
        actual = self.hrp.lemmatize_text_as_list(
            self.v.text_alpha_no_punct_stopword_list_spec
        )
        expected = self.v.lemmatized_html_list
        actual |should| equal_to(expected)

    def it_creates_temporary_directory(self):
        # Element 1 of ``create_temp_directory`` is the flag checked here.
        directory_flag = self.hrp.create_temp_directory[1]
        directory_flag |should| equal_to(True)

    def it_creates_wordtypes_from_html_lemmatized_list_of_words(self):
        actual = self.hrp.list_wordtypes_from_html_list(
            self.v.lemmatized_html_list,
            self.v.n_cat,
        )
        expected = self.v.wordtypes_html_list
        actual |should| equal_to(expected)

    def it_creates_temporary_files_named_by_wordtypes_inside_temporary_directory(self):
        # Only the second element (the flag) of the result is checked.
        result = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list,
            self.v.temporary_directory,
        )
        file_flag = result[1]
        file_flag |should| equal_to(False)

    def it_assigns_temporary_file_content_by_wordtype_to_a_list(self):
        result = self.hrp.assign_temp_files_html_content(
            self.v.wordtypes_html_list,
            self.v.lemmatized_html_list,
        )
        content_flag = result[1]
        content_flag |should| equal_to(True)

    def it_assigns_temporary_content_to_temporary_files(self):
        # First create the temporary files, then fill them with content.
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list,
            self.v.temporary_directory,
        )[0]
        result = self.hrp.assign_temp_content_to_temp_files(
            self.v.categories_html_content,
            temp_files,
        )
        written_flag = result[1]
        written_flag |should| equal_to(True)

    def it_creates_categorized_plaintextcorpusreader(self):
        categories_directory = self.hrp.create_temp_directory[0]
        result = self.hrp.create_categorized_corpus(categories_directory)
        corpus_flag = result[1]
        corpus_flag |should| equal_to(True)

    def it_deletes_all_wordtypes_temporary_files(self):
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list,
            self.v.temporary_directory,
        )[0]
        deleted = self.hrp.delete_temporary_files(temp_files)
        deleted |should| equal_to(True)

    def it_checks_temporary_directory_was_removed_from_tmp_folder(self):
        # NOTE(review): ``create_temp_directory`` is read twice on purpose to
        # keep the existing behaviour.  If each read creates a new directory,
        # the folder id below may not belong to ``categories_directory`` --
        # confirm against HtmlRequirementProcessor.
        categories_directory = self.hrp.create_temp_directory[0]
        tmp_folderid = self.hrp.create_temp_directory[3]
        removed = self.hrp.remove_categories_directory(
            categories_directory,
            self.v.temporary_directory,
            tmp_folderid,
        )
        # Currently pinned to False; an earlier note on this line read
        # "True" -- confirm which outcome is actually intended.
        removed |should| equal_to(False)
def setUp(self):
    """Build the fixture values and the processor under test."""
    fixtures = Variables()
    self.v = fixtures
    # The processor is constructed from the fixture URL and category count.
    self.hrp = HtmlRequirementProcessor(fixtures.url, fixtures.n_cat)
def html_analyzer(html_file, number_of_cat):
    """Run the complete HtmlRequirementProcessor pipeline on *html_file*.

    The steps, in order:

    1. open the HTML file, split HTML entities, list the text together with
       the module-level ``STOPWORDS``, split out the stopwords and lemmatize;
    2. create a temporary directory and one temporary file per word type,
       then assign the lemmatized content to those files;
    3. build the categorized corpus reader, tabulate it and plot the results;
    4. delete the temporary files and remove the temporary directory.

    Step 4 runs inside ``finally`` clauses, so the temporary files and
    directory are cleaned up even when step 2 or 3 raises.  The exception
    itself still propagates to the caller.

    Several processor calls return a ``(value, flag)`` pair; the flags are
    not needed here and are discarded.

    :param html_file: passed to ``HtmlRequirementProcessor`` and to its
        ``open_html_file`` method.
    :param number_of_cat: number of categories, forwarded to the processor,
        the word-type listing, the tabulation and the plot.
    :returns: ``None``; the function is used for its side effects.
    """
    hrp = HtmlRequirementProcessor(html_file, number_of_cat)

    # --- 1. text preparation -------------------------------------------
    raw_text_and_space = hrp.open_html_file(html_file)
    raw_text_free_from_html_entity = hrp.split_html_entities(raw_text_and_space)
    text_no_punct_list, stopwords_list = hrp.list_raw_html_file(
        raw_text_free_from_html_entity, STOPWORDS)
    text_alpha_no_punct_stopword_list = hrp.split_stopwords(
        text_no_punct_list, stopwords_list)
    lemmatized_list_by_verb_noun_adj_adv = hrp.lemmatize_text_as_list(
        text_alpha_no_punct_stopword_list)

    # --- 2. temporary directory and per-word-type files ----------------
    # ``create_temp_directory`` is subscripted/unpacked, not called; it
    # yields four values, of which the second (a status flag) is unused.
    categories_directory, _dir_flag, tmp_root, tmp_folderid = \
        hrp.create_temp_directory
    try:
        wordtype_categories = hrp.list_wordtypes_from_html_list(
            lemmatized_list_by_verb_noun_adj_adv, number_of_cat)
        category_tmp_file_list, _files_flag = \
            hrp.create_temp_files_named_by_wordtypes(
                wordtype_categories, categories_directory)
        try:
            category_tmp_file_content, _content_flag = \
                hrp.assign_temp_files_html_content(
                    wordtype_categories, lemmatized_list_by_verb_noun_adj_adv)
            category_tmp_file_list, _written_flag = \
                hrp.assign_temp_content_to_temp_files(
                    category_tmp_file_content, category_tmp_file_list)

            # --- 3. corpus, table and plot ------------------------------
            reader, _corpus_flag = hrp.create_categorized_corpus(
                categories_directory)
            hrp.tabulate_categorized_words(reader, number_of_cat)
            hrp.plot_html_results(
                lemmatized_list_by_verb_noun_adj_adv, number_of_cat)
        finally:
            # --- 4a. always delete the temporary files ------------------
            # Uses whichever file list is current: the freshly created one
            # if content assignment failed, the updated one otherwise.
            hrp.delete_temporary_files(category_tmp_file_list)
    finally:
        # --- 4b. always remove the temporary directory ------------------
        hrp.remove_categories_directory(
            categories_directory, tmp_root, tmp_folderid)