Python HtmlRequirementProcessor 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nltkAnalyzer

hotexamples.com에서의 예제들: 3

Python HtmlRequirementProcessor 예제 3개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python nltkAnalyzer.HtmlRequirementProcessor의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

assign_temp_content_to_temp_files(2)

assign_temp_files_html_content(2)

create_categorized_corpus(2)

create_temp_files_named_by_wordtypes(2)

delete_temporary_files(2)

lemmatize_text_as_list(2)

list_raw_html_file(2)

list_wordtypes_from_html_list(2)

open_html_file(2)

remove_categories_directory(2)

split_html_entities(2)

split_stopwords(2)

plot_html_results(1)

예제 #1

파일 보기

파일: nltkAnalyzer_spec.py 프로젝트: EduardoCarvalho/nltkAnalyzer

class TestHtmlRequirementProcessor(TestCase):
    # Specs for HtmlRequirementProcessor, written with the `|should|` matcher
    # syntax. Every input and expected value is taken from the Variables
    # fixture object stored on self.v.

    def setUp(self):
        # Build fresh fixtures, then a processor from the fixture URL and
        # category count, before each spec runs.
        fixtures = Variables()
        self.v = fixtures
        self.hrp = HtmlRequirementProcessor(fixtures.url, fixtures.n_cat)

    def it_receives_a_url(self):
        # The processor exposes the URL it was built with as `html_file`.
        stored_source = self.hrp.html_file
        stored_source |should| equal_to(self.v.url)

    def it_opens_and_cleans_html_file(self):
        opened_text = self.hrp.open_html_file(self.v.url)
        opened_text |should| equal_to(self.v.raw_text_and_space_spec)

    def it_splits_html_character_entities(self):
        entity_free_text = self.hrp.split_html_entities(
            self.v.raw_text_and_space_spec)
        entity_free_text |should| equal_to(
            self.v.raw_text_free_from_html_entity_spec)

    def it_lists_raw_html_file(self):
        # list_raw_html_file is expected to return a 2-tuple:
        # (text list, stopword list).
        listed = self.hrp.list_raw_html_file(
            self.v.raw_text_free_from_html_entity_spec,
            self.v.concise_stopwords)
        listed |should| equal_to(
            (self.v.text_no_punct_list_spec, self.v.concise_stopwords_list))

    def it_splits_stopwords_from_html(self):
        filtered_words = self.hrp.split_stopwords(
            self.v.text_no_punct_list_spec,
            self.v.concise_stopwords_list)
        filtered_words |should| equal_to(
            self.v.text_alpha_no_punct_stopword_list_spec)

    def it_lists_lemmatized_verb_noun_and_adjective(self):
        lemmas = self.hrp.lemmatize_text_as_list(
            self.v.text_alpha_no_punct_stopword_list_spec)
        lemmas |should| equal_to(self.v.lemmatized_html_list)

    def it_creates_temporary_directory(self):
        # create_temp_directory is read as an attribute (no call
        # parentheses); element [1] of its value is compared to True.
        directory_flag = self.hrp.create_temp_directory[1]
        directory_flag |should| equal_to(True)

    def it_creates_wordtypes_from_html_lemmatized_list_of_words(self):
        wordtypes = self.hrp.list_wordtypes_from_html_list(
            self.v.lemmatized_html_list, self.v.n_cat)
        wordtypes |should| equal_to(self.v.wordtypes_html_list)

    def it_creates_temporary_files_named_by_wordtypes_inside_temporary_directory(self):
        # Element [1] of the result is expected to be False here.
        creation_result = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)
        creation_result[1] |should| equal_to(False)

    def it_assigns_temporary_file_content_by_wordtype_to_a_list(self):
        content_result = self.hrp.assign_temp_files_html_content(
            self.v.wordtypes_html_list, self.v.lemmatized_html_list)
        content_result[1] |should| equal_to(True)

    def it_assigns_temporary_content_to_temporary_files(self):
        # Element [0] of the file-creation result is what gets handed on
        # to assign_temp_content_to_temp_files.
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)[0]
        write_result = self.hrp.assign_temp_content_to_temp_files(
            self.v.categories_html_content, temp_files)
        write_result[1] |should| equal_to(True)

    def it_creates_categorized_plaintextcorpusreader(self):
        categories_directory = self.hrp.create_temp_directory[0]
        corpus_result = self.hrp.create_categorized_corpus(categories_directory)
        corpus_result[1] |should| equal_to(True)

    def it_deletes_all_wordtypes_temporary_files(self):
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)[0]
        deleted = self.hrp.delete_temporary_files(temp_files)
        deleted |should| equal_to(True)

    def it_checks_temporary_directory_was_removed_from_tmp_folder(self):
        # create_temp_directory is deliberately read twice, once per element,
        # exactly as before; the two reads are not merged.
        categories_directory = self.hrp.create_temp_directory[0]
        tmp_folderid = self.hrp.create_temp_directory[3]
        removed = self.hrp.remove_categories_directory(
            categories_directory, self.v.temporary_directory, tmp_folderid)
        # NOTE(review): this line used to carry a trailing "# True" remark;
        # presumably True was once the expected value -- confirm.
        removed |should| equal_to(False)

예제 #2

파일 보기

파일: nltkAnalyzer_spec.py 프로젝트: EduardoCarvalho/nltkAnalyzer

 def setUp(self):
     # Build fresh fixtures, then a processor from the fixture URL and
     # category count, before each spec runs.
     fixtures = Variables()
     self.v = fixtures
     self.hrp = HtmlRequirementProcessor(fixtures.url, fixtures.n_cat)

예제 #3

파일 보기

파일: runalyzer.py 프로젝트: EduardoCarvalho/nltkAnalyzer

def html_analyzer(html_file, number_of_cat):
    """Run every HtmlRequirementProcessor step on one HTML source.

    The steps are chained in this order: open_html_file,
    split_html_entities, list_raw_html_file (with STOPWORDS),
    split_stopwords, lemmatize_text_as_list, create_temp_directory,
    list_wordtypes_from_html_list, create_temp_files_named_by_wordtypes,
    assign_temp_files_html_content, assign_temp_content_to_temp_files,
    create_categorized_corpus, tabulate_categorized_words and
    plot_html_results. The temporary files and the temporary directory
    are removed afterwards, even when one of the steps raises.

    Args:
        html_file: HTML source passed both to the processor constructor
            and to open_html_file.
        number_of_cat: category count passed to the constructor, to
            list_wordtypes_from_html_list, to tabulate_categorized_words
            and to plot_html_results.

    Returns:
        None. The results are whatever tabulate_categorized_words and
        plot_html_results emit.
    """
    hrp = HtmlRequirementProcessor(html_file, number_of_cat)
    raw_text_and_space = hrp.open_html_file(html_file)
    raw_text_free_from_html_entity = hrp.split_html_entities(raw_text_and_space)
    text_no_punct_list, stopwords_list = hrp.list_raw_html_file(
        raw_text_free_from_html_entity, STOPWORDS)
    text_alpha_no_punct_stopword_list = hrp.split_stopwords(
        text_no_punct_list, stopwords_list)
    lemmatized_list_by_verb_noun_adj_adv = hrp.lemmatize_text_as_list(
        text_alpha_no_punct_stopword_list)

    # create_temp_directory is read as an attribute (no call parentheses)
    # and unpacks into four values; the second one is not needed here.
    (categories_directory, _directory_flag,
     tmp_root, tmp_folderid) = hrp.create_temp_directory

    # Stays None until the temporary files actually exist, so the cleanup
    # below knows whether there is anything to delete.
    category_tmp_file_list = None
    try:
        wordtype_categories = hrp.list_wordtypes_from_html_list(
            lemmatized_list_by_verb_noun_adj_adv, number_of_cat)
        category_tmp_file_list, _files_flag = \
            hrp.create_temp_files_named_by_wordtypes(wordtype_categories,
                                                     categories_directory)
        category_tmp_file_content, _content_flag = \
            hrp.assign_temp_files_html_content(
                wordtype_categories, lemmatized_list_by_verb_noun_adj_adv)
        category_tmp_file_list, _written_flag = \
            hrp.assign_temp_content_to_temp_files(category_tmp_file_content,
                                                  category_tmp_file_list)
        reader, _corpus_flag = hrp.create_categorized_corpus(
            categories_directory)
        hrp.tabulate_categorized_words(reader, number_of_cat)
        hrp.plot_html_results(lemmatized_list_by_verb_noun_adj_adv,
                              number_of_cat)
    finally:
        # Cleanup used to run only after a fully successful pass, so any
        # exception above left the temporary files and directory behind.
        if category_tmp_file_list is not None:
            hrp.delete_temporary_files(category_tmp_file_list)
        hrp.remove_categories_directory(categories_directory,
                                        tmp_root,
                                        tmp_folderid)