Exemplo n.º 1
0
    def analyze_text(self, book_folder, out_folder):
        """Run the person-name pipeline on ``self.input_file`` and save results.

        A ``Novel`` is built from ``book_folder + self.input_file`` and its
        processing steps are invoked in a fixed order; intermediate and final
        results are written under ``out_folder + <file stem> + "/"``. The
        processed novel is kept on ``self.novel``.

        Args:
            book_folder: Prefix joined to ``self.input_file`` by plain string
                concatenation, so it must already end with a path separator.
            out_folder: Prefix of the output location, also joined by plain
                concatenation (include the trailing separator).

        Returns:
            None.
        """
        # The stem is everything before the FIRST dot of the input file name.
        # NOTE(review): a name such as "my.book.txt" yields "my" -- confirm
        # that this is intended before switching to os.path.splitext.
        filename = self.input_file.split('.')[0]
        result_book_folder = out_folder + filename + "/"
        # exist_ok=True replaces the former "check, then create" sequence,
        # which could raise if the folder appeared between the two calls.
        os.makedirs(os.path.dirname(result_book_folder), exist_ok=True)

        novel = Novel(book_folder + self.input_file)
        novel.read()
        novel.parse_persons()
        # Snapshot of every parsed person before any filtering.
        novel.store(filename=result_book_folder + self.all_names,
                    data=novel.persons)
        # if you do not remove single occurrences, eps behaviour will be unstable
        occurrence_limit = 2
        novel.remove_less_than(occurrences=occurrence_limit)
        # Snapshot of the persons left after the occurrence filter.
        novel.store(filename=result_book_folder + filename +
                    "_names_more_than_" + str(occurrence_limit) + ".csv",
                    data=novel.persons)
        novel.cluster_aliases()
        novel.associate_single_names()
        novel.store(filename=result_book_folder + self.clusters,
                    data=novel.cluster_repetitions)
        novel.dealiases()
        novel.store(filename=result_book_folder + self.output_file,
                    data=novel.dealiased_text,
                    type='txt')
        # Keep the processed novel available to later callers.
        self.novel = novel
Exemplo n.º 2
0
    def analyze_text(self, book_folder, out_folder):
        """Run the person-name pipeline, including coreference, and save results.

        A ``Novel`` is built from ``book_folder + self.input_file`` and its
        processing steps are invoked in a fixed order; intermediate and final
        results are written under ``out_folder + <file stem> + "/"``. The
        processed novel is kept on ``self.novel``.

        Args:
            book_folder: Prefix joined to ``self.input_file`` by plain string
                concatenation, so it must already end with a path separator.
            out_folder: Prefix of the output location, also joined by plain
                concatenation (include the trailing separator).

        Returns:
            ``novel.cluster_repetitions_df`` as produced by
            ``create_cluster_repetitions_df()`` (presumably a pandas
            DataFrame, since ``to_pickle`` is called on it -- TODO confirm).
        """
        # The stem is everything before the FIRST dot of the input file name.
        # NOTE(review): a name such as "my.book.txt" yields "my" -- confirm
        # that this is intended before switching to os.path.splitext.
        filename = self.input_file.split('.')[0]
        result_book_folder = out_folder + filename + "/"
        # exist_ok=True replaces the former "check, then create" sequence,
        # which could raise if the folder appeared between the two calls.
        os.makedirs(os.path.dirname(result_book_folder), exist_ok=True)

        novel = Novel(book_folder + self.input_file)
        novel.read()
        novel.parse_persons()
        novel.find_persons_title()
        # Snapshot of every parsed person before any filtering.
        novel.store(filename=result_book_folder + self.all_names,
                    data=novel.persons)
        # if you do not remove single occurrences, eps behaviour will be unstable
        occurrence_limit = 2
        novel.remove_less_than(occurrences=occurrence_limit)
        # Snapshot of the persons left after the occurrence filter.
        novel.store(filename=result_book_folder + filename +
                    "_names_more_than_" + str(occurrence_limit) + ".csv",
                    data=novel.persons)
        novel.cluster_aliases()
        novel.associate_simple_single_names()
        novel.associate_single_names()
        novel.store(filename=result_book_folder + self.clusters,
                    data=novel.cluster_repetitions)
        novel.create_cluster_repetitions_df()
        novel.cluster_repetitions_df.to_pickle(result_book_folder + filename +
                                               '.pkl')
        novel.dealiases()
        # Dealiased text as it stands before coreference is applied.
        novel.store(filename=result_book_folder + filename + "_dealiased.txt",
                    data=novel.dealiased_text,
                    type='txt')
        # Run coreference only after dealiasing: coreference sometimes writes
        # a name right after a separator, which would otherwise glue names
        # together (e.g. "Potter,Hermione").
        novel.coreference()
        # NOTE(review): this stores novel.dealiased_text again; it differs
        # from the "_dealiased.txt" file only if coreference() updates that
        # attribute -- confirm against Novel.coreference.
        novel.store(filename=result_book_folder + self.output_file,
                    data=novel.dealiased_text,
                    type='txt')
        # Keep the processed novel available to later callers.
        self.novel = novel
        return novel.cluster_repetitions_df