class BambaraTagging(object):

    def __init__(self, root, file_list, option_tone, option_tag):
        # root is only the path below "C:/Users/<username>/nltk_data/corpora",
        # e.g. 'cookbook\\bambara' (instead of C:/Users/<username>/nltk_data/corpora/cookbook/bambara)
        self.root = root
        self.file_list = file_list
        self.option_tone = option_tone
        self.option_tag = option_tag
        self.reader = None
        self.anzahl_sents = 0                 # number of tagged sentences in the corpus
        self.train_sents = []
        self.test_sents = []                  # = development set
        self.unigramtagger = None
        self.bigramtagger = None
        self.trigramtagger = None
        self.contingenzliste = ["Getagged : Ursprüngliches Tag" + "\n"]  # header line: "Tagged : Original tag"
        self.reference_tags = []
        self.test_tags = []
        self.evaluate = 0
        self.evaluate_final = 0
        self.user = getpass.getuser()
        self.testset = []                     # real (held-out) test set

    def copy_files(self):
        """
        Copies the corpus files (self.file_list) to
        C:/Users/<username>/nltk_data/corpora/cookbook/bambara for further use.
        If this directory does not exist yet, it is created as well.
        """
        corpus_dir = "C:\\Users\\" + self.user + "\\nltk_data\\corpora\\cookbook\\bambara"
        print("Checking corpus directory...")
        if not os.path.exists(corpus_dir):
            print("Creating corpus directories...")
            # makedirs creates all missing parent directories in one step
            os.makedirs(corpus_dir)
            os.mkdir(corpus_dir + "\\Corpus")
        else:
            print("nltk_data/corpora/cookbook/bambara folder exists")
        print("Checking corpus files...")
        for file in self.file_list:
            if not os.path.exists(file):
                print(file, "This corpus file does not exist")
                raise IOError('File does not exist:', file)
            else:
                if not os.path.exists(corpus_dir + "\\" + file):
                    print("Corpus file ", file, "does not exist yet.")
                    print("Copying file to nltk_data/corpora/cookbook/bambara. Please wait. This may take a while.")
                    # read the desired corpus file (given in self.file_list) from the working folder
                    f = codecs.open(file, "r+", "utf-8")
                    # and copy it into the user's nltk_data directory
                    g = codecs.open(corpus_dir + "\\" + file, "w", "utf-8")
                    g.writelines(f.readlines())
                    g.close()
                    f.close()
                    print("Copied file: ", file)
                else:
                    print("Corpus file exists")

    def create_reader(self):
        """Reads the corpus files with the XMLCorpusReader. See book for further explanations."""
        # the XMLCorpusReader used here takes the tone and tag options in addition to root and file list
        self.reader = XMLCorpusReader("C:\\Users\\" + self.user + "\\nltk_data\\corpora\\cookbook\\bambara\\",
                                      self.file_list, self.option_tone, self.option_tag)
        self.reader.all_tagging_sents()
        self.reader.all_sents()
        self.anzahl_sents = len(self.reader.tagged_sents)
        return self.reader, self.anzahl_sents

    def sets8_1_1(self, split):
        """
        Creates the training, development and test sets. The development set
        (self.test_sents) serves to optimize the trained tagger; the held-out
        test set (self.testset) serves to check the optimized tagger afterwards.
        Out of every 10 sentences, the 1st-4th and 6th-9th go to the training
        set, the 5th goes to the development set and the 10th to the test set.
        The partition of the corpus sentences is done in steps: <split> gives
        the size of the bundles of sentences that are treated together
        (a multiple of 10).
        """
        n = self.anzahl_sents // split  # number of bundles consisting of <split> sentences
        saetze = self.reader.sents[:(n * split)]
        saetzetagged = self.reader.tagged_sents[:(n * split)]
        for i in range(n):
            s_split = saetze[:split]
            s_tag_split = saetzetagged[:split]
            for j in range(0, split, 10):
                # tagged sentences to train the tagger (indices j..j+3 and j+5..j+8)
                self.train_sents += s_tag_split[j:j + 4]
                self.train_sents += s_tag_split[j + 5:j + 9]
                # tagged sentence to test and optimize the tagger (development set)
                self.test_sents.append(s_tag_split[j + 4])
                # tagged sentence to check the optimized tagger (held-out test set)
                self.testset.append(s_tag_split[j + 9])
            saetze = saetze[split:]
            saetzetagged = saetzetagged[split:]

    def sets9_1(self, split):
        """
        Creates the training and test sets. Out of every 10 sentences, the
        1st-9th go to the training set and the 10th goes to the test set
        (self.test_sents), which serves to test and optimize the trained
        tagger. The partition of the corpus sentences is done in steps:
        <split> gives the size of the bundles of sentences that are treated
        together (a multiple of 10).
        """
        n = self.anzahl_sents // split  # number of bundles consisting of <split> sentences
        saetze = self.reader.sents[:(n * split)]
        saetzetagged = self.reader.tagged_sents[:(n * split)]
        for i in range(n):
            s_split = saetze[:split]
            s_tag_split = saetzetagged[:split]
            for j in range(0, split, 10):
                # tagged sentences to train the tagger
                self.train_sents += s_tag_split[j:(j + 9)]
                # tagged sentence to test and optimize the tagger
                self.test_sents.append(s_tag_split[j + 9])
            saetze = saetze[split:]
            saetzetagged = saetzetagged[split:]

    def calculate_contingenz_with_sets(self, tagger):
        """
        Compares the original tags of the test sentences with the tags
        assigned by the given tagger and collects all mismatches.
        """
        tagger_tagged = tagger.tag_sents([untag(i) for i in self.test_sents])
        tagger_words = sum(tagger_tagged, [])    # flatten to a single list of (word, tag) pairs
        original_tagged = self.test_sents
        original_words = sum(original_tagged, [])
        # pairs of (original tag, tagger tag), aligned token by token
        tagged_org_zip = zip([i[1] for i in original_words], [i[1] for i in tagger_words])
        contingenzliste = []
        orig_tags = []
        tag_tags = []
        for i in tagged_org_zip:
            if i[0] != i[1]:
                if i[1] is None:
                    i = (i[0], "None")
                contingenzliste.append(i[1] + " : " + i[0] + "\n")
                orig_tags.append(i[0])
                tag_tags.append(i[1])
        self.contingenzliste = self.contingenzliste + contingenzliste
        self.reference_tags = self.reference_tags + orig_tags
        self.test_tags = self.test_tags + tag_tags

    def matrix(self):
        """Creates a contingency matrix using NLTK's ConfusionMatrix."""
        cm = ConfusionMatrix(self.reference_tags, self.test_tags)  # first reference, then test!
        #f = codecs.open("C:\\Users\\"+self.user+"\\Downloads\\continenzmatrix.txt", "w", "utf-8")
        f = codecs.open("Results\\contingenzmatrix.txt", "w", "utf-8")
        f.write(cm.pp())
        f.close()
        ### print contingenzliste
        #g = codecs.open("C:\\Users\\"+self.user+"\\Downloads\\contingenzliste.txt", "w", "utf-8")
        #g.writelines(self.contingenzliste)
        #g.close()
        values_not_null = cm.get_values_not_null()
        return values_not_null