Example #1
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    
    stdout_old = sys.stdout
    
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out'%counter), 'w')  
    
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos'%counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    
    sys.stdout.flush()
    
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)
    
    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    
    sys.stdout = stdout_old
Example #2
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
Example #3
    def __init__(self, corpusroot, corpusname):
        # use a custom wordlist corpus via the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        # use a custom wordlist corpus via the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        reader = TaggedCorpusReader(corpusroot, corpusname)
   
        self.reader_train = reader.tagged_sents()
        self.test_sent = reader.tagged_sents()[1000:] 
Example #4
 def get_brill_tagger(self):
     train_data = TaggedCorpusReader('.',
                                     'tagged_input_sentences.txt',
                                     sep="/")
     traindata = list(train_data.tagged_sents())
     postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
     templates = [
         brill.Template(brill.Pos([-1])),
         brill.Template(brill.Pos([1])),
         brill.Template(brill.Pos([-2])),
         brill.Template(brill.Pos([2])),
         brill.Template(brill.Pos([-2, -1])),
         brill.Template(brill.Pos([1, 2])),
         brill.Template(brill.Pos([-3, -2, -1])),
         brill.Template(brill.Pos([1, 2, 3])),
         brill.Template(brill.Pos([-1]), brill.Pos([1])),
         brill.Template(brill.Word([-1])),
         brill.Template(brill.Word([1])),
         brill.Template(brill.Word([-2])),
         brill.Template(brill.Word([2])),
         brill.Template(brill.Word([-2, -1])),
         brill.Template(brill.Word([1, 2])),
         brill.Template(brill.Word([-3, -2, -1])),
         brill.Template(brill.Word([1, 2, 3])),
         brill.Template(brill.Word([-1]), brill.Word([1]))
     ]
     trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
     brill_tagger = trainer.train(traindata, max_rules=10)
     return brill_tagger
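
For reference, a minimal self-contained sketch of the same training pattern, assuming NLTK and its treebank sample are installed; the training slice, templates, and test sentence are illustrative, not the data used above.

# Sketch only: train a small Brill tagger over the NLTK treebank sample and
# use it like any other NLTK tagger.
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, brill
from nltk.tag.brill_trainer import BrillTaggerTrainer

train_sents = treebank.tagged_sents()[:500]   # illustrative training slice
base_tagger = UnigramTagger(train_sents)      # baseline corrected by Brill rules
templates = [brill.Template(brill.Pos([-1])),
             brill.Template(brill.Word([1]))]
trainer = BrillTaggerTrainer(base_tagger, templates=templates, trace=0)
brill_tagger = trainer.train(train_sents, max_rules=10)

# The trained tagger exposes the standard tag() method on a pre-tokenized sentence.
print(brill_tagger.tag(['The', 'quick', 'brown', 'fox', 'jumps']))
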
Example #5
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type,
        time.time() - now, path))
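
A hedged usage sketch: once make_pos_model() has written a pickle, the tagger can be loaded back and applied with NLTK's standard tag() call; the model choice and sample tokens below are illustrative.

import os
import pickle

# Load a tagger previously pickled by make_pos_model('unigram') and tag a
# pre-tokenized sentence (the tokens are illustrative).
model_path = os.path.expanduser('~/greek_models_cltk/taggers/pos/unigram.pickle')
with open(model_path, 'rb') as f:
    unigram_tagger = pickle.load(f)

print(unigram_tagger.tag(['λόγος', 'ἐστί']))
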
Example #6
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
Example #7
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
Example #8
    def __init__(self, corpusroot, corpusname):
        # use a custom wordlist corpus via the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        # use a custom wordlist corpus via the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        #nltk_old = [(3,0,1)]
        #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]

        reader = TaggedCorpusReader(corpusroot, corpusname)

        splitratio = 0.8
   
        self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents())*splitratio)]
        self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents())*splitratio):] 

        print "split test ratio: ", int(len(reader.tagged_sents())*splitratio),"\n"
        print "reader_train len: ", len(self.reader_train)
        print "test_sent len: ", len(self.test_sent)
Example #9
    def __init__(self, corpusroot, corpusname):
        # use a custom wordlist corpus via the WordListCorpusReader method
        #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
        # use a custom wordlist corpus via the PlaintextCorpusReader method
        #wordlist = PlaintextCorpusReader(corpus_root,'wordlist.txt')

        #nltk_old = [(3,0,1)]
        #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]

        reader = TaggedCorpusReader(corpusroot, corpusname)

        splitratio = 0.8
   
        self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents())*splitratio)]
        self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents())*splitratio):] 

        print "split test ratio: ", int(len(reader.tagged_sents())*splitratio),"\n"
        print "reader_train len: ", len(self.reader_train)
        print "test_sent len: ", len(self.test_sent)
Example #10
    def load_corpus_reviews(self,begin,end):
        #reader = LazyCorpusLoader()
        reader = TaggedCorpusReader('data/', r'.*\.pos')

        pos_fileids = reader.fileids()[1]
        neg_fileids = reader.fileids()[0]

        pos_sents = reader.tagged_sents(pos_fileids)
        neg_sents = reader.tagged_sents(neg_fileids)

        return (pos_sents[begin:end], neg_sents[begin:end])
Example #11
 def read(self, file_path):
     logger.info('Reading instances from file %s', file_path)
     reader = TaggedCorpusReader(*os.path.split(file_path),
                                 sep='\t',
                                 word_tokenizer=RegexpTokenizer(r'\n',
                                                                gaps=True),
                                 sent_tokenizer=BlanklineTokenizer(),
                                 para_block_reader=lambda s: [s.read()])
     return Dataset([
         self.text_to_instance(*tuple(zip(*tagged_sent)))
         for tagged_sent in reader.tagged_sents()
     ])
Example #12
def make_morpho_model(language,
                      model_type,
                      feature,
                      train_file,
                      test_file=None):
    test_file = train_file if test_file == None else test_file

    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()

    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)

    tagger = train_tagger(language, model_type, feature, train_sents)

    acc = tagger.evaluate(test_sents)
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)

    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())

    return (tagger, acc, kappa, cm)
Example #13
def read_sentences_corpus(reader = None):
	#reader = LazyCorpusLoader()
	#its overriding reader
	reader = TaggedCorpusReader('../data/', r'.*\.pos')
	'''
	create a corpus reader with the files in ../data/*.pos 
	this files contains sentences tagged, and are the bases of trainig, test sets. 
	'''

	pos_fileids = reader.fileids()[1]
	neg_fileids = reader.fileids()[0]

	pos_sents = reader.tagged_sents(pos_fileids)
	neg_sents = reader.tagged_sents(neg_fileids)

	#pos_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in pos_sents ]
	#neg_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in neg_sents ]

	return (pos_sents,neg_sents)
Example #14
class CorpusParser:
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File enconding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.

        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.

        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.

        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.

        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
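
A brief usage sketch for CorpusParser, assuming a directory of tagged .pos files like the ones used in the other examples; the path and file pattern are assumptions.

# Illustrative usage of CorpusParser (directory and pattern are placeholders).
parser = CorpusParser('corpora/tagged', fileids=r'.*\.pos')
print(parser.words()[:10])             # first ten tokens
print(parser.tagged_words()[:10])      # first ten (word, tag) pairs
print(parser.tagged_sentences()[0])    # first sentence with tags
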
Example #15
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Example #16
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Example #17
# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lowercase the words and return them as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

# Training set
training_data = tagged_data_list[:cutoff]
Example #18
 def setUp(self):
     reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
     os.system('mkdir -p taggers/oe/pos')
     self.sents = reader.tagged_sents()
Example #19
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(
        pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [
            item.rstrip() for item in ten_parts[counter] if len(item) > 0
        ]  # or: test_set = part

        if counter == 1:
            print(len(test_set[993]), len(test_set[994]), len(test_set[995]),
                  len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [
            x for x in ten_parts if x is not ten_parts[counter]
        ]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [
            item.rstrip() for sublist in training_set_lists for item in sublist
            if len(item) > 0
        ]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()

        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token
                                            for token, tag in test_sent]))
        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
Example #20
		storedModel = "/var/log/Terminology/pos_model_tnt.bin"
	else:
		storedModel = "/var/log/Terminology/pos_model_brill.bin"

	if os.path.isfile(storedModel):
		Service.logger.debug("Loading stored POS tagger model from %s" % storedModel)
		modelFile = open(storedModel, "rb")
		try:
			pos_tagger = cPickle.load(modelFile)
		except Exception, e:
			Service.logger.debug("Exception while loading pickled POS model!")
			Service.logger.debug(Service.traceback.format_exc())
		modelFile.close()
	else:
		autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
		train_sents =  autodesk.tagged_sents() + treebank.tagged_sents()
	
		# Use TnT tagger on request
		if useTnTTagger:
			if __debug_on__:
				Service.logger.debug("Using TnT POS tagger...")
			unk_tagger = DefaultTagger('NN')
	
			pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
			pos_tagger.train(train_sents)
		# Use Brill tagger by default
		else:
			if __debug_on__:
				Service.logger.debug("Using Brill POS tagger...")
	
			def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
Example #21
def trainPOSTagger(useTnTTagger):
	global __debug_on__
	global pos_tagger
	global adskCorpusRoot
	# Train TNT/Brill POS-tagger using own training data + treebank data from nltk. Tested that using treebank data improves results.

	autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
	train_sents =  autodesk.tagged_sents() + treebank.tagged_sents()

	# Use TnT tagger on request
	if useTnTTagger:
		if __debug_on__:
			Service.logger.debug("Using TnT POS tagger...")
		unk_tagger = DefaultTagger('NN')

		pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
		pos_tagger.train(train_sents)
	# Use Brill tagger by default
	else:
		if __debug_on__:
			Service.logger.debug("Using Brill POS tagger...")

		def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
			if not backoff:
				backoff = tagger_classes[0](tagged_sents)
				del tagger_classes[0]
 
			for cls in tagger_classes:
				tagger = cls(tagged_sents, backoff=backoff)
				backoff = tagger
 
			return backoff
	
		word_patterns = [
			(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
			(r'.*ould$', 'MD'),
			(r'.*ing$', 'VBG'),
			(r'.*ed$', 'VBD'),
			(r'.*ness$', 'NN'),
			(r'.*ment$', 'NN'),
			(r'.*ful$', 'JJ'),
			(r'.*ious$', 'JJ'),
			(r'.*ble$', 'JJ'),
			(r'.*ic$', 'JJ'),
			(r'.*ive$', 'JJ'),
			(r'.*ic$', 'JJ'),
			(r'.*est$', 'JJ'),
			(r'^a$', 'PREP'),
		]
		raubt_tagger = backoff_tagger(train_sents, [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger], backoff=nltk.tag.RegexpTagger(word_patterns))
 
		templates = [
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
			brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
			brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
			brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
		]
	 
		trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
		pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
Example #22
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__,
                                                 '.*\.txt',
                                                 sep='#')

    def separateSentence(self):
        grammer = r"""
        NP:
            {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|.) in this paragraph 
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)

    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(
                        list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non testing data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
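
A hedged usage sketch for the Classifier above, assuming training and dev directories of '#'-separated tagged .txt files; the paths and keyword list are assumptions, not the author's data.

# Illustrative driver (paths and keywords are placeholders).
clf = Classifier('data/train', ['经理', '销售', '客户'], 'data/dev')
clf.initClassifier()
clf.training()
print('dev-set accuracy:', clf.testClassifierAccuracy())
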
Example #23
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish', 'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 
import pprint 
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
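
A hedged follow-up sketch: the per-token feature dicts above can be paired with their gold tags to build a flat training set. The helper names untag and transform_to_dataset are illustrative, assuming the features() function and tagged_sentences from the snippet above.

def untag(tagged_sentence):
    """Drop the tags, keeping only the tokens."""
    return [w for w, _ in tagged_sentence]

def transform_to_dataset(tagged_sents):
    """Build parallel lists: one feature dict and one gold tag per token."""
    X, y = [], []
    for tagged in tagged_sents:
        words = untag(tagged)
        for index, (_, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)
    return X, y

X, y = transform_to_dataset(tagged_sentences)
print(len(X), len(y))  # same length: one sample per token
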
Example #24
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaulate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
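
A hedged call sketch: the function expects the path to a blank-line-separated .pos training file and a writable working directory, and returns a dict of mean/standard-deviation accuracies per tagger; the file names below are assumptions.

# Illustrative call (paths are assumptions, not the author's data layout).
results = cltk_pos_cv('greek_training_set.pos', '~/cltk_data/user_data')
for model_name, stats in results.items():
    print(model_name, 'mean:', stats['mean'], 'sd:', stats['sd'])
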
Example #25
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(reader.tagged_words())
print(reader.tagged_words(tagset='universal'))
Example #26
# # Brill Tagger #

# In[11]:

from nltk.wsd import lesk
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import tkinter
from nltk.tag import brill, brill_trainer
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.data import load
from nltk.corpus.reader import TaggedCorpusReader


train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
traindata= list(train_data.tagged_sents())
postag= load('taggers/maxent_treebank_pos_tagger/english.pickle')

templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),
    brill.Template(brill.Word([-2])),
Example #27
########## TAGGED CORPUS READER ###############

from nltk.corpus.reader import TaggedCorpusReader
root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file="brown.pos"
source=root+file

#Using Regex to match all files with extension .pos
reader=TaggedCorpusReader(root,r'.*\.pos')

print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()


#TaggedCorpus uses default tokenizer but we can change it by customizing it
from nltk.tokenize import SpaceTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',word_tokenizer=SpaceTokenizer())
print reader.words()

#Customizing TaggedCorpus's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',sent_tokenizer=LineTokenizer())
print reader.words()

#Customizing TaggedCorpus's paragraph Block reader
#Customizing TaggedCorpus's tag separator - Pg 57
Example #28
import sys
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer

filename = sys.argv[1]
without_extension = filename.split('.')
file_address = filename.split('/')
directory = file_address[:-1]
directory_address = '/'.join('{}'.format(x) for x in directory) + '/'
corpus_reader = TaggedCorpusReader(directory_address, [filename],
                                   sent_tokenizer=LineTokenizer(),
                                   sep='|')
corpus = corpus_reader.tagged_sents()
new_tags_only = open(
    without_extension[0] + '_tag_sets.' + without_extension[1], 'a+')
count = 1
for each in corpus:
    new_tags_only.write(' '.join('{}'.format(x[1]) for x in each))
    new_tags_only.write('\n')
    print(count)
    count += 1
print(without_extension[1] + "Tag extracting finished")
new_tags_only.close()
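
A hedged invocation sketch for the script above; the script name and data path are assumptions (an absolute path is shown, matching how the root and fileids are derived from sys.argv[1]).

# python extract_tags.py /home/user/data/sentences.pos
# -> appends to /home/user/data/sentences_tag_sets.pos, writing one
#    space-separated tag sequence per input sentence
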
Example #29
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot
        
    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep = '#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep = '#')
    
    def separateSentence(self):
        grammer = r"""
        NP:
            {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|.) in this paragraph 
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)
        
    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val
    
    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue      # skip the non training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree)) # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)
    
    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features
        
    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)
    
    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue      # skip the non testing data            
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
Example #30
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer
global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names',r'.*\.txt',word_tokenizer=PunktWordTokenizer(),sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # the sentences annotated with POS tags
tagger = UnigramTagger(name_tags)  # the UnigramTagger is trained on these tagged sentences


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'),
                                     (r'no', 'no_'), (r'not', 'not_'),
                                     (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
                                     (r'e[e]+', 'eqe'), (r'o[o]+', 'oqo'),
                                     (r'tt', 'tqt'), (r'ff', 'fqf'),
                                     (r'dd', 'dqd'), (r'mm', 'mqm'),
Example #31
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
Example #32
File: tagger.py Project: ixtel/wpfc
import nltk
from nltk.tag import RegexpTagger
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('corpus','tagged_corpus')
train = reader.tagged_sents()

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train,backoff=tagger0)
tagger2 = nltk.BigramTagger(train,backoff=tagger1)
patterns = [
    (r'^\d+((.|,)\d+)?\.?$', 'NC'),
    (r'^.*\$$','$'),
    (r'R\$\d+((.|,)\d+)?\.?$','NC$'),
    (r'^(R|r)eais$','$'),
    (r'^(D|d)(o|ó)lares','$')
]
tagger3 = RegexpTagger(patterns,backoff=tagger2)

def tag(sent):
    result = tagger3.tag(sent.split())

    return result
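
A brief usage sketch of the tag() helper above; the sentence is illustrative (Portuguese, matching the currency and number patterns in the regexp tagger).

# Illustrative call: the regexp patterns catch numbers and currency tokens,
# and everything else backs off to the bigram/unigram/default chain.
print(tag('O notebook custa 3500 reais'))
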
Example #33
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #34
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(reader.tagged_words())
print(reader.tagged_words(tagset='universal'))
Example #35

# Brill tagger parameters
max_rules=300
min_score=3

# Training parameters
development_size=5110
train=.85


# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|', sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()


# Lowercase the words and return them as a list
tagged_data_list  = [[t for t in sent] for sent in tagged_data] 
tagged_data_list = [[(w.lower(),t) for (w,t) in s] for s in tagged_data_list]

## print "Data is read! " 

# Randomize training and evaluation set
random.seed(len(tagged_data_list)) 
random.shuffle(tagged_data_list) 
cutoff = int(development_size*train)

# Training set
training_data = tagged_data_list[:cutoff] 
Example #36
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        
        if counter==1:
            print(len(test_set[993]),len(test_set[994]),len(test_set[995]),len(test_set[996]))
    
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists for item in sublist if len(item) > 0]
        
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos'%counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))
        
        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
        test_sents = test_reader.tagged_sents()
        
        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token,tag in test_sent]))
        test_text_path = os.path.join(local_dir, 'test_%d.txt'%counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))
        
        test_path = os.path.join(local_dir, 'test_%d.pos'%counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos'%counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
Example #37
from nltk.corpus.reader import TaggedCorpusReader
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.probability import FreqDist
from numpy import mean
# for kfold validation, not working though
# cross-fold validation is just brute forced...
#from sklearn.model_selection import KFold
#import numpy as np


mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project"

EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding = "latin-1")

sentences = EstonianCorpus.tagged_sents()

tags = [tag for _, tag in EstonianCorpus.tagged_words()]
mostFrequent = FreqDist(tags).max()

default = DefaultTagger(mostFrequent)

# cross validation

#kf = KFold(n_splits = 3)
#
## turns the data into a 2d array
#X = np.array(sentences)
## creates a 1d array with same length/number of rows as X
#y = np.arange(0, len(sentences), 1)
#
Example #38
def NER_HINDI():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
    test = hmm_tagger.test(test_sents)
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = hmm_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
    #INPUT FROM FILE
    with open('HINDIHMMNER1.dill', 'wb') as f:
        dill.dump(hmm_tagger, f)
    with open('HINDIHMMNER1.dill', 'rb') as f:
        hmm_tagger1 = dill.load(f)

    test_tags = [
        tag for sent in reader.sents() for (word, tag) in hmm_tagger1.tag(sent)
    ]
    gold_tags = [tag for (word, tag) in reader.tagged_words()]
    ltesttag = len(test_tags)
    lgtags = len(gold_tags)
    print "Test Tag Len:", ltesttag
    print "Gold Tag Len:", lgtags
    cm = nltk.ConfusionMatrix(gold_tags, test_tags)
    print(cm.pretty_format(sort_by_count=True, show_percents=False,
                           truncate=5))
    labels = set('NA GPE PERS DATE  ORG'.split()
                 )  #THE TAG SETS AS GENERATED IN CONFUSION MATRIX
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]
    print "TP:", sum(true_positives.values()), true_positives
    print "FN:", sum(false_negatives.values()), false_negatives
    print "FP:", sum(false_positives.values()), false_positives
    print

    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
        else:
            precision = true_positives[i] / float(true_positives[i] +
                                                  false_positives[i])
            recall = true_positives[i] / float(true_positives[i] +
                                               false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
            fscore1 = fscore * 100
            print "TAG:", i, "FMEASURE:", fscore1