def gen_featsets(self, train_sents, rare_word_cutoff):
        """
        Generates featuresets for each token in the training sentences.

        @type train_sents: C{list} of C{list} of C{tuple} of (C{str}, C{str})
        @param train_sents: A list of tagged sentences.

        @type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with fewer occurrences than
        C{rare_word_cutoff} are treated differently by L{extract_feats}
        than non-rare words (cf. Ratnaparkhi 1996).

        @rtype: C{list} of C{tuple} of (C{dict}, C{str})
        @return: a list of (featureset, POS tag) tuples, one per token.
        """
        featuresets = []
        for tagged_sent in train_sents:
            history = []
            untagged_sent = untag(tagged_sent)
            for (i, (_word, tag)) in enumerate(tagged_sent):
                featuresets.append( (self.extract_feats(untagged_sent, i,
                    history, rare_word_cutoff), tag) )
                history.append(tag)
        return featuresets
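For reference, untag (importable as nltk.untag or from nltk.tag.util) simply strips the tags from a tagged sentence. A minimal, self-contained sketch:

from nltk.tag.util import untag

# untag() drops the tag from each (word, tag) pair, leaving the plain token list.
tagged_sent = [("The", "DT"), ("cat", "NN"), ("sleeps", "VBZ")]
print(untag(tagged_sent))  # ['The', 'cat', 'sleeps']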
def both_tags():
    """
    returns a dictionary, obj.  obj is 
    the graphson file dictionary.  This adds a new field, "tagged_text" 
    Which is a triple, the word, the NVD-tag, and the POS-tag
    """
    File = path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []

    for j in range(len(obj["vertices"])):
        print(j)
        V = obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t = V["Metasploit-Description"].split(' ')
            ID = V["Metasploit-CVEid"]
            T, keep = basic_tagger(t, ID)
            if keep == 1:
                T = secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S = nltk.untag(T)
                # Tag S with the part of speech of each element
                S = nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i] = (T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"] = T
                if V["Metasploit-CVEid"] == '':
                    obj["vertices"][j]["tagged_text"] = ''
    print("Done")
    return obj
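The (word, NVD tag, POS tag) triple construction in the inner loop is just a zip of the two taggings. A small self-contained sketch with made-up NVD tags, assuming NLTK's default POS tagger model has been downloaded:

import nltk

# Hypothetical NVD-tagged sentence: (word, NVD tag) pairs.
T = [("buffer", "O"), ("overflow", "O"), ("CVE-2013-0001", "CVE")]
S = nltk.pos_tag([w for w, _ in T])                       # (word, POS tag) pairs
T = [(T[i][0], T[i][1], S[i][1]) for i in range(len(T))]  # (word, NVD tag, POS tag)
print(T)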
Example No. 4
 def test(self, testsents):
     num = 0 
     numsent = corsent = numword = corword = 0.0
     for row, sent in testsents.items():
         # print '#', row    
         untagged = untag(sent)
         history = self.viterbi(untagged)
         mistake = False
         numsent += 1
         for (i, (word, tag)) in enumerate(sent):
             # print word, ' tag: ', tag, ' tagged: ', history[i]
             numword += 1
             if tag == history[i]:
                 corword += 1
             else:
                 mistake = True
         if mistake == False:
             corsent += 1
                             
         num += 1
         if num > 20: break
     tokenacc =  (corword / numword) * 100
     tweetacc = (corsent / numsent) * 100
     print 'Token Acc : ', tokenacc
     print 'Sent Acc : ', tweetacc
     return tokenacc, tweetacc
Example No. 5
    def train2(self, trainset, iterations=10, a0=1, rare_word_cutoff=5, rare_feat_cutoff=5):
        self.gen_feats(trainset, rare_word_cutoff, rare_feat_cutoff)
        self.M = len(self.featurenum)
        self.W = np.random.rand(self.M)
        W = self.W
        A = np.copy(W)
        for i in xrange(iterations):
            rate = a0 / (1 + sqrt(i))
            prevnorm = la.norm(W)
            for (k, (row, sent)) in enumerate(trainset.items()):
                # print 'Iter %d Sample %d' % (i, k)
                untagged = untag(sent)
                gold = [tag for w, tag in sent]
                predict = self.viterbi(untagged)
                if predict != gold:
                    for j, x in enumerate(self.X[k]):
                        # promote gold
                        for featind in x['f'][gold[j]]:
                            W[featind] += rate
                        # demote predicted
                        for featind in x['f'][predict[j]]:
                            W[featind] -= rate
            curnorm = la.norm(W)
            A = add(A, W)
            # print 'Train: iter ', i, ' prevnorm: ', prevnorm, ' curnorm: ', curnorm, ' del: ', abs(curnorm-prevnorm)
        self.W = np.copy(A / (iterations * len(self.W)))
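The loop above is an averaged-perceptron-style trainer: on a mistake it promotes the features of the gold tag sequence, demotes those of the predicted one, and accumulates the weights for averaging. A stripped-down sketch of the core update on one token, with hypothetical feature indices:

import numpy as np

# Hypothetical weight vector and active feature indices for one token.
W = np.zeros(10)
gold_feats, predicted_feats = [1, 4], [2, 4]
rate = 0.5

for featind in gold_feats:       # promote gold features
    W[featind] += rate
for featind in predicted_feats:  # demote predicted features (shared index 4 cancels out)
    W[featind] -= rate
print(W)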
def both_tags():
    """
    returns a dictionary, obj.  obj is 
    the graphson file dictionary.  This adds a new field, "tagged_text" 
    Which is a triple, the word, the NVD-tag, and the POS-tag
    """
    File=path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []

    for j in xrange(len(obj["vertices"])):
        print j
        V=obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t=V["Metasploit-Description"].split(' ')
            ID=V["Metasploit-CVEid"]
            T,keep=basic_tagger(t,ID)
            if keep == 1:
                T=secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S=nltk.untag(T)
                # Tag S with the part of speech of each element
                S=nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i]=(T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"]=T
                if V["Metasploit-CVEid"] == '':
                    obj["vertices"][j]["tagged_text"]=''
    print "Done"
    return obj
Example No. 7
    def evaluate(self, gold):
            """
            Score the accuracy of the tagger against the gold standard.
            Strip the tags from the gold standard text, retag it using
            the tagger, then compute the accuracy score.

            :type gold: list(list(tuple(str, str)))
            :param gold: The list of tagged sentences to score the tagger on.
            :rtype: float
            """
            
            tagged_sents = self.tag_sents(untag(sent) for sent in gold)
            gold_tokens = sum(gold, [])
            test_tokens = sum(tagged_sents, [])

            correct_tags = dict()
            total_tags = dict()

            print test_tokens

            for i in range(0,len(test_tokens)):
                tag = test_tokens[i][1]
                if tag in total_tags:
                    total_tags[tag] += 1
                else: total_tags[tag] = 1

                if (gold_tokens[i][1] == test_tokens[i][1]):
                    tag = test_tokens[i][1]
                    if tag in correct_tags:
                        correct_tags[tag] += 1
                    else: correct_tags[tag] = 1

            for tag in correct_tags:
                print tag,':\t'+str(correct_tags[tag]/float(total_tags[tag]))
Example No. 8
def extract_relationship(extraction_file):
    characters, dialogues = order_phrases(extraction_file)
    print(characters)

    #with open("MultiNaiveBayesClassifier", "rb") as f:
    #	classifier = pickle.load(f)

    previous_speaker = None
    last_mentioned_entity = {"male": None, "female": None}
    relationship = []

    #result = open("result", "w")
    i = 1
    num_of_phrases = len(dialogues)
    for phrase in dialogues:
        speaker = phrase[0]
        entity = None
        attitude = None

        for sent in phrase[1]:
            word_tag = nltk.pos_tag(nltk.word_tokenize(sent))

            mother_tree = nltk.ne_chunk(word_tag)

            for tree in mother_tree:
                if hasattr(tree, 'label') and tree.label:
                    if tree.label() == 'PERSON':
                        entity_name = ' '.join([child[0] for child in tree])
                        #print(entity_name)

                        if entity_name.title() not in characters:
                            continue

                        last_mentioned_entity[characters[
                            entity_name.title()]] = entity_name.title()
                        entity = entity_name.title()

            for pair in word_tag:
                if pair[1] == "PRN":
                    if pair[0].lower() in ("he", "him"):
                        entity = last_mentioned_entity["male"]
                    elif pair[0].lower() in ("she", "her"):
                        entity = last_mentioned_entity["female"]

            #feat = sc.find_features(nltk.untag(word_tag))
            #attitude = classifier.classify(feat)
            color = sc.find_features(nltk.untag(word_tag))

            #result.write("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))
            #print("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))

        relationship.append((speaker, entity, color))

        print((i / num_of_phrases) * 100)

        i += 1

    return relationship
Example No. 10
def ConfusionMatrix(self, corpus_test):
    matrix = FreqDist()
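    # Note: izip, TaggerI.batch_tag and FreqDist.inc are NLTK 2 / Python 2 APIs; in NLTK 3 use tag_sents and matrix[key] += 1.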
    tagged_sents = self.batch_tag([nltk.untag(sent) for sent in corpus_test])
    testTokens = sum(corpus_test, [])  # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags of the tagger that in used
    for tagged, test in izip(taggerTokens, testTokens):
        if tagged != test:
            matrix.inc((tagged[1], test[1]))
    return matrix
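NLTK also ships a ready-made confusion matrix over two aligned tag lists; a hedged sketch of an equivalent report:

import nltk

gold = ["DT", "NN", "VB", "NN"]
predicted = ["DT", "NN", "NN", "NN"]
cm = nltk.ConfusionMatrix(gold, predicted)
print(cm.pretty_format(sort_by_count=True))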
Example No. 11
 def __init__(self, train_sents):  #[["","","",""],["","","",""]]
     train_sets = []
     for tagged_sent in train_sents:
         untagged_sent = nltk.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             train_sets.append((pos_feature_tag(untagged_sent, i,
                                                history), tag))
             history.append(tag)
     self.classifier = nltk.classify.NaiveBayesClassifier.train(train_sets)
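NLTK's stock ClassifierBasedPOSTagger follows the same word/history feature pattern; a minimal sketch, assuming the Brown corpus has been downloaded via nltk.download('brown'):

from nltk.corpus import brown
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Train the built-in classifier-based tagger on a small slice of Brown.
train_sents = brown.tagged_sents(categories="news")[:500]
tagger = ClassifierBasedPOSTagger(train=train_sents)
print(tagger.tag(["The", "cat", "sleeps"]))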
Example No. 12
 def gen_featsets(self, train_sents, rare_word_cutoff):
     featuresets = []
     for tagged_sent in train_sents:
         history = []
         untagged_sent = untag(tagged_sent)
         for (i, (_word, tag)) in enumerate(tagged_sent):
             featuresets.append( (self.extract_feats(untagged_sent, i,
                 history, rare_word_cutoff), tag) )
             history.append(tag)
     return featuresets
Example No. 13
    def gen_feats(self, trainsents, rare_word_cutoff, rare_feat_cutoff=5):
        features = defaultdict(int)
        self.X = []

        self.word_freqdist = self.gen_word_freqdist(trainsents)
        for row, sent in trainsents.items():
            history = None
            untagged = untag(sent)
            # nltktags = pos_tag(untagged)
            # print nltktags
            for (i, (word, tag)) in enumerate(sent):
                x = dict()
                feature = self.extract_feat(untagged, i, history,
                                            rare_word_cutoff)
                # feature['nltk_tag'] = nltktags[i]
                x['features'] = feature
                x['tag'] = tag
                x['target_feat'] = self.phi(feature, tag)

                self.X.append(x)
                history = tag

                for f in x['target_feat']:
                    features[f] += 1

        self.featuresets = OrderedDict(
            sorted(features.items(), key=lambda t: t[1], reverse=True))

        #cutoff rare features
        # self.featuresets = OrderedDict( (key, val) for (key, val) in self.featuresets.iteritems() if val > rare_feat_cutoff)
        self.featurenum = dict()

        # for f, count in self.featuresets.items():
        # print f, count

        for (i, ((f, val), tag)) in enumerate(self.featuresets.iterkeys()):

            self.featurenum[((f, val), tag)] = i
            self.tags[tag] += 1

        # print self.tags

        for x in self.X:
            x['f'] = dict()
            for tag in self.tags.iterkeys():
                x['f'][tag] = self.getactivef(self.phi(x['features'], tag))

            x['target_feat'] = x['f'][x['tag']]
            # print x['features'], x['target_feat']

        print '#Features', len(self.featuresets)
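The commented-out rare-feature cutoff corresponds to filtering the ordered feature counts; a minimal sketch of that filter, with made-up feature keys:

from collections import OrderedDict

features = OrderedDict([(("w=the", "DT"), 12), (("w=cat", "NN"), 7), (("suffix=-ly", "RB"), 2)])
rare_feat_cutoff = 5
kept = OrderedDict((k, v) for (k, v) in features.items() if v > rare_feat_cutoff)
print(kept)  # the feature seen only twice is dropped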
Example No. 14
    def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
        train_set = []

        for tagged_sent in train_sents:
            untagged_sent = untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                featureset = feature_extractor(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)

        classifier = classifier_cls.train(train_set, **kwargs)
        return cls(feature_extractor, classifier)
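A hedged end-to-end sketch of the same training recipe with NLTK's NaiveBayesClassifier and a deliberately tiny feature extractor (both stand-ins for the real ones):

import nltk
from nltk.tag.util import untag

def tiny_extractor(tokens, index, history):
    # Toy features: the current token and the previously assigned tag.
    return {"word": tokens[index], "prev_tag": history[-1] if history else "<START>"}

train_sents = [[("the", "DT"), ("dog", "NN")], [("a", "DT"), ("cat", "NN")]]
train_set = []
for tagged_sent in train_sents:
    untagged_sent, history = untag(tagged_sent), []
    for i, (_word, tag) in enumerate(tagged_sent):
        train_set.append((tiny_extractor(untagged_sent, i, history), tag))
        history.append(tag)

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(tiny_extractor(["the", "fox"], 1, ["DT"])))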
def both_tags():
    """
    returns a dictionary, obj, with a key for each year.  obj[year] is 
    the graphson file dictionary.  This adds a new field, "tagged_text", 
    which is a triple, the word, the NVD-tag, and the POS-tag
    """
    obj = {}
    files = os.listdir(path)
    #files = ['MSSecurityData90.graphson']

    for file_num in range(0, len(files)):
        obj_text = codecs.open(path + files[file_num], 'r',
                               encoding='utf-8').read()
        current_obj = json.loads(obj_text)
        obj[file_num] = current_obj

    for file_num in xrange(0, len(files)):
        print file_num
        print files[file_num]
        for j in xrange(len(obj[file_num]["vertices"])):
            obj[file_num]["vertices"][j]["tagged_text"] = []
            t = ""
            # Combine the descriptions in a given file
            for description in [
                    'MS-Description', 'MS-ExecutiveSummary',
                    'MS-ImpactDescription', 'MS-MitigationDescription',
                    'MS-TargetSetDescription', 'MS-Title',
                    'MS-WorkaroundDescription'
            ]:
                V = obj[file_num]["vertices"][j]
                if description in V.keys():
                    if description == 'MS-Title' or description == 'MS-ExecutiveSummary':
                        V[description] = [V[description]]
                    for i in range(0, len(V[description])):
                        t = t + ' ' + V[description][i]
            # Perform the tagging
            if V["_id"] != '':
                t = t.split(' ')
                ID = V["_id"]
                T, keep = basic_tagger(t, ID)
                if keep == 1:  # only keep it if it found a matching cpe vector
                    T = secondary_tagger(T)
                    S = nltk.untag(T)
                    S = nltk.pos_tag(S)
                    for i in range(len(T)):
                        T[i] = (T[i][0], T[i][1], S[i][1])
                    obj[file_num]["vertices"][j]["tagged_text"].extend(T)
        print "done with file ", file_num
    print "Done"
    return obj
Example No. 17
def MicroEvaluate(self, corpus_test):
    tagged_sents = self.batch_tag([nltk.untag(sent)
                                   for sent in corpus_test])  #tagger tagged
    testTokens = sum(corpus_test, [])  # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags of the tagger that in used
    tags = []  #all possible tags------------------TODO
    for x in testTokens:
        w, t = x
        if not tags.__contains__(t):
            tags.append(t)
    fmeasure = 0
    for tag in tags:
        fmeasure += calcFMeasur(tag, testTokens, taggerTokens)
    if len(tags) == 0:
        return 0
    return fmeasure / len(tags)
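calcFMeasur is defined elsewhere in that project; a hedged sketch of the per-tag F-measure it presumably computes over aligned (word, tag) token lists:

def f_measure_for_tag(tag, gold_tokens, predicted_tokens):
    # Count matches of this tag between the gold and predicted token streams.
    tp = sum(1 for g, p in zip(gold_tokens, predicted_tokens) if p[1] == tag and g[1] == tag)
    predicted = sum(1 for p in predicted_tokens if p[1] == tag)
    gold = sum(1 for g in gold_tokens if g[1] == tag)
    if predicted == 0 or gold == 0:
        return 0.0
    precision, recall = float(tp) / predicted, float(tp) / gold
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)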
Example No. 18
def getDifficultTags(tagger, testCorpus, x, tagsSet):
    difficultTags = []
    precs = []
    #defining which tags are we checking full or simplified tags if simplified -> getting the tagger and the testCorpus according to simplified tags set
    corpusTokens = sum(testCorpus, [])
    #calculating precision for each tag
    tagger_tags = tagger.batch_tag([nltk.untag(sent) for sent in testCorpus])
    taggedTokens = sum(tagger_tags, [])
    for t in tagsSet:
        p = calcPrec(t, corpusTokens, taggedTokens)
        precs.append((t, p))
    #insert x lowest tags to difficultTags
    precs = sorted(precs, key=itemgetter(1))
    for w, p in precs:
        if len(difficultTags) < x:
            difficultTags.append(w)
    return difficultTags
Example No. 21
    def evaluate(self, level=2):
        """
        Evaluates the trained POSTagger model on test data - computes accuracy
        and frequency distribution of wrong predictions

        Argument:
        ----------
            level (int):
                Type of tagger to be returned - '0' corresponds to
        default tagger, '1' corresponds to a unigram tagger, '2'
        corresponds to a bigram tagger and '3' corresponds to a
        trigram tagger, with each of the previous levels as backoffs

        Returns:
        --------
            fd (nltk.FreqDist):
                Frequency Distribution of wrong predictions
        """
        sentences, tagged_sentences = self.data_preparation()
        partition = int(len(tagged_sentences) * self.partition_ratio)
        train_set = tagged_sentences[:partition]
        test_set = tagged_sentences[partition:]
        print(len(train_set), len(test_set))
        tagger = self.tagger(train_set, level)
        accuracy = tagger.evaluate(test_set)

        print(f'Accuracy is {accuracy}')
        predictions = [(word, tag) for sentence in test_set
                       for (word, tag) in tagger.tag(nltk.untag(sentence))]
        wrong_predictions = [
            (word, tag, actual)
            for ((word, tag),
                 (_, actual)) in zip(predictions, [(w, t)
                                                   for sentence in test_set
                                                   for (w, t) in sentence])
            if tag != actual and tag is not None
        ]
        fd = nltk.FreqDist(wrong_predictions)
        print('Performing analysis...')
        print('Frequency Distribution of wrong predictions...')
        return fd
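The docstring's level scheme maps onto NLTK's standard backoff chain; a minimal sketch of what self.tagger(train_set, level) presumably builds:

import nltk

def build_tagger(train_set, level=2):
    # Each level backs off to the previous one, as the docstring describes.
    t0 = nltk.DefaultTagger("NN")
    if level == 0:
        return t0
    t1 = nltk.UnigramTagger(train_set, backoff=t0)
    if level == 1:
        return t1
    t2 = nltk.BigramTagger(train_set, backoff=t1)
    if level == 2:
        return t2
    return nltk.TrigramTagger(train_set, backoff=t2)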
Example No. 22
def both_tags():
	"""
	returns a dictionary, called "obj", with a key for each year.  obj[year] is 
	the graphson file dictionary.  This adds a new field, "tagged_text" 
	Which is a triple, the word, the NVD-tag, and the POS-tag
	"""
	File2010=path_nvd_2010	
	obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
	obj2010 = json.loads(obj_text)

	File2011=path_nvd_2011
	obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
	obj2011 = json.loads(obj_text)

	File2012=path_nvd_2012
	obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
	obj2012 = json.loads(obj_text)

	File2013=path_nvd_2013
	obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
	obj2013 = json.loads(obj_text)

	obj={2010:obj2010, 2011:obj2011, 2012:obj2012, 2013:obj2013}

	for year in xrange(2010,2014):
		print year
		for j in xrange(len(obj[year]["vertices"])):
			print j
			V=obj[year]["vertices"][j]
			t=V["description"].split(' ')
			ID=V["_id"]
			T=basic_tagger(t,ID)
			T=secondary_tagger(T)
			S=nltk.untag(T)
			S=nltk.pos_tag(S)
			for i in range(len(T)):
				T[i]=(T[i][0], T[i][1], S[i][1])
			obj[year]["vertices"][j]["tagged_text"]=T
		print "done with year ", year
	print "Done"
	return obj
Example No. 23
def both_tags():
    """
	returns a dictionary, called "obj", with a key for each year.  obj[year] is 
	the graphson file dictionary.  This adds a new field, "tagged_text" 
	Which is a triple, the word, the NVD-tag, and the POS-tag
	"""
    File2010 = path_nvd_2010
    obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
    obj2010 = json.loads(obj_text)

    File2011 = path_nvd_2011
    obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
    obj2011 = json.loads(obj_text)

    File2012 = path_nvd_2012
    obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
    obj2012 = json.loads(obj_text)

    File2013 = path_nvd_2013
    obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
    obj2013 = json.loads(obj_text)

    obj = {2010: obj2010, 2011: obj2011, 2012: obj2012, 2013: obj2013}

    for year in range(2010, 2014):
        print(year)
        for j in range(len(obj[year]["vertices"])):
            print(j)
            V = obj[year]["vertices"][j]
            t = V["description"].split(' ')
            ID = V["_id"]
            T = basic_tagger(t, ID)
            T = secondary_tagger(T)
            S = nltk.untag(T)
            S = nltk.pos_tag(S)
            for i in range(len(T)):
                T[i] = (T[i][0], T[i][1], S[i][1])
            obj[year]["vertices"][j]["tagged_text"] = T
        print("done with year ", year)
    print("Done")
    return obj
Example No. 24
def mark_entities(tagged_sentence, entity_words, label):
    """
    tagged_sentence: [('Word', 'Tag'), ...]
    entity_words: ['This', 'is', 'an', 'entity']
    label: the entity type

    return an nltk.Tree instance with the entities wrapped in chunks
    """

    iob_tagged = [(w, t, 'O') for w, t in tagged_sentence]

    words = nltk.untag(tagged_sentence)
    start_index = sub_list(words, entity_words)
    if start_index is not None:
        iob_tagged[start_index] = (iob_tagged[start_index][0],
                                   iob_tagged[start_index][1], 'B-' + label)
        for idx in range(1, len(entity_words)):
            iob_tagged[start_index + idx] = (iob_tagged[start_index + idx][0],
                                             iob_tagged[start_index + idx][1],
                                             'I-' + label)

    return nltk.conlltags2tree(iob_tagged)
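As a reference for the IOB plumbing used above, a tiny sketch of conlltags2tree on hand-built (word, POS, IOB) triples:

import nltk

iob_tagged = [("World", "NNP", "B-ORG"), ("Bank", "NNP", "I-ORG"), ("lends", "VBZ", "O")]
print(nltk.conlltags2tree(iob_tagged))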
Example No. 25
def replaceTextnumberWithNumber(text):
    tagged_number_words = 'ten/CD thousand/CD nine/CD hundred/CD ninety/CD eight/CD seven/CD six/CD five/CD four/CD three/CD two/CD one/CD eighty/CD seventy/CD sixty/CD fifty/CD forty/CD thirty/CD twenty/CD nineteen/CD eighteen/CD seventeen/CD sixteen/CD fifteen/CD fourteen/CD thirteen/CD twelve/CD eleven/CD zero/CD'
    tagged_number_words_tuples = [
        nltk.tag.str2tuple(t) for t in tagged_number_words.split()
    ]
    my_tagger = nltk.UnigramTagger([tagged_number_words_tuples],
                                   backoff=nltk.DefaultTagger('IGNORE'))

    my_grammar = 'NumberWord: {<CD>+}'
    parser = nltk.RegexpParser(my_grammar)
    parsed = parser.parse(my_tagger.tag(nltk.word_tokenize(text.lower())))

    for tag in [
            tree.leaves() for tree in parsed.subtrees()
            if tree.label() == 'NumberWord'
    ]:
        ut = nltk.untag(tag)
        num = w2n.word_to_num(' '.join(ut))

        r = re.compile(re.escape(' '.join(ut)), re.IGNORECASE)
        text = r.sub(str(num), text)

    return text
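A hedged usage sketch, assuming the word2number package is imported as w2n and NLTK's punkt tokenizer is available:

text = "She waited twenty one minutes"
print(replaceTextnumberWithNumber(text))  # expected to print something like "She waited 21 minutes"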
Example No. 26
    def test(self, testsents, clftype='argmax'):
        num = 0
        numsent = corsent = numword = corword = 0.0

        for row, sent in testsents.items():
            # print '#', row
            untagged = untag(sent)

            if clftype == 'naive':
                history = self.naive_tagsent(untagged)
            else:
                history = self.argmax(untagged)

            mistake = False

            numsent += 1
            for (i, (word, tag)) in enumerate(sent):
                # print word, ' tag: ', tag, ' tagged: ', history[i]
                numword += 1
                if tag == history[i]:
                    corword += 1
                else:
                    mistake = True

            if mistake == False:
                corsent += 1

            # num += 1
            # if num > 20: break

        tokenacc = (corword / numword) * 100
        tweetacc = (corsent / numsent) * 100
        print 'Token Acc : ', tokenacc
        print 'Tweet Acc : ', tweetacc

        return tokenacc, tweetacc
Example No. 28

def noneCount(simpleTaggedSents):
    # calculate number of unknown words
    noneCount = 0
    # flatten test list
    flattenedSents = [
        item for sublist in simpleTaggedSents for item in sublist
    ]
    noneCount = sum(1 for (word, tag) in flattenedSents if tag == "None")
    return noneCount


print(
    "Unknown words: %d" %
    noneCount(simpleUnigramTagger.tag_sents(nltk.untag(sent)
                                            for sent in test)))

# 3.1.4. Report the rate of unknown words per category.

# In[19]:

print("Unknown words (by category):")
for c in brown.categories():
    brown_sents = brown.tagged_sents(categories=c, tagset='universal')
    train_ = brown_sents[100:]
    test_ = brown_sents[:100]
    simpTag = SimpleUnigramTagger(train_)
    simpleTaggedSents = simpTag.tag_sents(nltk.untag(sent) for sent in test_)
    print("%s: %d" % (c, noneCount(simpleTaggedSents)))
Example No. 29
 def __fix_tag_scheme(self, sentence):
     untagged = nltk.untag(sentence)
     new_tags = nltk.pos_tag(untagged)
     return new_tags
Example No. 31
def checkTaggerRecallForTag(tagger, tag, testCorpus):
    tagged_sents = tagger.batch_tag([nltk.untag(sent)
                                     for sent in testCorpus])  #tagger tagged
    testTokens = sum(testCorpus, [])  # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags of the tagger that in used
    return calcRecall(tag, testTokens, taggerTokens)
Example No. 32
corpus = SBCorpusReader(corpuspath)
tagged_sents = corpus.tagged_sents()
print(corpus.readme())
print("No. sentences:", len(tagged_sents))
print()

print("* Separating training and testing data (NLTK book, sec 5.2)")
breakpoint = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:breakpoint]
test_sents = tagged_sents[breakpoint:]
print("No. train sentences:", len(train_sents))
print("No. test  sentences:", len(test_sents))
print()

seen_example = nltk.untag(tagged_sents[10])
unseen_example = nltk.untag(tagged_sents[-10])


def show_example(tagged_sent):
    return " ".join(map(nltk.tuple2str, tagged_sent))


start_time = time.process_time()
print("* Default tagger (NLTK book, sec 4.1)")
tags = [tag for sent in train_sents for (word, tag) in sent]
most_common_tag = nltk.FreqDist(tags).max()
print("Most common tag:", most_common_tag)
default_tagger = nltk.DefaultTagger(most_common_tag)
print("Seen:", show_example(default_tagger.tag(seen_example)))
print("Unseen:", show_example(default_tagger.tag(unseen_example)))
Example No. 33
def extract_candidate_answers(passage, answer_type, question, stop_words):
    '''
    Attempts to return a list of possible answers from the passage
    
    Answer types we need to support: PERSON, LOCATION, ORGANIZATION, NUMBER, DATE, FACT
    Answer types we currently support: PERSON, LOCATION, ORGANIZATION, NUMBER (very poorly), DATE (slightly less poorly)
    
    
    First attempt: For our first attempt we just use nltk.ne_chunk to
    crudely tag named entities. We then return all the entities equal to the answer type
    (which is why we only support PERSON, LOCATION, ORGANIZATION).

    Second attempt: very similar to the first attempt, but we include Number and Date. Numbers are found by
    using nltk to tag words, since it has a tag for numbers. However, we first find relevant snippets of the
    passage. We do this as:
    foreach word in question that is not a stop_word:
        foreach occurrence of word in passage
            add [5 previous words, word, 5 next words] to snippet.
    '''
    answer_type = answer_type.upper()
    candidate_answers = []

    #Get tokens from question and remove those that are in the stop words
    question_tokens = nltk.word_tokenize(question)

    important_terms = [
        token for token in question_tokens if token.lower() not in stop_words
    ]
    snippets = []

    for term in important_terms:
        snippet_regex = r'((?:\S+\s*){,5})(%s)((?:\s*\S+\s*){,5})' % term  #Ganked this from some fool on Stack Overflow

        for snippet_match in re.finditer(snippet_regex, passage,
                                         re.IGNORECASE):
            (before, term, after) = snippet_match.group(1, 2, 3)
            snippets.append(before + term + after)
    ''' Problem: there could be duplicated snippets. For example if the query was 'Who was the oldest president'
    oldest and president would both be important_terms, so a phrase such as 'John Doe was the oldest president when he took office in 1994'
    would trigger snippets 'John Doe was the oldest president when he took office' and 'John Doe was the oldest president when he took office in'
    '''
    passage = " ".join(snippets)
    tokenized_passage = nltk.word_tokenize(passage)
    tagged_passage = nltk.pos_tag(tokenized_passage)

    if answer_type == "DATE":
        ''' Dates are pretty hard to extract because they occur in so many different manners. Common date forms:
            blah (1990 - 2000). 
            2010 (Four digit number)
            April 1st (Month followed by a day)
            1st of April
        '''

        month_map = {
            'Jan': 'January',
            'January': 'January',
            'Feb': 'February',
            'Febr': 'February',
            'February': 'February',
            'March': 'March',
            'Marc': 'March',
            'Mar': 'March',
            'April': 'April',
            'Apr': 'April',
            'May': 'May',
            'June': 'June',
            'Jun': 'June',
            'July': 'July',
            'Jul': 'July',
            'August': 'August',
            'Aug': 'August',
            'Sep': 'September',
            'Sept': 'September',
            'September': 'September',
            'October': 'October',
            'Oct': 'October',
            'November': 'November',
            'Nov': 'November',
            'Dec': 'December',
            'December': 'December'
        }

        #Attempts to match a month or month abbreviation followed by a day number
        #followed by a year
        month_date_year_regex = r'(Jan|January|Feb|Febr|February|March|Marc|Mar|' + \
                           r'Apr|April|May|June|Jun|Jul|July|Aug|August|' + \
                           r'Sept|Sep|September|October|Oct|Nov|November|Dec|December)' + \
                           r'[,.]?\s*(\d{1,2})(st|nd|rd|th)?,?\s*(\d{4})?'

        year_regex = r'\D(\d{4})\D'  #Match non-digit, then 4 digits, then a non-digit

        for month_date_year_match in re.finditer(month_date_year_regex,
                                                 passage, re.IGNORECASE):
            (month, day, _, year) = month_date_year_match.group(1, 2, 3, 4)
            month = month_map[month]
            if day[0] == '0':
                day = day[1:]
            if day == '':
                continue

            date_list = [x for x in [month, day, year] if x is not None]

            candidate_answers.append(" ".join(date_list))

        for year in re.finditer(year_regex, passage):
            candidate_answers.append(year.group(1))  #Year is the group 1

    elif answer_type == 'FACT':
        pass
    elif answer_type == "NUMBER":
        grammar = r"""
        NUM : {<CD><NN.?>}
        {<CD>}
        """
        cp = nltk.RegexpParser(grammar)
        parse_tree = cp.parse(tagged_passage)
        for node in parse_tree:
            if hasattr(node, 'node'):  #Must be a number
                candidate_answers.append(" ".join(nltk.untag(node[:])))

    else:
        ne_passage = nltk.ne_chunk(tagged_passage)

        # Right now we change an answer type of LOCATION to GPE
        # since nltk is f*****g weird
        if answer_type == 'LOCATION': answer_type = 'GPE'

        for node in ne_passage:
            if hasattr(node, 'node'):  #Is a named entity
                if node.node == answer_type:
                    candidate_answers.append(" ".join(nltk.untag(node[:])))

    return candidate_answers
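A hedged usage sketch, assuming NLTK's stopwords corpus (plus the tokenizer, tagger and NE-chunker models) has been downloaded:

from nltk.corpus import stopwords

passage = "Barack Obama visited Paris on April 1st, 2009 and met 3 ministers."
question = "What year did Obama visit Paris"
stop_words = set(stopwords.words("english"))
print(extract_candidate_answers(passage, "DATE", question, stop_words))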