def gen_featsets(self, train_sents, rare_word_cutoff):
    """
    Generates featuresets for each token in the training sentences.

    @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
    @param train_sents: A list of tagged sentences.

    @type rare_word_cutoff: C{int}
    @param rare_word_cutoff: Words with fewer occurrences than
    C{rare_word_cutoff} will be treated differently by L{extract_feats}
    than non-rare words (cf. Ratnaparkhi 1996).

    @rtype: C{list} of C{tuple} of (C{dict}, C{str})
    @return: A list of tuples, each containing the featureset of a token
    and its POS tag.
    """
    featuresets = []
    for tagged_sent in train_sents:
        history = []
        untagged_sent = untag(tagged_sent)
        for (i, (_word, tag)) in enumerate(tagged_sent):
            featuresets.append(
                (self.extract_feats(untagged_sent, i, history, rare_word_cutoff), tag))
            history.append(tag)
    return featuresets
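# A minimal, self-contained sketch of how featuresets built this way are typically
# consumed (e.g., to train an NLTK classifier-based tagger). The tiny corpus and the
# toy_feats feature extractor below are invented stand-ins for the real extract_feats;
# only the looping pattern matches the method above.
import nltk
from nltk.tag import untag

toy_train_sents = [[("the", "DT"), ("dog", "NN"), ("barks", "VBZ")],
                   [("a", "DT"), ("cat", "NN"), ("sleeps", "VBZ")]]

def toy_feats(untagged_sent, i, history):
    # Current word plus the previous tag (or a start marker).
    return {"word": untagged_sent[i],
            "prevtag": history[i - 1] if i > 0 else "<START>"}

featuresets = []
for tagged_sent in toy_train_sents:
    history = []
    untagged_sent = untag(tagged_sent)
    for i, (_word, tag) in enumerate(tagged_sent):
        featuresets.append((toy_feats(untagged_sent, i, history), tag))
        history.append(tag)

classifier = nltk.NaiveBayesClassifier.train(featuresets)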
def both_tags():
    """
    Returns a dictionary, obj, where obj is the graphson file dictionary.
    This adds a new field, "tagged_text", which is a list of triples:
    the word, the NVD tag, and the POS tag.
    """
    File = path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []
    for j in range(len(obj["vertices"])):
        print(j)
        V = obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t = V["Metasploit-Description"].split(' ')
            ID = V["Metasploit-CVEid"]
            T, keep = basic_tagger(t, ID)
            if keep == 1:
                T = secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S = nltk.untag(T)
                # Tag S with the part of speech of each element
                S = nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i] = (T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"] = T
        if V["Metasploit-CVEid"] == '':
            obj["vertices"][j]["tagged_text"] = ''
    print("Done")
    return obj
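# A minimal sketch of the (word, NVD-tag, POS-tag) triple construction used above.
# The NVD-style tags here are invented for illustration; in the real pipeline they
# come from basic_tagger/secondary_tagger, which are not shown in this snippet.
import nltk

T = [("buffer", "O"), ("overflow", "O"), ("in", "O"),
     ("Acrobat", "sw.product"), ("Reader", "sw.product")]
S = nltk.pos_tag(nltk.untag(T))   # POS-tag the bare words
T = [(word, nvd, pos) for (word, nvd), (_w, pos) in zip(T, S)]
# Each word now carries both its NVD tag and its POS tag.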
def test(self, testsents):
    num = 0
    numsent = corsent = numword = corword = 0.0
    for row, sent in testsents.items():
        # print '#', row
        untagged = untag(sent)
        history = self.viterbi(untagged)
        mistake = False
        numsent += 1
        for (i, (word, tag)) in enumerate(sent):
            # print word, ' tag: ', tag, ' tagged: ', history[i]
            numword += 1
            if tag == history[i]:
                corword += 1
            else:
                mistake = True
        if not mistake:
            corsent += 1
        num += 1
        if num > 20:
            break
    tokenacc = (corword / numword) * 100
    tweetacc = (corsent / numsent) * 100
    print 'Token Acc : ', tokenacc
    print 'Sent Acc : ', tweetacc
    return tokenacc, tweetacc
def train2(self, trainset, iterations=10, a0=1, rare_word_cutoff=5,
           rare_feat_cutoff=5):
    self.gen_feats(trainset, rare_word_cutoff, rare_feat_cutoff)
    self.M = len(self.featurenum)
    self.W = np.random.rand(self.M)
    W = self.W
    A = np.copy(W)
    for i in xrange(iterations):
        rate = a0 / (1 + sqrt(i))
        prevnorm = la.norm(W)
        for (k, (row, sent)) in enumerate(trainset.items()):
            # print 'Iter %d Sample %d' % (i, k)
            untagged = untag(sent)
            gold = [tag for w, tag in sent]
            predict = self.viterbi(untagged)
            if predict != gold:
                for j, x in enumerate(self.X[k]):
                    # promote gold
                    for featind in x['f'][gold[j]]:
                        W[featind] += rate
                    # demote predicted
                    for featind in x['f'][predict[j]]:
                        W[featind] -= rate
        curnorm = la.norm(W)
        A = add(A, W)
        # print 'Train: iter ', i, ' prevnorm: ', prevnorm, ' curnorm: ', curnorm, ' del: ', abs(curnorm - prevnorm)
    self.W = np.copy(A / (iterations * len(self.W)))
def both_tags():
    """
    Returns a dictionary, obj, where obj is the graphson file dictionary.
    This adds a new field, "tagged_text", which is a list of triples:
    the word, the NVD tag, and the POS tag.
    """
    File = path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []
    for j in xrange(len(obj["vertices"])):
        print j
        V = obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t = V["Metasploit-Description"].split(' ')
            ID = V["Metasploit-CVEid"]
            T, keep = basic_tagger(t, ID)
            if keep == 1:
                T = secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S = nltk.untag(T)
                # Tag S with the part of speech of each element
                S = nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i] = (T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"] = T
        if V["Metasploit-CVEid"] == '':
            obj["vertices"][j]["tagged_text"] = ''
    print "Done"
    return obj
def evaluate(self, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    :type gold: list(list(tuple(str, str)))
    :param gold: The list of tagged sentences to score the tagger on.
    :rtype: float
    """
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    correct_tags = dict()
    total_tags = dict()
    print test_tokens
    for i in range(0, len(test_tokens)):
        tag = test_tokens[i][1]
        if tag in total_tags:
            total_tags[tag] += 1
        else:
            total_tags[tag] = 1
        if gold_tokens[i][1] == test_tokens[i][1]:
            tag = test_tokens[i][1]
            if tag in correct_tags:
                correct_tags[tag] += 1
            else:
                correct_tags[tag] = 1
    for tag in correct_tags:
        print tag, ':\t' + str(correct_tags[tag] / float(total_tags[tag]))
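# A rough, self-contained sketch of the same per-tag accuracy computation, using an
# off-the-shelf UnigramTagger and a small Brown-corpus sample in place of `self` and
# `gold` (both are assumptions; the Brown corpus must be available via nltk data).
import nltk
from nltk.corpus import brown
from nltk.tag import untag

sents = brown.tagged_sents(categories="news")
tagger = nltk.UnigramTagger(sents[100:600])
gold = sents[:100]

gold_tokens = [tok for sent in gold for tok in sent]
test_tokens = [tok for sent in tagger.tag_sents(untag(s) for s in gold) for tok in sent]

total_tags, correct_tags = nltk.FreqDist(), nltk.FreqDist()
for gold_tok, test_tok in zip(gold_tokens, test_tokens):
    total_tags[test_tok[1]] += 1
    if gold_tok[1] == test_tok[1]:
        correct_tags[test_tok[1]] += 1

for tag in correct_tags:
    print(tag, correct_tags[tag] / total_tags[tag])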
def extract_relationship(extraction_file):
    characters, dialogues = order_phrases(extraction_file)
    print(characters)
    #with open("MultiNaiveBayesClassifier", "rb") as f:
    #    classifier = pickle.load(f)
    previous_speaker = None
    last_mentioned_entity = {"male": None, "female": None}
    relationship = []
    #result = open("result", "w")
    i = 1
    num_of_phrases = len(dialogues)
    for phrase in dialogues:
        speaker = phrase[0]
        entity = None
        attitude = None
        for sent in phrase[1]:
            word_tag = nltk.pos_tag(nltk.word_tokenize(sent))
            mother_tree = nltk.ne_chunk(word_tag)
            for tree in mother_tree:
                if hasattr(tree, 'label') and tree.label:
                    if tree.label() == 'PERSON':
                        entity_name = ' '.join([child[0] for child in tree])
                        #print(entity_name)
                        if entity_name.title() not in characters:
                            continue
                        last_mentioned_entity[characters[entity_name.title()]] = entity_name.title()
                        entity = entity_name.title()
            for pair in word_tag:
                if pair[1] == "PRP":  # personal pronoun
                    if pair[0].lower() in ("he", "him"):
                        entity = last_mentioned_entity["male"]
                    elif pair[0].lower() in ("she", "her"):
                        entity = last_mentioned_entity["female"]
            #feat = sc.find_features(nltk.untag(word_tag))
            #attitude = classifier.classify(feat)
            color = sc.find_features(nltk.untag(word_tag))
            #result.write("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))
            #print("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))
            relationship.append((speaker, entity, color))
        print((i / num_of_phrases) * 100)
        i += 1
    return relationship
def ConfusionMatrix(self, corpus_test):
    matrix = FreqDist()
    tagged_sents = self.batch_tag([nltk.untag(sent) for sent in corpus_test])
    testTokens = sum(corpus_test, [])      # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])   # tags assigned by the tagger in use
    for tagged, test in izip(taggerTokens, testTokens):
        if tagged != test:
            matrix.inc((tagged[1], test[1]))
    return matrix
def __init__(self, train_sents):
    # train_sents: [["","","",""], ["","","",""]]
    train_sets = []
    for tagged_sent in train_sents:
        untagged_sent = nltk.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            train_sets.append((pos_feature_tag(untagged_sent, i, history), tag))
            history.append(tag)
    self.classifier = nltk.classify.NaiveBayesClassifier.train(train_sets)
def gen_featsets(self, train_sents, rare_word_cutoff):
    featuresets = []
    for tagged_sent in train_sents:
        history = []
        untagged_sent = untag(tagged_sent)
        for (i, (_word, tag)) in enumerate(tagged_sent):
            featuresets.append(
                (self.extract_feats(untagged_sent, i, history, rare_word_cutoff), tag))
            history.append(tag)
    return featuresets
def gen_feats(self, trainsents, rare_word_cutoff, rare_feat_cutoff=5):
    features = defaultdict(int)
    self.X = []
    self.word_freqdist = self.gen_word_freqdist(trainsents)
    for row, sent in trainsents.items():
        history = None
        untagged = untag(sent)
        # nltktags = pos_tag(untagged)
        # print nltktags
        for (i, (word, tag)) in enumerate(sent):
            x = dict()
            feature = self.extract_feat(untagged, i, history, rare_word_cutoff)
            # feature['nltk_tag'] = nltktags[i]
            x['features'] = feature
            x['tag'] = tag
            x['target_feat'] = self.phi(feature, tag)
            self.X.append(x)
            history = tag
            for f in x['target_feat']:
                features[f] += 1
    self.featuresets = OrderedDict(
        sorted(features.items(), key=lambda t: t[1], reverse=True))
    # cutoff rare features
    # self.featuresets = OrderedDict((key, val) for (key, val) in self.featuresets.iteritems() if val > rare_feat_cutoff)
    self.featurenum = dict()
    # for f, count in self.featuresets.items():
    #     print f, count
    for (i, ((f, val), tag)) in enumerate(self.featuresets.iterkeys()):
        self.featurenum[((f, val), tag)] = i
        self.tags[tag] += 1
    # print self.tags
    for x in self.X:
        x['f'] = dict()
        for tag in self.tags.iterkeys():
            x['f'][tag] = self.getactivef(self.phi(x['features'], tag))
        x['target_feat'] = x['f'][x['tag']]
        # print x['features'], x['target_feat']
    print '#Features', len(self.featuresets)
def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
    train_set = []
    for tagged_sent in train_sents:
        untagged_sent = untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = feature_extractor(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    classifier = classifier_cls.train(train_set, **kwargs)
    return cls(feature_extractor, classifier)
def both_tags():
    """
    returns a dictionary, obj, with a key for each year. obj[year] is the
    graphson file dictionary. This adds a new field, "tagged_text", which is
    a triple: the word, the NVD-tag, and the POS-tag
    """
    obj = {}
    files = os.listdir(path)
    #files = ['MSSecurityData90.graphson']
    for file_num in range(0, len(files)):
        obj_text = codecs.open(path + files[file_num], 'r', encoding='utf-8').read()
        current_obj = json.loads(obj_text)
        obj[file_num] = current_obj
    for file_num in xrange(0, len(files)):
        print file_num
        print files[file_num]
        for j in xrange(len(obj[file_num]["vertices"])):
            obj[file_num]["vertices"][j]["tagged_text"] = []
            t = ""
            # Combine the descriptions in a given file
            for description in ['MS-Description', 'MS-ExecutiveSummary',
                                'MS-ImpactDescription', 'MS-MitigationDescription',
                                'MS-TargetSetDescription', 'MS-Title',
                                'MS-WorkaroundDescription']:
                V = obj[file_num]["vertices"][j]
                if description in V.keys():
                    if description == 'MS-Title' or description == 'MS-ExecutiveSummary':
                        V[description] = [V[description]]
                    for i in range(0, len(V[description])):
                        t = t + ' ' + V[description][i]
            # Perform the tagging
            if V["_id"] != '':
                t = t.split(' ')
                ID = V["_id"]
                T, keep = basic_tagger(t, ID)
                if keep == 1:  # only keep it if it found a matching cpe vector
                    T = secondary_tagger(T)
                    S = nltk.untag(T)
                    S = nltk.pos_tag(S)
                    for i in range(len(T)):
                        T[i] = (T[i][0], T[i][1], S[i][1])
                    obj[file_num]["vertices"][j]["tagged_text"].extend(T)
        print "done with file ", file_num
    print "Done"
    return obj
def MicroEvaluate(self, corpus_test):
    tagged_sents = self.batch_tag([nltk.untag(sent) for sent in corpus_test])  # sentences re-tagged by the tagger
    testTokens = sum(corpus_test, [])     # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags assigned by the tagger in use
    tags = []  # all possible tags ------------------ TODO
    for x in testTokens:
        w, t = x
        if t not in tags:
            tags.append(t)
    fmeasure = 0
    for tag in tags:
        fmeasure += calcFMeasur(tag, testTokens, taggerTokens)
    if len(tags) == 0:
        return 0
    return fmeasure / len(tags)
def getDifficultTags(tagger, testCorpus, x, tagsSet):
    difficultTags = []
    precs = []
    # Defines which tags are being checked: full or simplified tags. If simplified,
    # the tagger and testCorpus are assumed to use the simplified tag set.
    corpusTokens = sum(testCorpus, [])
    # calculate precision for each tag
    tagger_tags = tagger.batch_tag([nltk.untag(sent) for sent in testCorpus])
    taggedTokens = sum(tagger_tags, [])
    for t in tagsSet:
        p = calcPrec(t, corpusTokens, taggedTokens)
        precs.append((t, p))
    # insert the x lowest-precision tags into difficultTags
    precs = sorted(precs, key=itemgetter(1))
    for w, p in precs:
        if len(difficultTags) < x:
            difficultTags.append(w)
    return difficultTags
def evaluate(self, level=2):
    """
    Evaluates the trained POSTagger model on test data - computes accuracy
    and frequency distribution of wrong predictions

    Argument:
    ----------
        level (int): Type of tagger to be returned - '0' corresponds to default
            tagger, '1' corresponds to a unigram tagger, '2' corresponds to a
            bigram tagger and '3' corresponds to a trigram tagger, with each of
            the previous levels as backoffs

    Returns:
    --------
        fd (nltk.FreqDist): Frequency Distribution of wrong predictions
    """
    sentences, tagged_sentences = self.data_preparation()
    partition = int(len(tagged_sentences) * self.partition_ratio)
    train_set = tagged_sentences[:partition]
    test_set = tagged_sentences[partition:]
    print(len(train_set), len(test_set))

    tagger = self.tagger(train_set, level)
    accuracy = tagger.evaluate(test_set)
    print(f'Accuracy is {accuracy}')

    predictions = [(word, tag) for sentence in test_set
                   for (word, tag) in tagger.tag(nltk.untag(sentence))]
    wrong_predictions = [
        (word, tag, actual)
        for ((word, tag), (_, actual)) in zip(
            predictions,
            [(w, t) for sentence in test_set for (w, t) in sentence])
        if tag != actual and tag is not None
    ]
    fd = nltk.FreqDist(wrong_predictions)
    print('Performing analysis...')
    print('Frequency Distribution of wrong predictions...')
    return fd
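# A plausible sketch of what self.tagger(train_set, level) might construct, based
# only on the docstring above; the real implementation is not shown here, and the
# 'NN' default tag is an assumed placeholder.
import nltk

def build_backoff_tagger(train_set, level=2):
    t0 = nltk.DefaultTagger('NN')                    # level 0: default tagger
    t1 = nltk.UnigramTagger(train_set, backoff=t0)   # level 1: unigram with backoff
    t2 = nltk.BigramTagger(train_set, backoff=t1)    # level 2: bigram with backoff
    t3 = nltk.TrigramTagger(train_set, backoff=t2)   # level 3: trigram with backoff
    return [t0, t1, t2, t3][level]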
def both_tags():
    """
    Returns a dictionary, called "obj", with a key for each year. obj[year]
    is the graphson file dictionary. This adds a new field, "tagged_text",
    which is a triple: the word, the NVD-tag, and the POS-tag.
    """
    File2010 = path_nvd_2010
    obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
    obj2010 = json.loads(obj_text)
    File2011 = path_nvd_2011
    obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
    obj2011 = json.loads(obj_text)
    File2012 = path_nvd_2012
    obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
    obj2012 = json.loads(obj_text)
    File2013 = path_nvd_2013
    obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
    obj2013 = json.loads(obj_text)
    obj = {2010: obj2010, 2011: obj2011, 2012: obj2012, 2013: obj2013}
    for year in xrange(2010, 2014):
        print year
        for j in xrange(len(obj[year]["vertices"])):
            print j
            V = obj[year]["vertices"][j]
            t = V["description"].split(' ')
            ID = V["_id"]
            T = basic_tagger(t, ID)
            T = secondary_tagger(T)
            S = nltk.untag(T)
            S = nltk.pos_tag(S)
            for i in range(len(T)):
                T[i] = (T[i][0], T[i][1], S[i][1])
            obj[year]["vertices"][j]["tagged_text"] = T
        print "done with year ", year
    print "Done"
    return obj
def both_tags():
    """
    Returns a dictionary, called "obj", with a key for each year. obj[year]
    is the graphson file dictionary. This adds a new field, "tagged_text",
    which is a triple: the word, the NVD-tag, and the POS-tag.
    """
    File2010 = path_nvd_2010
    obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
    obj2010 = json.loads(obj_text)
    File2011 = path_nvd_2011
    obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
    obj2011 = json.loads(obj_text)
    File2012 = path_nvd_2012
    obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
    obj2012 = json.loads(obj_text)
    File2013 = path_nvd_2013
    obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
    obj2013 = json.loads(obj_text)
    obj = {2010: obj2010, 2011: obj2011, 2012: obj2012, 2013: obj2013}
    for year in range(2010, 2014):
        print(year)
        for j in range(len(obj[year]["vertices"])):
            print(j)
            V = obj[year]["vertices"][j]
            t = V["description"].split(' ')
            ID = V["_id"]
            T = basic_tagger(t, ID)
            T = secondary_tagger(T)
            S = nltk.untag(T)
            S = nltk.pos_tag(S)
            for i in range(len(T)):
                T[i] = (T[i][0], T[i][1], S[i][1])
            obj[year]["vertices"][j]["tagged_text"] = T
        print("done with year ", year)
    print("Done")
    return obj
def mark_entities(tagged_sentence, entity_words, label):
    """
    tagged_sentence: [('Word', 'Tag'), ...]
    entity_words: ['This', 'is', 'an', 'entity']
    label: the entity type

    return a nltk.Tree instance with the entities wrapped in chunks
    """
    iob_tagged = [(w, t, 'O') for w, t in tagged_sentence]
    words = nltk.untag(tagged_sentence)
    start_index = sub_list(words, entity_words)
    if start_index is not None:
        iob_tagged[start_index] = (iob_tagged[start_index][0],
                                   iob_tagged[start_index][1],
                                   'B-' + label)
        for idx in range(1, len(entity_words)):
            iob_tagged[start_index + idx] = (iob_tagged[start_index + idx][0],
                                             iob_tagged[start_index + idx][1],
                                             'I-' + label)
    return nltk.conlltags2tree(iob_tagged)
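# A hypothetical call to mark_entities; the sentence, entity span, and label are
# invented, and the sub_list helper used above is assumed to be in scope.
sent = [("Barack", "NNP"), ("Obama", "NNP"), ("visited", "VBD"), ("Berlin", "NNP")]
tree = mark_entities(sent, ["Barack", "Obama"], "PERSON")
# tree is roughly:
# Tree('S', [Tree('PERSON', [('Barack', 'NNP'), ('Obama', 'NNP')]),
#            ('visited', 'VBD'), ('Berlin', 'NNP')])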
def replaceTextnumberWithNumber(text):
    tagged_number_words = ('ten/CD thousand/CD nine/CD hundred/CD ninety/CD '
                           'eight/CD seven/CD six/CD five/CD four/CD three/CD '
                           'two/CD one/CD eighty/CD seventy/CD sixty/CD fifty/CD '
                           'forty/CD thirty/CD twenty/CD nineteen/CD eighteen/CD '
                           'seventeen/CD sixteen/CD fifteen/CD fourteen/CD '
                           'thirteen/CD twelve/CD eleven/CD zero/CD')
    tagged_number_words_tuples = [nltk.tag.str2tuple(t)
                                  for t in tagged_number_words.split()]
    my_tagger = nltk.UnigramTagger([tagged_number_words_tuples],
                                   backoff=nltk.DefaultTagger('IGNORE'))
    my_grammar = 'NumberWord: {<CD>+}'
    parser = nltk.RegexpParser(my_grammar)
    parsed = parser.parse(my_tagger.tag(nltk.word_tokenize(text.lower())))
    for tag in [tree.leaves() for tree in parsed.subtrees()
                if tree.label() == 'NumberWord']:
        ut = nltk.untag(tag)
        num = w2n.word_to_num(' '.join(ut))
        r = re.compile(re.escape(' '.join(ut)), re.IGNORECASE)
        text = r.sub(str(num), text)
    return text
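# A quick usage sketch, assuming the word2number package is installed and imported
# as `from word2number import w2n` (as the call above implies).
print(replaceTextnumberWithNumber("I walked twenty five miles in three days"))
# -> "I walked 25 miles in 3 days"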
def test(self, testsents, clftype='argmax'):
    num = 0
    numsent = corsent = numword = corword = 0.0
    for row, sent in testsents.items():
        # print '#', row
        untagged = untag(sent)
        if clftype == 'naive':
            history = self.naive_tagsent(untagged)
        else:
            history = self.argmax(untagged)
        mistake = False
        numsent += 1
        for (i, (word, tag)) in enumerate(sent):
            # print word, ' tag: ', tag, ' tagged: ', history[i]
            numword += 1
            if tag == history[i]:
                corword += 1
            else:
                mistake = True
        if not mistake:
            corsent += 1
        # num += 1
        # if num > 20: break
    tokenacc = (corword / numword) * 100
    tweetacc = (corsent / numsent) * 100
    print 'Token Acc : ', tokenacc
    print 'Tweet Acc : ', tweetacc
    return tokenacc, tweetacc
def noneCount(simpleTaggedSents):
    # calculate number of unknown words
    noneCount = 0
    # flatten test list
    flattenedSents = [item for sublist in simpleTaggedSents for item in sublist]
    noneCount = sum(1 for (word, tag) in flattenedSents if tag == "None")
    return noneCount


print("Unknown words: %d" %
      noneCount(simpleUnigramTagger.tag_sents(nltk.untag(sent) for sent in test)))

# 3.1.4. Report the rate of unknown words per category.

# In[19]:

print("Unknown words (by category):")
for c in brown.categories():
    brown_sents = brown.tagged_sents(categories=c, tagset='universal')
    train_ = brown_sents[100:]
    test_ = brown_sents[:100]
    simpTag = SimpleUnigramTagger(train_)
    simpleTaggedSents = simpTag.tag_sents(nltk.untag(sent) for sent in test_)
    print("%s: %d" % (c, noneCount(simpleTaggedSents)))
def __fix_tag_scheme(self, sentence):
    untagged = nltk.untag(sentence)
    new_tags = nltk.pos_tag(untagged)
    return new_tags
def checkTaggerRecallForTag(tagger, tag, testCorpus):
    tagged_sents = tagger.batch_tag([nltk.untag(sent) for sent in testCorpus])  # sentences re-tagged by the tagger
    testTokens = sum(testCorpus, [])      # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags assigned by the tagger in use
    return calcRecall(tag, testTokens, taggerTokens)
corpus = SBCorpusReader(corpuspath)
tagged_sents = corpus.tagged_sents()
print(corpus.readme())
print("No. sentences:", len(tagged_sents))
print()

print("* Separating training and testing data (NLTK book, sec 5.2)")
breakpoint = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:breakpoint]
test_sents = tagged_sents[breakpoint:]
print("No. train sentences:", len(train_sents))
print("No. test sentences:", len(test_sents))
print()

seen_example = nltk.untag(tagged_sents[10])
unseen_example = nltk.untag(tagged_sents[-10])


def show_example(tagged_sent):
    return " ".join(map(nltk.tuple2str, tagged_sent))


start_time = time.process_time()

print("* Default tagger (NLTK book, sec 4.1)")
tags = [tag for sent in train_sents for (word, tag) in sent]
most_common_tag = nltk.FreqDist(tags).max()
print("Most common tag:", most_common_tag)
default_tagger = nltk.DefaultTagger(most_common_tag)
print("Seen:", show_example(default_tagger.tag(seen_example)))
print("Unseen:", show_example(default_tagger.tag(unseen_example)))
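# A possible next step in the NLTK-book progression, sketched here under the
# assumption that the script continues with backoff taggers: a unigram tagger
# that falls back to the default tagger above, reusing the variables already defined.
print("* Unigram tagger with backoff (NLTK book, secs 5.1 and 5.4)")
unigram_tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
print("Seen:", show_example(unigram_tagger.tag(seen_example)))
print("Unseen:", show_example(unigram_tagger.tag(unseen_example)))
print("Accuracy:", unigram_tagger.evaluate(test_sents))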
def extract_candidate_answers(passage, answer_type, question, stop_words):
    '''
    Attempts to return a list of possible answers from the passage.

    Answer types we need to support: PERSON, LOCATION, ORGANIZATION, NUMBER, DATE, FACT
    Answer types we currently support: PERSON, LOCATION, ORGANIZATION,
    NUMBER (very poorly), DATE (slightly less poorly)

    First attempt:
    For our first attempt we just use nltk.ne_chunk to crudely tag named entities.
    We then return all the entities equal to the answer type (which is why we only
    support PERSON, LOCATION, ORGANIZATION).

    Second attempt: very similar to the first attempt, but we include NUMBER and DATE.
    Numbers are found by using nltk to tag words, since it has a tag for a number.
    First, however, we find relevant snippets of the passage. We do this as:
        for each word in the question that is not a stop word:
            for each occurrence of the word in the passage:
                add [5 previous words, word, 5 next words] to the snippets.
    '''
    answer_type = answer_type.upper()
    candidate_answers = []

    # Get tokens from the question and remove those that are in the stop words
    question_tokens = nltk.word_tokenize(question)
    important_terms = [token for token in question_tokens
                       if token.lower() not in stop_words]
    snippets = []
    for term in important_terms:
        # Ganked this from some fool on Stack Overflow
        snippet_regex = r'((?:\S+\s*){,5})(%s)((?:\s*\S+\s*){,5})' % term
        for snippet_match in re.finditer(snippet_regex, passage, re.IGNORECASE):
            (before, term, after) = snippet_match.group(1, 2, 3)
            snippets.append(before + term + after)
    '''
    Problem: there could be duplicated snippets. For example, if the query was
    'Who was the oldest president', "oldest" and "president" would both be
    important_terms, so a phrase such as 'John Doe was the oldest president when
    he took office in 1994' would trigger the snippets 'John Doe was the oldest
    president when he took office' and 'John Doe was the oldest president when
    he took office in'.
    '''
    passage = " ".join(snippets)
    tokenized_passage = nltk.word_tokenize(passage)
    tagged_passage = nltk.pos_tag(tokenized_passage)

    if answer_type == "DATE":
        '''
        Dates are pretty hard to extract because they occur in so many different
        forms. Common date forms:
            blah (1990 - 2000).
            2010 (four-digit number)
            April 1st (month followed by a day)
            1st of April
        '''
        month_map = {'Jan': 'January', 'January': 'January', 'Feb': 'February',
                     'Febr': 'February', 'February': 'February', 'March': 'March',
                     'Marc': 'March', 'Mar': 'March', 'April': 'April',
                     'Apr': 'April', 'May': 'May', 'June': 'June', 'Jun': 'June',
                     'July': 'July', 'Jul': 'July', 'August': 'August',
                     'Aug': 'August', 'Sep': 'September', 'Sept': 'September',
                     'September': 'September', 'October': 'October',
                     'Oct': 'October', 'November': 'November', 'Nov': 'November',
                     'Dec': 'December', 'December': 'December'}
        # Attempts to match a month or month abbreviation followed by a day number
        # followed by a year
        month_date_year_regex = r'(Jan|January|Feb|Febr|February|March|Marc|Mar|' + \
                                r'Apr|April|May|June|Jun|Jul|July|Aug|August|' + \
                                r'Sept|Sep|September|October|Oct|Nov|November|Dec|December)' + \
                                r'[,.]?\s*(\d{1,2})(st|nd|rd|th)?,?\s*(\d{4})?'
        year_regex = r'\D(\d{4})\D'  # Match non-digit, then 4 digits, then a non-digit
        for month_date_year_match in re.finditer(month_date_year_regex, passage,
                                                 re.IGNORECASE):
            (month, day, _, year) = month_date_year_match.group(1, 2, 3, 4)
            month = month_map[month]
            if day[0] == '0':
                day = day[1:]
            if day == '':
                continue
            date_list = [x for x in [month, day, year] if x is not None]
            candidate_answers.append(" ".join(date_list))
        for year in re.finditer(year_regex, passage):
            candidate_answers.append(year.group(1))  # Year is group 1
    elif answer_type == 'FACT':
        pass
    elif answer_type == "NUMBER":
        grammar = r"""
            NUM : {<CD><NN.?>}
                  {<CD>}
        """
        cp = nltk.RegexpParser(grammar)
        parse_tree = cp.parse(tagged_passage)
        for node in parse_tree:
            if hasattr(node, 'node'):  # Must be a number
                candidate_answers.append(" ".join(nltk.untag(node[:])))
    else:
        ne_passage = nltk.ne_chunk(tagged_passage)
        # nltk's ne_chunk labels most locations as GPE, so map LOCATION to GPE here
        if answer_type == 'LOCATION':
            answer_type = 'GPE'
        for node in ne_passage:
            if hasattr(node, 'node'):  # Is a named entity
                if node.node == answer_type:
                    candidate_answers.append(" ".join(nltk.untag(node[:])))
    return candidate_answers
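# Hypothetical usage; the passage, question, and stop-word list are invented, and the
# exact output depends on the snippet and date regexes above.
passage = "Neil Armstrong walked on the Moon on July 20, 1969."
question = "When did Armstrong walk on the Moon"
answers = extract_candidate_answers(passage, "DATE", question, {"did", "on", "the"})
# `answers` should contain "July 20 1969" plus the bare year "1969"
# (possibly with duplicates from overlapping snippets).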