def gen_featsets(self, train_sents, rare_word_cutoff):
    """
    Generates featuresets for each token in the training sentences.

    @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
    @param train_sents: A list of tagged sentences.

    @type rare_word_cutoff: C{int}
    @param rare_word_cutoff: Words with fewer occurrences than
    C{rare_word_cutoff} will be treated differently by L{extract_feats}
    than non-rare words (cf. Ratnaparkhi 1996).

    @rtype: C{list} of C{tuple} of (C{dict}, C{str})
    @return: A list of tuples, each containing the featureset of a token
    and its POS tag.
    """
    featuresets = []
    for tagged_sent in train_sents:
        history = []
        untagged_sent = untag(tagged_sent)
        for (i, (_word, tag)) in enumerate(tagged_sent):
            featuresets.append(
                (self.extract_feats(untagged_sent, i, history, rare_word_cutoff), tag))
            history.append(tag)
    return featuresets
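# A minimal, self-contained sketch of how featuresets built this way are typically
# consumed (e.g., to train an NLTK classifier-based tagger). The tiny corpus and the
# toy_feats feature extractor below are invented stand-ins for the real extract_feats;
# only the looping pattern matches the method above.
import nltk
from nltk.tag import untag

toy_train_sents = [[("the", "DT"), ("dog", "NN"), ("barks", "VBZ")],
                   [("a", "DT"), ("cat", "NN"), ("sleeps", "VBZ")]]

def toy_feats(untagged_sent, i, history):
    # Current word plus the previous tag (or a start marker).
    return {"word": untagged_sent[i],
            "prevtag": history[i - 1] if i > 0 else "<START>"}

featuresets = []
for tagged_sent in toy_train_sents:
    history = []
    untagged_sent = untag(tagged_sent)
    for i, (_word, tag) in enumerate(tagged_sent):
        featuresets.append((toy_feats(untagged_sent, i, history), tag))
        history.append(tag)

classifier = nltk.NaiveBayesClassifier.train(featuresets)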
def both_tags():
    """
    Returns a dictionary, obj, where obj is the graphson file dictionary.
    This adds a new field, "tagged_text", which is a list of triples:
    the word, the NVD tag, and the POS tag.
    """
    File = path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []
    for j in range(len(obj["vertices"])):
        print(j)
        V = obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t = V["Metasploit-Description"].split(' ')
            ID = V["Metasploit-CVEid"]
            T, keep = basic_tagger(t, ID)
            if keep == 1:
                T = secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S = nltk.untag(T)
                # Tag S with the part of speech of each element
                S = nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i] = (T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"] = T
        if V["Metasploit-CVEid"] == '':
            obj["vertices"][j]["tagged_text"] = ''
    print("Done")
    return obj
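# A minimal sketch of the (word, NVD-tag, POS-tag) triple construction used above.
# The NVD-style tags here are invented for illustration; in the real pipeline they
# come from basic_tagger/secondary_tagger, which are not shown in this snippet.
import nltk

T = [("buffer", "O"), ("overflow", "O"), ("in", "O"),
     ("Acrobat", "sw.product"), ("Reader", "sw.product")]
S = nltk.pos_tag(nltk.untag(T))   # POS-tag the bare words
T = [(word, nvd, pos) for (word, nvd), (_w, pos) in zip(T, S)]
# Each word now carries both its NVD tag and its POS tag.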
def test(self, testsents):
    num = 0
    numsent = corsent = numword = corword = 0.0
    for row, sent in testsents.items():
        # print '#', row
        untagged = untag(sent)
        history = self.viterbi(untagged)
        mistake = False
        numsent += 1
        for (i, (word, tag)) in enumerate(sent):
            # print word, ' tag: ', tag, ' tagged: ', history[i]
            numword += 1
            if tag == history[i]:
                corword += 1
            else:
                mistake = True
        if not mistake:
            corsent += 1
        num += 1
        if num > 20:
            break
    tokenacc = (corword / numword) * 100
    tweetacc = (corsent / numsent) * 100
    print 'Token Acc : ', tokenacc
    print 'Sent Acc : ', tweetacc
    return tokenacc, tweetacc
def train2(self, trainset, iterations=10, a0=1, rare_word_cutoff=5,
           rare_feat_cutoff=5):
    self.gen_feats(trainset, rare_word_cutoff, rare_feat_cutoff)
    self.M = len(self.featurenum)
    self.W = np.random.rand(self.M)
    W = self.W
    A = np.copy(W)
    for i in xrange(iterations):
        rate = a0 / (1 + sqrt(i))
        prevnorm = la.norm(W)
        for (k, (row, sent)) in enumerate(trainset.items()):
            # print 'Iter %d Sample %d' % (i, k)
            untagged = untag(sent)
            gold = [tag for w, tag in sent]
            predict = self.viterbi(untagged)
            if predict != gold:
                for j, x in enumerate(self.X[k]):
                    # promote gold
                    for featind in x['f'][gold[j]]:
                        W[featind] += rate
                    # demote predicted
                    for featind in x['f'][predict[j]]:
                        W[featind] -= rate
        curnorm = la.norm(W)
        A = add(A, W)
        # print 'Train: iter ', i, ' prevnorm: ', prevnorm, ' curnorm: ', curnorm, ' del: ', abs(curnorm - prevnorm)
    self.W = np.copy(A / (iterations * len(self.W)))
def both_tags():
    """
    Returns a dictionary, obj, where obj is the graphson file dictionary.
    This adds a new field, "tagged_text", which is a list of triples:
    the word, the NVD tag, and the POS tag.
    """
    File = path_metasploit
    obj_text = codecs.open(File, 'r', encoding='utf-8').read()
    obj = json.loads(obj_text)
    #obj["vertices"] = [obj["vertices"][474]]
    #obj["edges"] = []
    for j in xrange(len(obj["vertices"])):
        print j
        V = obj["vertices"][j]
        if V["Metasploit-CVEid"] != '':
            t = V["Metasploit-Description"].split(' ')
            ID = V["Metasploit-CVEid"]
            T, keep = basic_tagger(t, ID)
            if keep == 1:
                T = secondary_tagger(T)
                # Strip the NVD tags from T, keeping only the words
                S = nltk.untag(T)
                # Tag S with the part of speech of each element
                S = nltk.pos_tag(S)
                # Store tagged text as a triple: (word, NVD tag, POS tag)
                for i in range(len(T)):
                    T[i] = (T[i][0], T[i][1], S[i][1])
                obj["vertices"][j]["tagged_text"] = T
        if V["Metasploit-CVEid"] == '':
            obj["vertices"][j]["tagged_text"] = ''
    print "Done"
    return obj
def evaluate(self, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    :type gold: list(list(tuple(str, str)))
    :param gold: The list of tagged sentences to score the tagger on.
    :rtype: float
    """
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    correct_tags = dict()
    total_tags = dict()
    print test_tokens
    for i in range(0, len(test_tokens)):
        tag = test_tokens[i][1]
        if tag in total_tags:
            total_tags[tag] += 1
        else:
            total_tags[tag] = 1
        if gold_tokens[i][1] == test_tokens[i][1]:
            tag = test_tokens[i][1]
            if tag in correct_tags:
                correct_tags[tag] += 1
            else:
                correct_tags[tag] = 1
    for tag in correct_tags:
        print tag, ':\t' + str(correct_tags[tag] / float(total_tags[tag]))
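# A rough, self-contained sketch of the same per-tag accuracy computation, using an
# off-the-shelf UnigramTagger and a small Brown-corpus sample in place of `self` and
# `gold` (both are assumptions; the Brown corpus must be available via nltk data).
import nltk
from nltk.corpus import brown
from nltk.tag import untag

sents = brown.tagged_sents(categories="news")
tagger = nltk.UnigramTagger(sents[100:600])
gold = sents[:100]

gold_tokens = [tok for sent in gold for tok in sent]
test_tokens = [tok for sent in tagger.tag_sents(untag(s) for s in gold) for tok in sent]

total_tags, correct_tags = nltk.FreqDist(), nltk.FreqDist()
for gold_tok, test_tok in zip(gold_tokens, test_tokens):
    total_tags[test_tok[1]] += 1
    if gold_tok[1] == test_tok[1]:
        correct_tags[test_tok[1]] += 1

for tag in correct_tags:
    print(tag, correct_tags[tag] / total_tags[tag])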
def extract_relationship(extraction_file):
    characters, dialogues = order_phrases(extraction_file)
    print(characters)
    #with open("MultiNaiveBayesClassifier", "rb") as f:
    #    classifier = pickle.load(f)
    previous_speaker = None
    last_mentioned_entity = {"male": None, "female": None}
    relationship = []
    #result = open("result", "w")
    i = 1
    num_of_phrases = len(dialogues)
    for phrase in dialogues:
        speaker = phrase[0]
        entity = None
        attitude = None
        for sent in phrase[1]:
            word_tag = nltk.pos_tag(nltk.word_tokenize(sent))
            mother_tree = nltk.ne_chunk(word_tag)
            for tree in mother_tree:
                if hasattr(tree, 'label') and tree.label:
                    if tree.label() == 'PERSON':
                        entity_name = ' '.join([child[0] for child in tree])
                        #print(entity_name)
                        if entity_name.title() not in characters:
                            continue
                        last_mentioned_entity[characters[entity_name.title()]] = entity_name.title()
                        entity = entity_name.title()
            for pair in word_tag:
                if pair[1] == "PRP":  # personal pronoun
                    if pair[0].lower() in ("he", "him"):
                        entity = last_mentioned_entity["male"]
                    elif pair[0].lower() in ("she", "her"):
                        entity = last_mentioned_entity["female"]
            #feat = sc.find_features(nltk.untag(word_tag))
            #attitude = classifier.classify(feat)
            color = sc.find_features(nltk.untag(word_tag))
            #result.write("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))
            #print("Speaks: {}\nTo: {}\nAttitude: {}\nSaid: {}\n\n".format(speaker, entity, attitude, sent))
            relationship.append((speaker, entity, color))
        print((i / num_of_phrases) * 100)
        i += 1
    return relationship
def ConfusionMatrix(self, corpus_test):
    matrix = FreqDist()
    tagged_sents = self.batch_tag([nltk.untag(sent) for sent in corpus_test])
    testTokens = sum(corpus_test, [])      # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])   # tags assigned by the tagger in use
    for tagged, test in izip(taggerTokens, testTokens):
        if tagged != test:
            matrix.inc((tagged[1], test[1]))
    return matrix
def __init__(self, train_sents):
    # train_sents: [["","","",""], ["","","",""]]
    train_sets = []
    for tagged_sent in train_sents:
        untagged_sent = nltk.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            train_sets.append((pos_feature_tag(untagged_sent, i, history), tag))
            history.append(tag)
    self.classifier = nltk.classify.NaiveBayesClassifier.train(train_sets)
def gen_featsets(self, train_sents, rare_word_cutoff):
    featuresets = []
    for tagged_sent in train_sents:
        history = []
        untagged_sent = untag(tagged_sent)
        for (i, (_word, tag)) in enumerate(tagged_sent):
            featuresets.append(
                (self.extract_feats(untagged_sent, i, history, rare_word_cutoff), tag))
            history.append(tag)
    return featuresets
def gen_feats(self, trainsents, rare_word_cutoff, rare_feat_cutoff=5):
    features = defaultdict(int)
    self.X = []
    self.word_freqdist = self.gen_word_freqdist(trainsents)
    for row, sent in trainsents.items():
        history = None
        untagged = untag(sent)
        # nltktags = pos_tag(untagged)
        # print nltktags
        for (i, (word, tag)) in enumerate(sent):
            x = dict()
            feature = self.extract_feat(untagged, i, history, rare_word_cutoff)
            # feature['nltk_tag'] = nltktags[i]
            x['features'] = feature
            x['tag'] = tag
            x['target_feat'] = self.phi(feature, tag)
            self.X.append(x)
            history = tag
            for f in x['target_feat']:
                features[f] += 1
    self.featuresets = OrderedDict(
        sorted(features.items(), key=lambda t: t[1], reverse=True))
    # cutoff rare features
    # self.featuresets = OrderedDict((key, val) for (key, val) in self.featuresets.iteritems() if val > rare_feat_cutoff)
    self.featurenum = dict()
    # for f, count in self.featuresets.items():
    #     print f, count
    for (i, ((f, val), tag)) in enumerate(self.featuresets.iterkeys()):
        self.featurenum[((f, val), tag)] = i
        self.tags[tag] += 1
    # print self.tags
    for x in self.X:
        x['f'] = dict()
        for tag in self.tags.iterkeys():
            x['f'][tag] = self.getactivef(self.phi(x['features'], tag))
        x['target_feat'] = x['f'][x['tag']]
        # print x['features'], x['target_feat']
    print '#Features', len(self.featuresets)
def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
    train_set = []
    for tagged_sent in train_sents:
        untagged_sent = untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = feature_extractor(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    classifier = classifier_cls.train(train_set, **kwargs)
    return cls(feature_extractor, classifier)
def both_tags():
    """
    returns a dictionary, obj, with a key for each year. obj[year] is the
    graphson file dictionary. This adds a new field, "tagged_text", which is
    a triple: the word, the NVD-tag, and the POS-tag
    """
    obj = {}
    files = os.listdir(path)
    #files = ['MSSecurityData90.graphson']
    for file_num in range(0, len(files)):
        obj_text = codecs.open(path + files[file_num], 'r', encoding='utf-8').read()
        current_obj = json.loads(obj_text)
        obj[file_num] = current_obj
    for file_num in xrange(0, len(files)):
        print file_num
        print files[file_num]
        for j in xrange(len(obj[file_num]["vertices"])):
            obj[file_num]["vertices"][j]["tagged_text"] = []
            t = ""
            # Combine the descriptions in a given file
            for description in ['MS-Description', 'MS-ExecutiveSummary',
                                'MS-ImpactDescription', 'MS-MitigationDescription',
                                'MS-TargetSetDescription', 'MS-Title',
                                'MS-WorkaroundDescription']:
                V = obj[file_num]["vertices"][j]
                if description in V.keys():
                    if description == 'MS-Title' or description == 'MS-ExecutiveSummary':
                        V[description] = [V[description]]
                    for i in range(0, len(V[description])):
                        t = t + ' ' + V[description][i]
            # Perform the tagging
            if V["_id"] != '':
                t = t.split(' ')
                ID = V["_id"]
                T, keep = basic_tagger(t, ID)
                if keep == 1:  # only keep it if it found a matching cpe vector
                    T = secondary_tagger(T)
                    S = nltk.untag(T)
                    S = nltk.pos_tag(S)
                    for i in range(len(T)):
                        T[i] = (T[i][0], T[i][1], S[i][1])
                    obj[file_num]["vertices"][j]["tagged_text"].extend(T)
        print "done with file ", file_num
    print "Done"
    return obj
def MicroEvaluate(self, corpus_test):
    tagged_sents = self.batch_tag([nltk.untag(sent) for sent in corpus_test])  # sentences re-tagged by the tagger
    testTokens = sum(corpus_test, [])     # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags assigned by the tagger in use
    tags = []  # all possible tags ------------------ TODO
    for x in testTokens:
        w, t = x
        if t not in tags:
            tags.append(t)
    fmeasure = 0
    for tag in tags:
        fmeasure += calcFMeasur(tag, testTokens, taggerTokens)
    if len(tags) == 0:
        return 0
    return fmeasure / len(tags)
def getDifficultTags(tagger, testCorpus, x, tagsSet):
    difficultTags = []
    precs = []
    # Defines which tags are being checked: full or simplified tags. If simplified,
    # the tagger and testCorpus are assumed to use the simplified tag set.
    corpusTokens = sum(testCorpus, [])
    # calculate precision for each tag
    tagger_tags = tagger.batch_tag([nltk.untag(sent) for sent in testCorpus])
    taggedTokens = sum(tagger_tags, [])
    for t in tagsSet:
        p = calcPrec(t, corpusTokens, taggedTokens)
        precs.append((t, p))
    # insert the x lowest-precision tags into difficultTags
    precs = sorted(precs, key=itemgetter(1))
    for w, p in precs:
        if len(difficultTags) < x:
            difficultTags.append(w)
    return difficultTags
def evaluate(self, level=2):
    """
    Evaluates the trained POSTagger model on test data - computes accuracy
    and frequency distribution of wrong predictions

    Argument:
    ----------
        level (int): Type of tagger to be returned - '0' corresponds to default
            tagger, '1' corresponds to a unigram tagger, '2' corresponds to a
            bigram tagger and '3' corresponds to a trigram tagger, with each of
            the previous levels as backoffs

    Returns:
    --------
        fd (nltk.FreqDist): Frequency Distribution of wrong predictions
    """
    sentences, tagged_sentences = self.data_preparation()
    partition = int(len(tagged_sentences) * self.partition_ratio)
    train_set = tagged_sentences[:partition]
    test_set = tagged_sentences[partition:]
    print(len(train_set), len(test_set))

    tagger = self.tagger(train_set, level)
    accuracy = tagger.evaluate(test_set)
    print(f'Accuracy is {accuracy}')

    predictions = [(word, tag) for sentence in test_set
                   for (word, tag) in tagger.tag(nltk.untag(sentence))]
    wrong_predictions = [
        (word, tag, actual)
        for ((word, tag), (_, actual)) in zip(
            predictions,
            [(w, t) for sentence in test_set for (w, t) in sentence])
        if tag != actual and tag is not None
    ]
    fd = nltk.FreqDist(wrong_predictions)
    print('Performing analysis...')
    print('Frequency Distribution of wrong predictions...')
    return fd
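# A plausible sketch of what self.tagger(train_set, level) might construct, based
# only on the docstring above; the real implementation is not shown here, and the
# 'NN' default tag is an assumed placeholder.
import nltk

def build_backoff_tagger(train_set, level=2):
    t0 = nltk.DefaultTagger('NN')                    # level 0: default tagger
    t1 = nltk.UnigramTagger(train_set, backoff=t0)   # level 1: unigram with backoff
    t2 = nltk.BigramTagger(train_set, backoff=t1)    # level 2: bigram with backoff
    t3 = nltk.TrigramTagger(train_set, backoff=t2)   # level 3: trigram with backoff
    return [t0, t1, t2, t3][level]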
def both_tags():
    """
    Returns a dictionary, called "obj", with a key for each year. obj[year]
    is the graphson file dictionary. This adds a new field, "tagged_text",
    which is a triple: the word, the NVD-tag, and the POS-tag.
    """
    File2010 = path_nvd_2010
    obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
    obj2010 = json.loads(obj_text)
    File2011 = path_nvd_2011
    obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
    obj2011 = json.loads(obj_text)
    File2012 = path_nvd_2012
    obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
    obj2012 = json.loads(obj_text)
    File2013 = path_nvd_2013
    obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
    obj2013 = json.loads(obj_text)
    obj = {2010: obj2010, 2011: obj2011, 2012: obj2012, 2013: obj2013}
    for year in xrange(2010, 2014):
        print year
        for j in xrange(len(obj[year]["vertices"])):
            print j
            V = obj[year]["vertices"][j]
            t = V["description"].split(' ')
            ID = V["_id"]
            T = basic_tagger(t, ID)
            T = secondary_tagger(T)
            S = nltk.untag(T)
            S = nltk.pos_tag(S)
            for i in range(len(T)):
                T[i] = (T[i][0], T[i][1], S[i][1])
            obj[year]["vertices"][j]["tagged_text"] = T
        print "done with year ", year
    print "Done"
    return obj
def both_tags():
    """
    Returns a dictionary, called "obj", with a key for each year. obj[year]
    is the graphson file dictionary. This adds a new field, "tagged_text",
    which is a triple: the word, the NVD-tag, and the POS-tag.
    """
    File2010 = path_nvd_2010
    obj_text = codecs.open(File2010, 'r', encoding='utf-8').read()
    obj2010 = json.loads(obj_text)
    File2011 = path_nvd_2011
    obj_text = codecs.open(File2011, 'r', encoding='utf-8').read()
    obj2011 = json.loads(obj_text)
    File2012 = path_nvd_2012
    obj_text = codecs.open(File2012, 'r', encoding='utf-8').read()
    obj2012 = json.loads(obj_text)
    File2013 = path_nvd_2013
    obj_text = codecs.open(File2013, 'r', encoding='utf-8').read()
    obj2013 = json.loads(obj_text)
    obj = {2010: obj2010, 2011: obj2011, 2012: obj2012, 2013: obj2013}
    for year in range(2010, 2014):
        print(year)
        for j in range(len(obj[year]["vertices"])):
            print(j)
            V = obj[year]["vertices"][j]
            t = V["description"].split(' ')
            ID = V["_id"]
            T = basic_tagger(t, ID)
            T = secondary_tagger(T)
            S = nltk.untag(T)
            S = nltk.pos_tag(S)
            for i in range(len(T)):
                T[i] = (T[i][0], T[i][1], S[i][1])
            obj[year]["vertices"][j]["tagged_text"] = T
        print("done with year ", year)
    print("Done")
    return obj
def mark_entities(tagged_sentence, entity_words, label):
    """
    tagged_sentence: [('Word', 'Tag'), ...]
    entity_words: ['This', 'is', 'an', 'entity']
    label: the entity type

    return a nltk.Tree instance with the entities wrapped in chunks
    """
    iob_tagged = [(w, t, 'O') for w, t in tagged_sentence]
    words = nltk.untag(tagged_sentence)
    start_index = sub_list(words, entity_words)
    if start_index is not None:
        iob_tagged[start_index] = (iob_tagged[start_index][0],
                                   iob_tagged[start_index][1],
                                   'B-' + label)
        for idx in range(1, len(entity_words)):
            iob_tagged[start_index + idx] = (iob_tagged[start_index + idx][0],
                                             iob_tagged[start_index + idx][1],
                                             'I-' + label)
    return nltk.conlltags2tree(iob_tagged)
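# A hypothetical call to mark_entities; the sentence, entity span, and label are
# invented, and the sub_list helper used above is assumed to be in scope.
sent = [("Barack", "NNP"), ("Obama", "NNP"), ("visited", "VBD"), ("Berlin", "NNP")]
tree = mark_entities(sent, ["Barack", "Obama"], "PERSON")
# tree is roughly:
# Tree('S', [Tree('PERSON', [('Barack', 'NNP'), ('Obama', 'NNP')]),
#            ('visited', 'VBD'), ('Berlin', 'NNP')])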
def replaceTextnumberWithNumber(text):
    tagged_number_words = ('ten/CD thousand/CD nine/CD hundred/CD ninety/CD '
                           'eight/CD seven/CD six/CD five/CD four/CD three/CD '
                           'two/CD one/CD eighty/CD seventy/CD sixty/CD fifty/CD '
                           'forty/CD thirty/CD twenty/CD nineteen/CD eighteen/CD '
                           'seventeen/CD sixteen/CD fifteen/CD fourteen/CD '
                           'thirteen/CD twelve/CD eleven/CD zero/CD')
    tagged_number_words_tuples = [nltk.tag.str2tuple(t)
                                  for t in tagged_number_words.split()]
    my_tagger = nltk.UnigramTagger([tagged_number_words_tuples],
                                   backoff=nltk.DefaultTagger('IGNORE'))
    my_grammar = 'NumberWord: {<CD>+}'
    parser = nltk.RegexpParser(my_grammar)
    parsed = parser.parse(my_tagger.tag(nltk.word_tokenize(text.lower())))
    for tag in [tree.leaves() for tree in parsed.subtrees()
                if tree.label() == 'NumberWord']:
        ut = nltk.untag(tag)
        num = w2n.word_to_num(' '.join(ut))
        r = re.compile(re.escape(' '.join(ut)), re.IGNORECASE)
        text = r.sub(str(num), text)
    return text
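# A quick usage sketch, assuming the word2number package is installed and imported
# as `from word2number import w2n` (as the call above implies).
print(replaceTextnumberWithNumber("I walked twenty five miles in three days"))
# -> "I walked 25 miles in 3 days"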
def test(self, testsents, clftype='argmax'):
    num = 0
    numsent = corsent = numword = corword = 0.0
    for row, sent in testsents.items():
        # print '#', row
        untagged = untag(sent)
        if clftype == 'naive':
            history = self.naive_tagsent(untagged)
        else:
            history = self.argmax(untagged)
        mistake = False
        numsent += 1
        for (i, (word, tag)) in enumerate(sent):
            # print word, ' tag: ', tag, ' tagged: ', history[i]
            numword += 1
            if tag == history[i]:
                corword += 1
            else:
                mistake = True
        if not mistake:
            corsent += 1
        # num += 1
        # if num > 20: break
    tokenacc = (corword / numword) * 100
    tweetacc = (corsent / numsent) * 100
    print 'Token Acc : ', tokenacc
    print 'Tweet Acc : ', tweetacc
    return tokenacc, tweetacc
def noneCount(simpleTaggedSents):
    # calculate number of unknown words
    noneCount = 0
    # flatten test list
    flattenedSents = [item for sublist in simpleTaggedSents for item in sublist]
    noneCount = sum(1 for (word, tag) in flattenedSents if tag == "None")
    return noneCount


print("Unknown words: %d" %
      noneCount(simpleUnigramTagger.tag_sents(nltk.untag(sent) for sent in test)))

# 3.1.4. Report the rate of unknown words per category.

# In[19]:

print("Unknown words (by category):")
for c in brown.categories():
    brown_sents = brown.tagged_sents(categories=c, tagset='universal')
    train_ = brown_sents[100:]
    test_ = brown_sents[:100]
    simpTag = SimpleUnigramTagger(train_)
    simpleTaggedSents = simpTag.tag_sents(nltk.untag(sent) for sent in test_)
    print("%s: %d" % (c, noneCount(simpleTaggedSents)))
def __fix_tag_scheme(self, sentence):
    untagged = nltk.untag(sentence)
    new_tags = nltk.pos_tag(untagged)
    return new_tags
def checkTaggerRecallForTag(tagger, tag, testCorpus):
    tagged_sents = tagger.batch_tag([nltk.untag(sent) for sent in testCorpus])  # sentences re-tagged by the tagger
    testTokens = sum(testCorpus, [])      # real tags from the corpus
    taggerTokens = sum(tagged_sents, [])  # tags assigned by the tagger in use
    return calcRecall(tag, testTokens, taggerTokens)
corpus = SBCorpusReader(corpuspath)
tagged_sents = corpus.tagged_sents()
print(corpus.readme())
print("No. sentences:", len(tagged_sents))
print()

print("* Separating training and testing data (NLTK book, sec 5.2)")
breakpoint = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:breakpoint]
test_sents = tagged_sents[breakpoint:]
print("No. train sentences:", len(train_sents))
print("No. test sentences:", len(test_sents))
print()

seen_example = nltk.untag(tagged_sents[10])
unseen_example = nltk.untag(tagged_sents[-10])


def show_example(tagged_sent):
    return " ".join(map(nltk.tuple2str, tagged_sent))


start_time = time.process_time()

print("* Default tagger (NLTK book, sec 4.1)")
tags = [tag for sent in train_sents for (word, tag) in sent]
most_common_tag = nltk.FreqDist(tags).max()
print("Most common tag:", most_common_tag)
default_tagger = nltk.DefaultTagger(most_common_tag)
print("Seen:", show_example(default_tagger.tag(seen_example)))
print("Unseen:", show_example(default_tagger.tag(unseen_example)))
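# A possible next step in the NLTK-book progression, sketched here under the
# assumption that the script continues with backoff taggers: a unigram tagger
# that falls back to the default tagger above, reusing the variables already defined.
print("* Unigram tagger with backoff (NLTK book, secs 5.1 and 5.4)")
unigram_tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
print("Seen:", show_example(unigram_tagger.tag(seen_example)))
print("Unseen:", show_example(unigram_tagger.tag(unseen_example)))
print("Accuracy:", unigram_tagger.evaluate(test_sents))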
def extract_candidate_answers(passage, answer_type, question, stop_words):
    '''
    Attempts to return a list of possible answers from the passage.

    Answer types we need to support: PERSON, LOCATION, ORGANIZATION, NUMBER, DATE, FACT
    Answer types we currently support: PERSON, LOCATION, ORGANIZATION,
    NUMBER (very poorly), DATE (slightly less poorly)

    First attempt:
    For our first attempt we just use nltk.ne_chunk to crudely tag named entities.
    We then return all the entities equal to the answer type (which is why we only
    support PERSON, LOCATION, ORGANIZATION).

    Second attempt: very similar to the first attempt, but we include NUMBER and DATE.
    Numbers are found by using nltk to tag words, since it has a tag for a number.
    First, however, we find relevant snippets of the passage. We do this as:
        for each word in the question that is not a stop word:
            for each occurrence of the word in the passage:
                add [5 previous words, word, 5 next words] to the snippets.
    '''
    answer_type = answer_type.upper()
    candidate_answers = []

    # Get tokens from the question and remove those that are in the stop words
    question_tokens = nltk.word_tokenize(question)
    important_terms = [token for token in question_tokens
                       if token.lower() not in stop_words]
    snippets = []
    for term in important_terms:
        # Ganked this from some fool on Stack Overflow
        snippet_regex = r'((?:\S+\s*){,5})(%s)((?:\s*\S+\s*){,5})' % term
        for snippet_match in re.finditer(snippet_regex, passage, re.IGNORECASE):
            (before, term, after) = snippet_match.group(1, 2, 3)
            snippets.append(before + term + after)
    '''
    Problem: there could be duplicated snippets. For example, if the query was
    'Who was the oldest president', "oldest" and "president" would both be
    important_terms, so a phrase such as 'John Doe was the oldest president when
    he took office in 1994' would trigger the snippets 'John Doe was the oldest
    president when he took office' and 'John Doe was the oldest president when
    he took office in'.
    '''
    passage = " ".join(snippets)
    tokenized_passage = nltk.word_tokenize(passage)
    tagged_passage = nltk.pos_tag(tokenized_passage)

    if answer_type == "DATE":
        '''
        Dates are pretty hard to extract because they occur in so many different
        forms. Common date forms:
            blah (1990 - 2000).
            2010 (four-digit number)
            April 1st (month followed by a day)
            1st of April
        '''
        month_map = {'Jan': 'January', 'January': 'January', 'Feb': 'February',
                     'Febr': 'February', 'February': 'February', 'March': 'March',
                     'Marc': 'March', 'Mar': 'March', 'April': 'April',
                     'Apr': 'April', 'May': 'May', 'June': 'June', 'Jun': 'June',
                     'July': 'July', 'Jul': 'July', 'August': 'August',
                     'Aug': 'August', 'Sep': 'September', 'Sept': 'September',
                     'September': 'September', 'October': 'October',
                     'Oct': 'October', 'November': 'November', 'Nov': 'November',
                     'Dec': 'December', 'December': 'December'}
        # Attempts to match a month or month abbreviation followed by a day number
        # followed by a year
        month_date_year_regex = r'(Jan|January|Feb|Febr|February|March|Marc|Mar|' + \
                                r'Apr|April|May|June|Jun|Jul|July|Aug|August|' + \
                                r'Sept|Sep|September|October|Oct|Nov|November|Dec|December)' + \
                                r'[,.]?\s*(\d{1,2})(st|nd|rd|th)?,?\s*(\d{4})?'
        year_regex = r'\D(\d{4})\D'  # Match non-digit, then 4 digits, then a non-digit
        for month_date_year_match in re.finditer(month_date_year_regex, passage,
                                                 re.IGNORECASE):
            (month, day, _, year) = month_date_year_match.group(1, 2, 3, 4)
            month = month_map[month]
            if day[0] == '0':
                day = day[1:]
            if day == '':
                continue
            date_list = [x for x in [month, day, year] if x is not None]
            candidate_answers.append(" ".join(date_list))
        for year in re.finditer(year_regex, passage):
            candidate_answers.append(year.group(1))  # Year is group 1
    elif answer_type == 'FACT':
        pass
    elif answer_type == "NUMBER":
        grammar = r"""
            NUM : {<CD><NN.?>}
                  {<CD>}
        """
        cp = nltk.RegexpParser(grammar)
        parse_tree = cp.parse(tagged_passage)
        for node in parse_tree:
            if hasattr(node, 'node'):  # Must be a number
                candidate_answers.append(" ".join(nltk.untag(node[:])))
    else:
        ne_passage = nltk.ne_chunk(tagged_passage)
        # nltk's ne_chunk labels most locations as GPE, so map LOCATION to GPE here
        if answer_type == 'LOCATION':
            answer_type = 'GPE'
        for node in ne_passage:
            if hasattr(node, 'node'):  # Is a named entity
                if node.node == answer_type:
                    candidate_answers.append(" ".join(nltk.untag(node[:])))
    return candidate_answers
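# Hypothetical usage; the passage, question, and stop-word list are invented, and the
# exact output depends on the snippet and date regexes above.
passage = "Neil Armstrong walked on the Moon on July 20, 1969."
question = "When did Armstrong walk on the Moon"
answers = extract_candidate_answers(passage, "DATE", question, {"did", "on", "the"})
# `answers` should contain "July 20 1969" plus the bare year "1969"
# (possibly with duplicates from overlapping snippets).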