def most_informative_features(self, n=100):
    """
    Return a list of the 'most informative' features used by this
    classifier.  For the purpose of this function, the
    informativeness of a feature C{(fname,fval)} is equal to the
    highest value of P(fname=fval|label), for any label, divided by
    the lowest value of P(fname=fval|label), for any label::

      max[ P(fname=fval|label1) / P(fname=fval|label2) ]
    """
    # The set of (fname, fval) pairs used by this classifier.
    features = set()
    # The max & min probability associated w/ each (fname, fval)
    # pair.  Maps (fname,fval) -> float.
    maxprob = defaultdict(lambda: 0.0)
    minprob = defaultdict(lambda: 1.0)

    for (label, fname), probdist in list(self._feature_probdist.items()):
        for fval in probdist.samples():
            feature = (fname, fval)
            features.add(feature)
            p = probdist.prob(fval)
            maxprob[feature] = max(p, maxprob[feature])
            minprob[feature] = min(p, minprob[feature])
            if minprob[feature] == 0:
                features.discard(feature)

    # Convert features to a list, & sort it by how informative
    # features are (smallest min/max ratio first).
    features = sorted(features,
                      key=lambda feature: minprob[feature] / maxprob[feature])
    return features[:n]
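# --- Usage sketch of the min/max bookkeeping above, with toy probabilities
# standing in for NLTK's probability distributions (all numbers assumed). ---
from collections import defaultdict

probs = {('pos', 'contains(great)'): {True: 0.30, False: 0.70},
         ('neg', 'contains(great)'): {True: 0.05, False: 0.95}}

maxprob = defaultdict(lambda: 0.0)
minprob = defaultdict(lambda: 1.0)
for (label, fname), dist in probs.items():
    for fval, p in dist.items():
        feature = (fname, fval)
        maxprob[feature] = max(p, maxprob[feature])
        minprob[feature] = min(p, minprob[feature])

# Smallest min/max ratio first, i.e. most informative first.
ranked = sorted(maxprob, key=lambda f: minprob[f] / maxprob[f])
print(ranked[0])  # ('contains(great)', True): 0.05/0.30 is the most skewed ratio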
def __init__(self):
    self.feature_weights = defaultdict(lambda: ('default', 0))
    self.trained = False
    self.largest = defaultdict(lambda: ('default name', 'default value', 0))
    self.stopset = set(stopwords.words('english'))
    self.my_feats = self.bigram_word_feats
    self.train(self.my_feats)
    #self.book_train(self.my_feats)
    self.calculate_weights()
    # Retrain once the feature weights have been calculated.
    self.train(self.my_feats)
def train(self, feats): print "Starting to train the data" start = datetime.datetime.now() print "setting the ids", datetime.datetime.now() self.negids = movie_reviews.fileids('neg') self.posids = movie_reviews.fileids('pos') #random.shuffle(self.negids) #random.shuffle(self.posids) ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] + ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids]) ##random.shuffle(self.reviews) ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:]) ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4]) print "setting the feats", datetime.datetime.now() self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids] self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids] self.negcutoff = len(self.negfeats)*3/4 self.poscutoff = len(self.posfeats)*3/4 print "setting the train/test", datetime.datetime.now() self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff] self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:] print "training", datetime.datetime.now() self.classifier = NaiveBayesClassifier.train(self.trainfeats) ##self.classifier = NaiveBayesClassifier.train(self.train_set) self.refsets = defaultdict(set) self.testsets = defaultdict(set) print "accuracy stuff", datetime.datetime.now() for i, (feats, label) in enumerate(self.testfeats): ##for i, (feats, label) in enumerate(self.test_set): self.refsets[label].add(i) observed = self.classifier.classify(feats) self.testsets[observed].add(i) end = datetime.datetime.now() print "Training lasted for ", end-start print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats) ##print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set) print 'pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']) print 'pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']) print 'neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']) print 'neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']) self.classifier.show_most_informative_features() self.trained = True
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    @param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    @param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use C{key=lambda s:s.lower()}, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
       was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
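# --- Standalone sketch of the defaultdict(list) offsets index built above,
# with a hypothetical token list and a lower-casing key function. ---
from collections import defaultdict

tokens = ['The', 'cat', 'saw', 'the', 'dog']
key = lambda s: s.lower()

offsets = defaultdict(list)
for index, word in enumerate(tokens):
    offsets[key(word)].append(index)

print(offsets['the'])  # [0, 3] -- both occurrences, case-insensitively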
def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
          support_cutoff=10, binary=False, feature_values=None,
          verbose=False):
    """
    @param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.
    """
    # Collect a list of all feature names.
    feature_names = set()
    for featureset, label in labeled_featuresets:
        for fname in featureset:
            feature_names.add(fname)

    # Collect a list of the values each feature can take.
    if feature_values is None and binary:
        feature_values = defaultdict(set)
        for featureset, label in labeled_featuresets:
            for fname, fval in featureset.items():
                feature_values[fname].add(fval)

    # Start with a stump.
    if not binary:
        tree = DecisionTreeClassifier.best_stump(
            feature_names, labeled_featuresets, verbose)
    else:
        tree = DecisionTreeClassifier.best_binary_stump(
            feature_names, labeled_featuresets, feature_values, verbose)

    # Refine the stump.
    tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
                support_cutoff, binary, feature_values, verbose)

    # Return it
    return tree
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a
    dictionary which stores information about the subject and object
    NEs plus the filler between them.  Additionally, a left and right
    context of length <= window are captured (within a given input
    sentence).

    @param pairs: a pair of list(str) and L{Tree}, as generated by
        L{mk_pairs}
    @param window: a threshold for the number of items to include in
        the left and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon',
        'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass',
        'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
        pairs = pairs[1:]
    return result
def _attempt_proof(self, clauses):
    # Map indices to lists of indices, to store attempted unifications.
    tried = defaultdict(list)

    i = 0
    while i < len(clauses):
        if not clauses[i].is_tautology():
            # Since we try clauses in order, we should start after the
            # last index tried.
            if tried[i]:
                j = tried[i][-1] + 1
            else:
                j = i + 1  # nothing tried yet for 'i', so start with the next

            while j < len(clauses):
                # don't: 1) unify a clause with itself,
                #        2) use tautologies
                if i != j and j and not clauses[j].is_tautology():
                    tried[i].append(j)
                    newclauses = clauses[i].unify(clauses[j])
                    if newclauses:
                        for newclause in newclauses:
                            newclause._parents = (i + 1, j + 1)
                            clauses.append(newclause)
                            if not len(newclause):  # if there's an empty clause
                                return (True, clauses)
                        i = -1  # since we added a new clause, restart from the top
                        break
                j += 1
        i += 1

    return (False, clauses)
def invert_dict(d):
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
def similar_words(self, word, n=20):
    scores = defaultdict(int)
    for c in self._word_to_contexts[self._key(word)]:
        for w in self._context_to_words[c]:
            if w != word:
                print(w, c, self._context_to_words[c][word],
                      self._context_to_words[c][w])
                scores[w] += (self._context_to_words[c][word] *
                              self._context_to_words[c][w])
    # Sort in descending score order so the most similar words come first.
    return sorted(scores, key=scores.get, reverse=True)[:n]
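# --- Toy illustration (counts assumed) of the context-overlap score above:
# words sharing contexts with the target get credit proportional to the
# product of their co-occurrence counts. ---
from collections import defaultdict

context_to_words = {('the', 'runs'): {'dog': 2, 'cat': 3, 'car': 1}}
word = 'dog'

scores = defaultdict(int)
for c in context_to_words:
    for w in context_to_words[c]:
        if w != word:
            scores[w] += context_to_words[c][word] * context_to_words[c][w]

print(sorted(scores, key=scores.get, reverse=True))  # ['cat', 'car'] (scores 6 and 2)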
def train(labeled_featuresets, estimator=ELEProbDist):
    """
    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count up how many times each feature value occurred, given
    # the label and featurename.
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in list(featureset.items()):
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in list(feature_freqdist.items()):
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
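# --- Hypothetical call sketch: labeled_featuresets is a list of
# (featureset, label) tuples, where each featureset maps feature names
# to values; this matches NLTK's NaiveBayesClassifier.train API. ---
train_data = [({'contains(great)': True}, 'pos'),
              ({'contains(awful)': True}, 'neg')]
classifier = NaiveBayesClassifier.train(train_data)
print(classifier.classify({'contains(great)': True}))  # 'pos'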
def invert_dict(d):
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        # Note: in Python 3, str also has __iter__, so string values take
        # the first branch and are inverted character by character.
        if hasattr(d[key], '__iter__'):
            for term in d[key]:
                inverted_dict[term].append(key)
        else:
            inverted_dict[d[key]] = key
    return inverted_dict
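# --- Quick usage sketch (toy mapping assumed): ---
print(invert_dict({'fruit': ['apple', 'pear'], 'veg': ['pea']}))
# -> defaultdict(<class 'list'>, {'apple': ['fruit'], 'pear': ['fruit'], 'pea': ['veg']})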
def _make_predicate_dict(self, assumptions):
    """
    Create a dictionary of predicates from the assumptions.

    @param assumptions: a C{list} of C{Expression}s
    @return: C{dict} mapping C{AbstractVariableExpression} to C{PredHolder}
    """
    predicates = defaultdict(PredHolder)
    for a in assumptions:
        self._map_predicates(a, predicates)
    return predicates
def analyze_entity_judgments(site):
    '''
    Returns a mapping { entity ID ->
    { candidate link -> (num turkers judged candidate relevant,
                         num turkers judged it irrelevant) }}
    '''
    judgments = {}

    # A mapping of turker id -> {candidate title -> true/false judgment}
    # for each candidate annotated by the turker.
    annotator_decisions = defaultdict(list)

    row_num = 0
    rows_plus_headers = csv_util.query_csv_for_rows(__entities_results_csv_path__, False)
    for row in rows_plus_headers:
        try:
            if row_num == 0:  # row 0 is header
                entity_id_col = row.index('Input.entity_id')
                candidate_link_col = row.index('Input.candidate_link')
                turkerID_col = row.index('WorkerId')
                answer_col = row.index('Answer.Q1')
            else:
                judged_entity_id = row[entity_id_col]
                if judged_entity_id in judgments:
                    selected_candidates = judgments[judged_entity_id]
                else:
                    selected_candidates = {}

                selected_candidate_title = wikipedia_api_util.get_page_title_from_url(row[candidate_link_col])
                if selected_candidate_title in selected_candidates:
                    (num_true, num_false) = selected_candidates[selected_candidate_title]
                else:
                    (num_true, num_false) = (0, 0)

                judgment = row[answer_col]
                if judgment == 'true':
                    num_true = num_true + 1
                else:
                    num_false = num_false + 1

                selected_candidates[selected_candidate_title] = (num_true, num_false)
                judgments[judged_entity_id] = selected_candidates

                turkerID = row[turkerID_col]
                annotator_decisions[turkerID].append({selected_candidate_title: judgment})

            row_num = row_num + 1
        except:
            continue  # just ignore a problematic row

    # Cache each annotator's decisions for later inter-rater agreement calculations.
    entity_dataset_mgr.save_annotator_decisions(annotator_decisions, site)

    print("Cached a total of " + str(len(judgments)) +
          " entities judged by human Mechanical Turk annotators")
    entity_dataset_mgr.save_entity_judgements(judgments, site)
    return judgments
def stump(feature_name, labeled_featuresets):
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()

    # Find the best label for each value.
    freqs = defaultdict(FreqDist)  # freq(label|value)
    for featureset, label in labeled_featuresets:
        feature_value = featureset[feature_name]
        freqs[feature_value].inc(label)

    decisions = dict([(val, DecisionTreeClassifier(freqs[val].max()))
                      for val in freqs])
    return DecisionTreeClassifier(label, feature_name, decisions)
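# --- Illustration of the freq(label|value) table, with collections.Counter
# standing in for NLTK's FreqDist (toy featuresets assumed; most_common(1)
# plays the role of FreqDist.max()). ---
from collections import Counter, defaultdict

data = [({'color': 'red'}, 'stop'),
        ({'color': 'green'}, 'go'),
        ({'color': 'red'}, 'stop')]

freqs = defaultdict(Counter)  # freq(label|value)
for featureset, label in data:
    freqs[featureset['color']][label] += 1

decisions = {val: counter.most_common(1)[0][0] for val, counter in freqs.items()}
print(decisions)  # {'red': 'stop', 'green': 'go'}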
def _init(self):
    self._f2c = defaultdict(set)
    self._c2f = defaultdict(set)

    if self._pattern is not None:
        for file_id in self._fileids:
            category = re.match(self._pattern, file_id).group(1)
            self._add(file_id, category)

    elif self._map is not None:
        for (file_id, categories) in self._map.items():
            for category in categories:
                self._add(file_id, category)

    elif self._file is not None:
        for line in self.open(self._file).readlines():
            line = line.strip()
            file_id, categories = line.split(self._delimiter, 1)
            if file_id not in self.fileids():
                raise ValueError('In category mapping file %s: %s '
                                 'not found' % (self._file, file_id))
            for category in categories.split(self._delimiter):
                self._add(file_id, category)
def createSenseTree(self, senseList):
    '''
    Create a parse tree for all senses in senseList,
    e.g.: {'conduct': ['institution', 'to', 'business'],
           'ROOT': ['created'],
           'institution': ['an'],
           'created': ['institution', 'conduct']}
    '''
    senseDict = []
    depParsed = parseSenses(senseList)
    for dep in depParsed:
        temp = defaultdict(list)
        for n, v in dep:
            n = stemWords(n)
            v = stemWords(v)
            temp[n].append(v)
        senseDict.append(temp)
    return senseDict
def page_from_reference(href):
    '''
    Returns a tuple of the HTML page built and the new current word

    @param href: The hypertext reference to be solved
    @type href: str
    @return: A tuple (page,word), where page is the new current HTML page
             to be sent to the browser and
             word is the new current word
    @rtype: A tuple (str,str)
    '''
    word = href.word
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [w for w in [w.strip().lower().replace(' ', '_') for w in words]
             if w != ""]
    if len(words) == 0:
        # No words were found.
        return "", "Please specify a word to search for."

    # This looks up multiple words at once.  This is probably not
    # necessary and may lead to problems.
    for w in words:
        for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
            form = wn.morphy(w, pos)
            if form and form not in pos_forms[pos]:
                pos_forms[pos].append(form)

    body = ''
    for pos, pos_str, name in _pos_tuples():
        if pos in pos_forms:
            body += _hlev(3, name) + '\n'
            for w in pos_forms[pos]:
                # Not all words of exc files are in the database, skip
                # to the next word if a KeyError is raised.
                try:
                    body += _collect_all_synsets(w, pos, href.synset_relations)
                except KeyError:
                    pass
    if not body:
        body = "The word or words '%s' were not found in the dictionary." % word
    return body, word
def dictOfDicts():
    from collections import defaultdict
    return defaultdict(dictOfDicts)
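# --- Usage sketch: the recursive default factory gives arbitrarily deep
# autovivification, so intermediate levels are created on demand. ---
d = dictOfDicts()
d['a']['b']['c'] = 1
print(d['a']['b']['c'])  # 1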
def get_resolved_ambiguous_entities(): """ Returns the ambiguous entities for which the intended meaning has been unanimously resolved by human annotators. """ all_entities = defaultdict(list) correct_meaning_label = "Y" row_count = -1 labeled_entities_dataset = csv_util.query_csv_for_rows("labeled_data/entities.csv", False) for candidate_row in labeled_entities_dataset: row_count = row_count + 1 if row_count == 0: # header row surfaceform_col = candidate_row.index("surface_form") shorttext_col = candidate_row.index("short_text") candidate_meaning_col = candidate_row.index("candidate_meaning") candidate_label_col = candidate_row.index("candidate_is_relevant") userkey_col = candidate_row.index("user_key") continue # use "surfaceform_shorttext" as ID for entity surfaceform = candidate_row[surfaceform_col] shorttext = candidate_row[shorttext_col] entity_id = surfaceform + "_" + shorttext meaning = candidate_row[candidate_meaning_col] label = candidate_row[candidate_label_col] userkey = candidate_row[userkey_col] all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey)) # test if entity is ambiguous (i.e., has more than one candidate meaning) and # if so if entity has been resolved (i.e., has at least one candidate labeled # as the intended meaning) resolved_entities = {} for entity in all_entities: entity_tuple_list = all_entities[entity] if len(entity_tuple_list) < 2: continue candidate_meanings = [] intended_meanings = [] user = None for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list: # title of a potential meaning of the ambiguous entity if not meaning in candidate_meanings: candidate_meanings.append(meaning) # annotated label indicating whether this candidate # meaning is the intended meaning of the entity if label == correct_meaning_label and not meaning in intended_meanings: intended_meanings.append(meaning) if user is None: user = userkey if len(intended_meanings) > 1 and len(intended_meanings) > 0 and user != None: # this entity is ambiguous, has been manually resolved, # and we know the user who wrote it entity_obj = ResolvedEntity(candidate_meanings, intended_meanings, surfaceform, shorttext, user) entity_id = entity_obj.get_id() resolved_entities[entity_id] = entity_obj return resolved_entities
def make_tweet_entities_csv_for_turk():
    twitter_site = short_text_websites.get_twitter_site()
    entities_to_evaluate = entity_dataset_mgr.get_valid_ne_candidates(twitter_site)
    if entities_to_evaluate is None:
        print("No ambiguous entities + candidates in cache. Run run_all_dataset_generators "
              "script and choose to first fetch and store more entities from short texts.")
        return

    judged_row_plus_headers = csv_util.query_csv_for_rows(__entities_results_csv_path__, False)
    judged_row_num = 0
    already_judged = []  # list of (entity id, candidate link)
    for judge_row in judged_row_plus_headers:
        try:
            if judged_row_num == 0:  # row 0 is header
                entity_id_col = judge_row.index('Input.entity_id')
                candidate_link_col = judge_row.index('Input.candidate_link')
            else:
                judged_tuple = (judge_row[entity_id_col], judge_row[candidate_link_col])
                if not judged_tuple in already_judged:
                    already_judged.append(judged_tuple)
            judged_row_num = judged_row_num + 1
        except:
            continue  # just ignore a problematic row

    # Determine what entity+candidate tasks we actually want to write to a spreadsheet
    # and send to mturk since we don't have resources for unlimited mturk tasks
    tasks = {}  # entity id -> candidate judgment tasks we actually want performed
    user_entities = defaultdict(list)  # username -> [NamedEntity obj]
    done_shorttexts = []  # list of shorttext id
    random.shuffle(entities_to_evaluate)  # so we get a random subset of a user's entities
    for ne_obj in entities_to_evaluate:

        # "40 nouns usually enough to establish statistically significant
        # differences between WSD algorithms" (Santamaria et al., 2010)
        username = ne_obj.username
        if len(user_entities[username]) > 50:
            continue  # have enough entities for this user

        # limiting our dataset to one named entity per short text
        shorttext_id = ne_obj.shorttext_id
        if shorttext_id in done_shorttexts:
            continue

        # no need to create tasks for candidates we already have annotator judgments for
        entity_id = ne_obj.get_entity_id()
        candidate_URLs = ne_obj.get_candidate_wikiURLs()
        valid_candidate_tasks = []
        for candidate_URL in candidate_URLs:
            if ((entity_id, candidate_URL) in already_judged):
                continue
            valid_candidate_tasks.append(candidate_URL)
        if len(valid_candidate_tasks) == 0:
            continue  # already have annotator judgments for all of this entity's candidates
        if len(candidate_URLs) + len(valid_candidate_tasks) < 2:
            # this would be a non-ambiguous entity, and we should never reach this
            # point because such entities should have been filtered out by now
            raise ValueError("Non-ambiguous entity encountered: " + str(entity_id))
        tasks[entity_id] = valid_candidate_tasks
        user_entities[username].append(ne_obj)
        done_shorttexts.append(shorttext_id)

    # put valid entities + candidates in the spreadsheet until reach our limit of tasks
    task_max = 1400
    rows = []
    headers = ['entity_id', 'short_text', 'ambiguous_entity', 'candidate_link']
    rows.append(headers)
    for username in user_entities:

        # add users until reach our limit on the number of tasks we can afford,
        # but break at this point in the loop rather than in the inner loop to
        # ensure that we do have at least 50 entities per user (even if this
        # means we go over our task limit a little in order to reach that amount)
        if len(rows) > task_max:
            break

        # bypass users who haven't written the minimum number of valid entities
        # required to establish statistical significance between the algorithms
        if len(user_entities[username]) < 50:
            continue

        # should be 50 NamedEntity objects per user, and we'll make tasks for their candidates
        for ne_obj in user_entities[username]:
            entity_id = ne_obj.get_entity_id()

            # make sure the entity presented to a Turker looks the same as
            # it appears in the short text (ie with the same capitalization)
            original_shorttext = ne_obj.shorttext_str.decode('latin-1')
            surface_form = ne_obj.surface_form
            if not surface_form in original_shorttext:
                surface_form = __match_appearance__(surface_form, original_shorttext)

            # shuffle candidates so that they don't appear
            # in wikiminer's/dbpedia's ranking order and bias the turker
            candidate_URLs = tasks[entity_id]
            random.shuffle(candidate_URLs)
            choices = candidate_URLs[:]  # copy (list slicing)

            for choice in choices:
                # make a separate row for each candidate link
                # rather than putting all links in a single cell
                row = [entity_id, original_shorttext, surface_form, choice]
                rows.append(row)

            if len(rows) % 50 == 0:
                # write the rows every once in a while in case we reach an error
                print("Updating spreadsheet..." + str(len(rows)))
                csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)

    # dump to csv
    csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)
def train(self, feats): print "Starting to train the data" start = datetime.datetime.now() print "setting the ids", datetime.datetime.now() self.negids = movie_reviews.fileids('neg') self.posids = movie_reviews.fileids('pos') #random.shuffle(self.negids) #random.shuffle(self.posids) ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] + ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids]) ##random.shuffle(self.reviews) ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:]) ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4]) print "setting the feats", datetime.datetime.now() self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids] self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids] self.negcutoff = len(self.negfeats) * 3 / 4 self.poscutoff = len(self.posfeats) * 3 / 4 print "setting the train/test", datetime.datetime.now() self.trainfeats = self.negfeats[:self. negcutoff] + self.posfeats[:self. poscutoff] self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[ self.poscutoff:] print "training", datetime.datetime.now() self.classifier = NaiveBayesClassifier.train(self.trainfeats) ##self.classifier = NaiveBayesClassifier.train(self.train_set) self.refsets = defaultdict(set) self.testsets = defaultdict(set) print "accuracy stuff", datetime.datetime.now() for i, (feats, label) in enumerate(self.testfeats): ##for i, (feats, label) in enumerate(self.test_set): self.refsets[label].add(i) observed = self.classifier.classify(feats) self.testsets[observed].add(i) end = datetime.datetime.now() print "Training lasted for ", end - start print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats) ##print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set) print 'pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']) print 'pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']) print 'neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']) print 'neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']) self.classifier.show_most_informative_features() self.trained = True
def compare_ranking_precision(resolved_entities):
    # the total number of entities we have unanimous annotator judgments for
    total_evaluated = 0

    # the total number of entities for which turkers
    # could not agree upon the correct candidate
    annotator_disagreement = 0

    gold_correct = 0

    # the total number of entities for which our algorithms and the
    # baseline ranking techniques selected the correct candidate
    reslve_algs_correct = defaultdict(int)  # alg id -> # times correct
    nonmatch_algs_baseline_correct = defaultdict(int)
    wikiminer_correct = 0
    dbpedia_correct = 0
    random_correct = 0

    toolkit_failures = 0
    reslve_success_when_toolkits_fail = 0

    for resolved_entity in resolved_entities:
        gold_standard_candidates = resolved_entity.get_unanimous_candidates_goldstandard()
        if len(gold_standard_candidates) == 0:
            annotator_disagreement = annotator_disagreement + 1
            continue  # turkers couldn't agree on this entity

        is_wikiminer_correct = resolved_entity.is_baseline_wikiminer_correct()
        is_dbpedia_correct = resolved_entity.is_baseline_dbpedia_correct()

        for alg_id in resolved_entity.reslve_rankings.keys():
            # check if RESLVE algorithm selected the correct candidate
            is_reslve_algs_correct = resolved_entity.is_reslve_correct(alg_id)
            if is_reslve_algs_correct:
                reslve_algs_correct[alg_id] = reslve_algs_correct[alg_id] + 1

            # run the same RESLVE algorithm but use a random non-matching user who
            # doesn't provide the user interest model we claim is so relevant and
            # valuable (ie we want to make sure that just incorporating any random
            # wikipedia data isn't the main reason for any good performance we see)
            if resolved_entity.is_baseline_reslve_nonmatch_correct(alg_id):
                nonmatch_algs_baseline_correct[alg_id] = nonmatch_algs_baseline_correct[alg_id] + 1

            # measure whether when toolkits are wrong, RESLVE can perform correctly
            if not is_wikiminer_correct and not is_dbpedia_correct:
                toolkit_failures = toolkit_failures + 1
                if is_reslve_algs_correct:
                    reslve_success_when_toolkits_fail = reslve_success_when_toolkits_fail + 1

        # check performance of the baseline strategies
        if is_wikiminer_correct:
            wikiminer_correct = wikiminer_correct + 1
        if is_dbpedia_correct:
            dbpedia_correct = dbpedia_correct + 1
        if resolved_entity.is_baseline_random_correct():
            random_correct = random_correct + 1
        if resolved_entity.is_goldstandard_correct():
            gold_correct = gold_correct + 1

        total_evaluated = total_evaluated + 1

    wikiminer_accuracy = float(wikiminer_correct) / float(total_evaluated)
    print("Wikipedia Miner precision: " + str(wikiminer_accuracy))

    dbpedia_accuracy = float(dbpedia_correct) / float(total_evaluated)
    print("DBPedia Spotlight precision: " + str(dbpedia_accuracy))

    random_accuracy = float(random_correct) / float(total_evaluated)
    print("Random baseline precision: " + str(random_accuracy))

    gold_accuracy = float(gold_correct) / float(total_evaluated)
    print("Human annotator ability to reach consensus: " + str(gold_accuracy))

    for alg_id in resolved_entity.reslve_rankings.keys():
        reslve_correct = reslve_algs_correct[alg_id]
        reslve_accuracy = float(reslve_correct) / float(total_evaluated)
        print("RESLVE " + alg_id + " precision: " + str(reslve_accuracy))

        nonmatch_baseline_correct = nonmatch_algs_baseline_correct[alg_id]
        nonmatch_baseline_accuracy = float(nonmatch_baseline_correct) / float(total_evaluated)
        print("RESLVE nonmatch baseline using " + alg_id + " precision: " +
              str(nonmatch_baseline_accuracy))

        # improvement achieved by incorporating the user interest model
        if nonmatch_baseline_correct == 0:
            improvement_str = "Infinite (nonmatch baseline failed to correctly resolve any entity)"
        else:
            matching_user_improvement = (float(reslve_correct - nonmatch_baseline_correct) /
                                         float(nonmatch_baseline_correct))
            improvement_str = str(matching_user_improvement)
        print("Improvement boost by incorporating user interest model into RESLVE's " +
              str(alg_id) + ": " + str(improvement_str))

    if toolkit_failures == 0:
        print("Toolkits performed with 100% accuracy..")
    else:
        tough_cases_improvement = float(reslve_success_when_toolkits_fail) / float(toolkit_failures)
        print("RESLVE able to achieve " + str(tough_cases_improvement) +
              " precision in the difficult cases when Wikipedia Miner and "
              "DBPedia Spotlight fail completely.")