def __init__(self):
    # NOTE(review): this looks like a stray module-level duplicate of
    # Annotator.__init__ (the identical body appears inside the class
    # below) — confirm it is unused and remove.
    self.parser = Parser()
    # Term-frequency / inverse-index / score tables, all initially empty.
    self.np_tf, self.np_itf, self.tfidf = {}, {}, {}
    self.definitions = []
    self.parse_log = {}
class Annotator():
    """Annotates documents: tokenizes/tags/parses text with the project
    Parser, matches term definitions, extracts concept noun phrases and
    maintains TF-IDF statistics over them.

    Index layout:
      np_tf  -- doc_id -> {noun_phrase: occurrence count in that document}
      np_itf -- noun_phrase -> {doc_id: occurrence count in that document}
      tfidf  -- filled in by scoreConcepts()
    """

    # NOTE(review): class-level counter; never read or updated in this class.
    counter = 0

    def __init__(self):
        self.parser = Parser()   # project-local parser (defined elsewhere)
        self.np_tf = {}          # per-document term frequencies
        self.np_itf = {}         # inverse index: phrase -> documents
        self.tfidf = {}          # TF-IDF results, see scoreConcepts()
        self.definitions = []    # accumulated DefinitionMatcher results
        self.parse_log = {}      # doc_id -> parse result

    def indexConcepts(self, id, nps):
        """Add one document's noun phrases to the np_tf/np_itf indexes.

        id  -- document identifier
        nps -- list of {'nouns': [...], 'concept': ...} entries
        """
        if id not in self.np_tf:
            self.np_tf[id] = {}
        for entry in nps:
            # Count the concept together with its nouns.  (Fix: the
            # original did entry['nouns'].append(concept), mutating the
            # caller's list in place.)
            for np in entry['nouns'] + [entry['concept']]:
                self.np_tf[id][np] = self.np_tf[id].get(np, 0) + 1
                if np not in self.np_itf:
                    self.np_itf[np] = {}
                self.np_itf[np][id] = self.np_itf[np].get(id, 0) + 1

    def scoreConcepts(self):
        """Compute TF-IDF for every indexed noun phrase.

        Results are stored in self.tfidf twice: under 'nps' keyed
        phrase-first and under 'docs' keyed document-first.  Each entry
        records tc (term count), tf (normalized term frequency), idf,
        max (highest phrase count in the document), tfidf, dc (total
        document count) and ndc (documents containing the phrase).
        """
        doc_count = len(self.np_tf)
        self.tfidf['count'] = doc_count
        self.tfidf['nps'] = {}
        self.tfidf['docs'] = {}
        # Hoist the per-document maximum phrase count: the original
        # recomputed this scan inside the innermost loop.
        max_count = {}
        for doc_id in self.np_tf:
            counts = self.np_tf[doc_id].values()
            max_count[doc_id] = max(counts) if counts else 0
        for np in self.np_itf:
            # Number of documents in which this noun phrase occurs.
            np_total_doc_count = len(self.np_itf[np])
            # IDF: natural log of (total docs / docs containing phrase).
            idf = math.log(float(doc_count) / float(np_total_doc_count))
            print("{} {}".format(np, idf))  # debug trace kept from original
            self.tfidf['nps'][np] = {}
            for doc_id in self.np_itf[np]:
                if doc_id not in self.tfidf['docs']:
                    self.tfidf['docs'][doc_id] = {}
                np_occ_count = self.np_itf[np][doc_id]
                max_np_count = max_count[doc_id]
                # TF normalized by the most frequent phrase in the doc.
                tf = float(np_occ_count) / float(max_np_count)
                stats = {
                    'tc': np_occ_count,
                    'tf': tf,
                    'idf': idf,
                    'max': max_np_count,
                    'tfidf': tf * idf,
                    'dc': doc_count,
                    'ndc': np_total_doc_count,
                }
                # Store independent copies in both views (the original
                # built two separate dicts).
                self.tfidf['nps'][np][doc_id] = stats
                self.tfidf['docs'][doc_id][np] = dict(stats)

    def getConcepts(self, definitions):
        """Return the defined terms (4th field) from definition tuples."""
        return [entry[3] for entry in definitions]

    def annotate(self, id, text):
        """Tokenize, tag and parse one document, collect its definitions
        and index its extracted concepts.

        Returns the list of definitions found in this document (they are
        also appended to self.definitions).
        """
        print("== Annotating {} ==".format(id))
        print("Tokenizing...")
        tokenized = self.parser.tokenizeText(text)
        print("Tagging...")
        tagged = self.parser.tagText(tokenized)
        print("Parsing...")
        parsed = self.parser.parseText(tagged)
        self.parse_log[id] = parsed

        stdout.write("Scanning for definitions...")
        dm = DefinitionMatcher()
        definitions_for_id = dm.match(id, tagged)
        stdout.write(" {} found.\n".format(len(definitions_for_id)))
        self.definitions.extend(definitions_for_id)

        stdout.write("Extracting concepts...")
        concepts = Util.extractConcepts(parsed)
        stdout.write(" {} found.\n".format(len(concepts)))

        # TODO: fold concepts from self.definitions into the index once
        # they are available as dictionaries (see getConcepts); the
        # original attempt was disabled because the formats differ.
        if not concepts:
            print("No concepts found...")
        else:
            print("Adding concepts to index...")
            self.indexConcepts(id, concepts)
        # NOTE(review): concept linking (ConceptLinker) was disabled in
        # the original; re-enable here if/when needed.
        print("=== NEXT ===\n\n")
        return definitions_for_id