def parse_doc(self):
    """ Parse a <doc> node in the ast.* instance if there is any """
    # look for a namespaced <doc> child of the node this instance wraps
    doc = self.node.find(self.ns(self.NS_CORE, 'doc'))
    if doc is not None:
        # need to import Doc here, because of nested usage of Base class,
        # which causes infinite import loop
        from Doc import Doc
        # store the parsed documentation object on the instance
        self.doc = Doc(self._namespace, doc)
def load_docs(docs_filename,numOfDoc,vocab,model): cnt=0; docs=[None]*numOfDoc; len_sum=0; for line in open(docs_filename): doc=Doc(line,vocab); doc.init_varational_parameters(vocab,model); len_sum+=len(doc); docs[cnt]=doc; if cnt%1000==0: print "progress:",cnt,"memoery useage:",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000,"time:",datetime.now(); cnt+=1; if cnt>=numOfDoc: break; print "ave length of doc:",float(len_sum)/cnt; return docs;
def load_docs(docs_filename, numOfDoc, vocab, model): cnt = 0 docs = [None] * numOfDoc len_sum = 0 for line in open(docs_filename): doc = Doc(line, vocab) doc.init_varational_parameters(vocab, model) len_sum += len(doc) docs[cnt] = doc if cnt % 1000 == 0: print "progress:", cnt, "memoery useage:", resource.getrusage( resource.RUSAGE_SELF).ru_maxrss / 1000, "time:", datetime.now( ) cnt += 1 if cnt >= numOfDoc: break print "ave length of doc:", float(len_sum) / cnt return docs
def __init__(self, docs, gram):
    """Set up the index container.

    Args:
        docs: dict of id -> Doc; each Doc exposes a .words list.
        gram: n-gram size.  For gram == 2, the documents are replaced by
              one character-bigram pseudo-document per unique word, with
              '#' padding marking word boundaries.
    """
    self.index = {}
    self.docs = docs
    self.gram = gram
    if self.gram == 2:
        # every distinct word across the whole collection
        all_words = unique([inner for outer in docs for inner in docs[outer].words])
        bigram_docs = {}
        for word_id, raw_word in enumerate(all_words):
            padded = "#" + raw_word + "#"
            # consecutive character pairs of the padded word
            pair_list = [padded[j:j + 2] for j in range(len(padded) - 1)]
            pseudo_doc = Doc(word_id, ' '.join(pair_list))
            pseudo_doc.words = pair_list
            bigram_docs[word_id] = pseudo_doc
        self.docs = bigram_docs
        # NOTE(review): index is only built in the bigram branch here,
        # mirroring the original control flow — confirm gram==1 callers
        # build the index elsewhere
        self.create_index()
def extract_csv(filename):
    """Build a dict of row-index -> Doc from a CSV file.

    Each Doc's text is the row's Title and Text joined by a space; the
    optional 'Tag' column (when present) supplies the Doc's tag,
    otherwise the tag is None.
    """
    frame = pd.read_csv(filename)
    has_tag_column = "Tag" in frame.columns
    documents = {}
    for row_idx in range(frame.shape[0]):
        combined_text = frame['Title'].values[row_idx] + " " + frame['Text'].values[row_idx]
        row_tag = frame['Tag'].values[row_idx] if has_tag_column else None
        documents[row_idx] = Doc(row_idx, combined_text, row_tag)
    return documents
def __init__(self,filename): self.docs = [] self.vocab_size = 0 self.n_docs = 0 print "reading data from : "+filename f=open(filename) for line in f.readlines(): parts=line.split() doc=Doc() for i,part in enumerate(parts): if i==0: doc.length=int(part) else: word,count=map(int,part.split(":")) doc.words.append(word) doc.word_counts.append(count) doc.total+=count if(word>=self.vocab_size): self.vocab_size=word+1 f.close() self.n_docs=len(self.docs)
def extract_xml(filename):
    """Parse a MediaWiki export XML dump into a dict of id -> Doc.

    Each <page>'s numeric <id> keys a Doc built from the text of its
    <revision>/<text> element.
    """
    # every element in the dump lives in this MediaWiki export namespace
    ns = "{http://www.mediawiki.org/xml/export-0.10/}"
    root = ET.parse(filename).getroot()
    docs = {}
    for page in root.findall(ns + "page"):
        page_id = int(page.find(ns + "id").text)
        body = page.find(ns + "revision").find(ns + "text").text
        docs[page_id] = Doc(page_id, body)
    return docs
def list_all_entities():
    """POST request, generate a response listing the details of all the entities.

    Args:
        raw text: the user should post an article (raw text format) to this api.

    Returns:
        json: list the details of the entities in the article, in the json format
    """
    my_doc = Doc(request.data.decode())
    # maps character offsets in the article to positions
    mapping = my_doc.map_position_start_index()
    entities = []
    for ent in my_doc.get_doc().ents:
        entities.append({
            "entity": ent.text,
            "position": my_doc.get_position(ent.start_char, mapping),
            "label": my_doc.get_label(ent),
        })
    return jsonify(entities)
def saveDoc(doc: Doc):
    """Persist the document's figures to disk via pickle.

    Pickling the whole Doc object did not work, so the figure
    dictionary is flattened into a plain list before dumping.
    """
    result = list(doc.all().values())
    # NOTE(review): Data.__file_name undergoes name mangling if this code
    # sits inside a class body other than Data — confirm it resolves
    # context manager guarantees the file is closed even if pickle fails
    with open(Data.__file_name, 'wb') as output:
        pickle.dump(result, output)
def visualization():
    """POST request, can be made via Postman.

    Notes:
        the format of the body of the POST request must be 'raw text'.

    Args:
        raw text: the body of the POST request should be raw text.

    Returns:
        String: Return a URL containing the reference number of this POST,
        the user can use the URL in the browser to see the visualized
        entity extract result
    """
    text = request.data.decode()  # String
    doc_wrapper = Doc(text)
    # a fresh UUID lets the client fetch the rendered HTML later
    reference = str(uuid.uuid4())
    rendered = displacy.render(doc_wrapper.get_doc(), style="ent")
    catch.add_reference(reference, rendered)
    return jsonify({
        'your reference': f"http://{request.host}/get?reference={reference}"
    })
def banco():
    """Demo: exercises withdrawals (Sacar) and three DOC transfers
    between two clients, printing balances after each step."""

    def _saldo(prefixo, cliente):
        # emits exactly the same line as the repeated inline statements did
        print("Saldo " + prefixo + " do cliente, ", cliente.getNome(), ", é: ", cliente.checarSaldo())

    c1 = Cliente("Eduardo", 1200.00, 5000.00)
    c2 = Cliente("Carlos", 2000.00, 900.00)
    print("Telas saques: ")
    c1.Sacar(6200.00)
    print("Saldo do cliente: ", c1.getNome(), " é: ", c1.checarSaldo())
    c2.Sacar(100.0)
    print("Saldo do cliente, ", c2.getNome(), " é: ", c2.checarSaldo())
    c2.Sacar(3000.0)
    d1 = Doc()
    print(
        "------------------------ Tela Docs: -------------------------------")
    _saldo("inicial", c1)
    _saldo("inicial", c2)
    # three transfers with increasing amounts; the middle one is expected
    # to exceed the available balance
    for numero, valor in ((1, 300.00), (2, 30000.00), (3, 5900.00)):
        print("------------------------ Transferência " + str(numero) + ": --------------------------")
        d1.transferir(c1, c2, valor)
        _saldo("atualizado", c1)
        _saldo("atualizado", c2)
def fetch_and_create_doc(connection, name="Monaco"):
    """Fetch one placex row whose name matches *name* and wrap it in a Doc.

    Args:
        connection: an open psycopg2 connection.
        name: value matched against the hstore 'name' tag (LIKE pattern).

    Returns:
        Doc built from the fetched row (a RealDictCursor dict).
    """
    print("=================================================================")
    print("Trying to fetch row and create doc")
    # SECURITY FIX: the previous version concatenated `name` directly into
    # the SQL string, allowing SQL injection; pass it as a bound parameter
    sql = "SELECT place_id, osm_id, osm_type, name, address, \
        country_code, housenumber, postcode from placex \
        where name->'name' like %s limit 1 "
    cursor = connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(sql, (name,))
    record = cursor.fetchone()
    print(sql, "\n")
    doc = Doc(record)
    print("osm_id:", doc.osm_id)
    print("osm_type:", doc.osm_type)
    print("name tags as dictionary:", doc.name)
    cursor.close()
    return doc
class Base(object):
    """ Base Class all ast.* Nodes should be derived from"""

    NS_CORE = NS_CORE
    NS_C = NS_C
    NS_GLIB = NS_GLIB

    def __init__(self, namespace, node):
        """ Init a new Ast element, store the parent global namespace
        instance and etree XML Node and call parse_node().
        """
        self._namespace = namespace  # needed later for type resolvement
        self.node = node
        self.parse_node()

    def parse_node(self):
        """ Parse the current Node in the derived AST Object

        Overwrite this method in the concrete implementation
        """
        raise NotImplementedError('"Ast::%s::parse_node()" not implemented in derived class' % self.__class__.__name__)

    def toObjectRepr(self):
        """ Collect all parsed informations and return a python object
        that can be processed later for output

        Overwrite this method in the concrete implementation
        """
        raise NotImplementedError('"Ast::%s::toObjectRepr()" not implemented in derived class' % self.__class__.__name__)

    def parse_doc(self):
        """ Parse a <doc> node in the ast.* instance if there is any """
        doc = self.node.find(self.ns(self.NS_CORE, 'doc'))
        if doc is not None:
            # need to import Doc here, because of nested usage of Base class,
            # which causes infinite import loop
            from Doc import Doc
            self.doc = Doc(self._namespace, doc)

    def parse_parameters(self):
        """ Parse <parameters> node and each subnode <parameter> """
        if not hasattr(self, 'parameters'):
            self.parameters = []
        params = self.node.find(self.ns(self.NS_CORE, 'parameters'))
        if params is not None:
            # local import avoids a circular import at module load time
            from Parameter import Parameter
            for param in self.find_children(self.ns(self.NS_CORE, 'parameter'), params):
                self.parameters.append(Parameter(self._namespace, param))

    def parse_attributes_from_map(self, mapping):
        """ Universal method to parse attributes from a node based on the
        given parse map. The mapping is a python Dict that looks like

            mapping = {'key_to_set_in_ast_instance': 'name_of_the_xml_attribute'}

        The key of the mapping Dict will be the property/attribute name
        of the ast.* instance. If a mapping is {'myCoolVersion': 'version'}
        the ast instance will have the property self.myCoolVersion
        """
        # items() instead of the Python-2-only iteritems(), which raises
        # AttributeError on Python 3
        for key, attrib in mapping.items():
            # a tuple value means (namespace, attribute-name) and must be
            # expanded to the namespaced attribute string first
            if isinstance(attrib, tuple):
                attrib = self.ns(attrib[0], attrib[1])
            self.__dict__[key] = self.node.get(attrib)

    def parse_returnvalue(self):
        """ Parse the <return-value> node """
        rValue = self.node.find(self.ns(self.NS_CORE, 'return-value'))
        if rValue is not None:
            # need to import here, because of import loop
            from ReturnValue import ReturnValue
            self.returnValue = ReturnValue(self._namespace, rValue)

    def parse_types(self, node):
        """ Method to lookup types used as parameters or return values

        Lookup the type defined in TypeDef.py and return the Type*
        instance that represents it

        Parameters:
        node -- etree XML node to get and parse the type for

        Returns:
        Returns an instance of ast.Type* based on the type that the
        input node describes (None if node is None or no type matched)
        """
        if node is None:
            return node
        from Type import Type
        from TypeArray import TypeArray
        from TypeVarArg import TypeVarArg
        typenode = node.find(self.ns(self.NS_CORE, 'type'))
        if typenode is not None:
            return Type(self._namespace, typenode)
        arraynode = node.find(self.ns(self.NS_CORE, 'array'))
        if arraynode is not None:
            return TypeArray(self._namespace, arraynode)
        varnode = node.find(self.ns(self.NS_CORE, 'varargs'))
        if varnode is not None:
            return TypeVarArg(self._namespace, varnode)

    def ns(self, ns, tag):
        """ Universal function to generate NS string for XML usage

        Arguments:
        ns -- The namespace string
        tag -- The tag name

        Returns:
        The namespace string as "{%ns}%tag"
        """
        return "{%s}%s" % (ns, tag)

    def find_children(self, tag, node=None):
        """ Find direct children of the input etree node or the self.node
        etree object and return all children that match the tag name

        Arguments:
        tag -- Name of the tag to find as direct child (string)
        node -- an etree() node, if not present it will use self.node

        Returns:
        An array of etree node objects
        """
        if node is None:
            node = self.node
        # iterating an Element yields its direct children; getchildren()
        # was deprecated and removed in Python 3.9
        return [c for c in node if c.tag == tag]

    def getNamepspaceTag(self):
        """ Return the global main ast.Namespace.Namespace Object

        Returns:
        Instance of ast.Namespace.Namespace
        """
        return self._namespace

    def getName(self):
        """ Return the value of self.name which should be set in every
        ast instance

        Returns:
        String
        """
        assert self.name
        return self.name

    def getVersion(self):
        """ Return the self.version value if present

        If not present, return an empty string

        Returns:
        String
        """
        if self.version is None:
            return ''
        return self.version

    def getType(self):
        """ Return ast.Type* instance, if there is any type object

        This will only be available in Parameters, returnValue (etc)
        instances where a <type> node exists

        Returns:
        None or ast.Type* instance
        """
        return self.type

    def getCType(self):
        """ Return the Ctype, which is the attribute "c:type" from the XML node

        Returns:
        None or String
        """
        if hasattr(self, 'ctype'):
            return self.ctype
        return None

    def getDoc(self, asObjectRepr=False):
        """ Get the <doc> ast.Doc instance for this ast instance if there is any

        If called with Argument asObjectRepr=True it will directly call
        toObjectRepr() on the ast.Doc instance.

        Arguments:
        asObjectRepr -- true if directly call toObjectRepr() on ast.Doc
                        instance (default False)

        Returns:
        String or ast.Doc instance
        """
        if hasattr(self, 'doc'):
            if asObjectRepr is True:
                return self.doc.toObjectRepr()
            return self.doc
        return ''

    def getParentTree(self, result=None):
        """Get the Parent Tree for the current node if possible.

        Maybe only available for <class> nodes and derived ast.Klass
        implementations

        Returns:
        None or an sorted array with parents
        """
        if self._parent is None:
            return None
        if result is None:
            result = [self.name]
        else:
            result.append(self.name)
        if isinstance(self._parent, Base):
            self._parent.getParentTree(result)
        elif isinstance(self._parent, str):
            result.append(self._parent)
        else:
            # BUG FIX: "ValueException" is not a defined name — reaching
            # this branch raised NameError instead of the intended error
            raise ValueError('Unknown Type found in getParentTree()')
        return result
def main(): parser = argparse.ArgumentParser() parser.add_argument('--order', type=int) args = parser.parse_args() foodList1 = ['hot', 'chocolate', 'milk', 'taco', 'sandwich', 'sushi', 'schnitzel', 'naan', 'croissant', 'bulle'] vsp = VSP.VectorSpaceModel() vsp.loadDict(foodList1) ordered = False if args.order == 1: ordered=True doc1str = ' hot chocolate milk sandwich' doc1 = Doc(doc1str) doc1.vectorify(vsp) doc1Vec = doc1.getVector() doc2str = 'hot milk sandwich sushi naan' doc2 = Doc(doc2str) doc2.vectorify(vsp) doc2Vec = doc2.getVector() query1str = 'milk hot sushi' query1 = Doc(query1str) query1.vectorify(vsp, ordered=ordered) query1Vec = query1.getVector() score1 = vsp.dotProduct(query1Vec, doc1Vec) score2 = vsp.dotProduct(query1Vec, doc2Vec) # score3 = vsp.dotProduct(query1Vec, doc3Vec) query2str = 'hot milk sushi' query2 = Doc(query2str) query2.vectorify(vsp, ordered=ordered) query2Vec = query2.getVector() score21 = vsp.dotProduct(query2Vec, doc1Vec) score22 = vsp.dotProduct(query2Vec, doc2Vec) # print doc1Vec, doc2Vec # print query1Vec, query2Vec print "Dictionary:", foodList1 print "Document 1:", doc1str print "Document 2:", doc2str print "Query:", query1str print "Score for doc1:", score1 print "Score for doc2:", score2 print "Query 2:", query2str print "Score for doc1:", score21 print "Score for doc2:", score22 scoreMap_Q1 = {} scoreMap_Q2 = {} # the key-map pair below can be stored in a persistance layer for a fast lookup ... scoreMap_Q1[doc1.getDocNum()] = score1 scoreMap_Q1[doc2.getDocNum()] = score2 scoreMap_Q1_sorted = sortScoresWithOrder(scoreMap_Q1, desc=True) scoreMap_Q2_sorted = sortScoresWithOrder(scoreMap_Q2) # print scoreMap_Q1_sorted print "For query1- score sheet:" for k,v in scoreMap_Q1_sorted: print('DocNumber: %i, score:%5.4f' %(k,v))
docs = read_docs('../data/English.csv') preprocessor = EnglishPreprocessor(docs) else: docs = read_docs('../data/Persian.xml') preprocessor = PersianPreprocessor(docs) for doc in docs.values(): doc.words = preprocessor.preprocess(doc.text) print("Preprocess is done!") index = PositionalIndexer(docs, 1).index print("Index Created Successfully!") query = input("Enter Query: ") q_doc = Doc(0, query) q_doc.words = preprocessor.preprocess(q_doc.text) query_tag = input("Enter Tag (1, 2, 3, 4, None): ") tag = None if query_tag in ["1", "2", "3", "4"]: tag = int(query_tag) if tag is not None: classify(docs) results = search(q_doc, docs, index, 10, query_tag) for result in results: print(result[1]) print(result[0].text) print()
def parseCorpus(self, docToVerifiedSentences):
    """Parse the ECB+ XML corpus rooted at self.args.corpusPath into a Corpus.

    Walks every *.xml file (minus filtered training dirs), tokenizes each
    document, reads <Markables> into Mention objects and <Relations> into
    cross-doc (and optionally intra-doc) coreference clusters.

    Args:
        docToVerifiedSentences: doc_id -> set of sentence numbers that are
            annotation-verified; used to drop mentions outside them when
            self.args.onlyValidSentences is set.

    Returns:
        the populated Corpus.
    """
    # maps the long, original REF names to a small, more readable REF ID
    REFToUREF = {}
    UREF = 0
    print("* parsing ECB corpus:", self.args.corpusPath)
    numMentionsIgnored = 0
    corpus = Corpus()
    files = []
    # the first devDir training dirs are excluded from the skip-filter below
    filteredTrainingDirs = self.helper.trainingDirs[0:self.args.devDir]
    print("filteredTrainingDirs:", filteredTrainingDirs)
    for root, _, filenames in os.walk(self.args.corpusPath):
        for filename in fnmatch.filter(filenames, '*.xml'):
            f = os.path.join(root, filename)
            doc_id = f[f.rfind("/") + 1:]
            dir_num = int(doc_id.split("_")[0])
            # skip training dirs that were not kept for dev
            if dir_num in self.helper.trainingDirs and dir_num not in filteredTrainingDirs:
                continue
            files.append(os.path.join(root, filename))
    globalSentenceNum = 0
    lastToken_id = -1
    intraCount = 0
    # used for keeping track of how many mentions were pronouns
    had_pronoun = 0
    not_had_pronoun = 0
    num_events_with_pronouns = 0
    for f in sorted(files):
        lm_idToMention = {}  # only used to tmp store the mentions
        # keeps track of the mentions that had pronouns and we removed
        # (if we care to remove them)
        removed_m_ids = set()
        doc_id = f[f.rfind("/") + 1:]
        dir_num = int(doc_id.split("_")[0])
        # "ecb" vs "ecbplus" suffix splits each dir into two halves
        extension = doc_id[doc_id.find("ecb"):]
        dirHalf = str(dir_num) + extension
        curDoc = Doc(doc_id)
        corpus.ECBDirs[dir_num].docs[doc_id] = curDoc
        corpus.dirHalves[dirHalf].docs[doc_id] = curDoc
        tmpDocTokens = []
        tmpDocTokenIDsToTokens = {}
        # opens the xml file and makes needed replacements
        # NOTE(review): this handle is never closed — consider a with-block
        input_file = open(f, 'r', encoding="utf-8")
        #with open(f, 'r', encoding="utf-8") as myfile:
        fileContents = input_file.read().replace('\n', ' ')
        for badToken in self.replacementsList:  # self.replacementsSet:
            fileContents = fileContents.replace(badToken, self.replacements[badToken])
        # reads <tokens>
        it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
        lastSentenceNum = -1
        if self.write_stanford_input:
            tmp_line_to_stanford_input = defaultdict(list)
        # numbers every token in each given sentence,
        # starting at 1 (each sentence starts at 1)
        tokenNum = 0
        firstToken = True
        lastTokenText = ""
        for match in it:
            t_id = match.group(1)
            sentenceNum = int(match.group(2))
            hTokenNum = int(match.group(3))  # only used for matching w/ HDDCRP's files
            #tokenText = match.group(4).rstrip() # should be used if i'll write out hte corpus for Stan
            tokenText = match.group(4).lower().rstrip()
            # removes tokens that end in : (e.g., newspaper:) but leaves
            # the atomic ":" alone
            if len(tokenText) > 1 and tokenText[-1] == ":":
                tokenText = tokenText[:-1]
            # normalize a few known-bad quote artifacts
            if tokenText == "''":
                tokenText = "\""
            elif tokenText == "''bagman\"":
                tokenText = "\"bagman\""
                print("* replaced bagman1")
            elif tokenText == "''bagman":
                tokenText = "\"bagman"
                print("* replaced bagman2")
            if sentenceNum > curDoc.highestSentenceNum:
                curDoc.highestSentenceNum = sentenceNum
            # "plus" docs skip their sentence 0 (it holds metadata)
            if sentenceNum > 0 or "plus" not in doc_id:
                # writes Stanford_input
                if self.write_stanford_input:
                    tmp_line_to_stanford_input[int(sentenceNum)].append(match.group(4).rstrip())
                hSentenceNum = sentenceNum
                if "plus" in doc_id:
                    hSentenceNum = sentenceNum - 1
                # TMP
                '''
                if sentenceNum not in tmpSentenceNums:
                    tmpSentenceNums.append(sentenceNum)
                '''
                # we are starting a new sentence
                if sentenceNum != lastSentenceNum:
                    # we are possibly ending the prev sentence
                    if not firstToken:
                        # if sentence ended with an atomic ":", let's
                        # change it to a "."
                        if lastTokenText == ":":
                            lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                            lastToken.text = "."
                            tmpDocTokenIDsToTokens[lastToken_id] = lastToken
                        elif lastTokenText not in self.endPunctuation:
                            # synthesize a "." token to terminate the sentence
                            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, ".")
                            tmpDocTokens.append(endToken)
                        globalSentenceNum = globalSentenceNum + 1
                        tokenNum = 0
                # adds token
                curToken = Token(t_id, sentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, tokenText)
                #corpus.UIDToToken[curToken.UID] = curToken
                #curDoc.UIDs.append(curToken.UID)
                tmpDocTokenIDsToTokens[t_id] = curToken
                firstToken = False
                tmpDocTokens.append(curToken)
                tokenNum = tokenNum + 1
                curDoc.globalSentenceNums.add(globalSentenceNum)
                lastSentenceNum = sentenceNum
                lastTokenText = tokenText
                lastToken_id = t_id
        if self.write_stanford_input:
            tmpFOUT = open("../data/stanford_in/"+doc_id, "w")
            for sent_num in sorted(tmp_line_to_stanford_input.keys()):
                tmpFOUT.write(" ".join(tmp_line_to_stanford_input[sent_num]) + "\n")
            tmpFOUT.close()
        # if sentence ended with an atomic ":", let's change it to a "."
        if lastTokenText == ":":
            lastToken = tmpDocTokenIDsToTokens[lastToken_id]
            lastToken.text = "."
            tmpDocTokenIDsToTokens[lastToken_id] = lastToken
        elif lastTokenText not in self.endPunctuation:
            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, -1, -1, ".")
            tmpDocTokens.append(endToken)
        globalSentenceNum = globalSentenceNum + 1
        # reads <markables> 1st time
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            # gets the token IDs
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            tmpCurrentMentionSpanIDs = []
            hasAllTokens = True
            for match2 in it2:
                tokenID = match2.group(1)
                tmpCurrentMentionSpanIDs.append(int(tokenID))
                if tokenID not in tmpDocTokenIDsToTokens.keys():
                    hasAllTokens = False
        # register every token of this doc with the corpus
        for t in tmpDocTokens:
            corpus.addToken(t)
            curDoc.tokens.append(t)
            corpus.UIDToToken[t.UID] = t
            #if doc_id == "31_3ecbplus.xml":
            #    print("t:",t)
        # reads <markables> 2nd time
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            isPred = False
            mentionType = match.group(1)
            # ACTION_* markable tags denote event (predicate) mentions
            if "ACTION" in mentionType:
                isPred = True
            m_id = int(match.group(2))
            # gets the token IDs
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            tmpTokens = []
            text = []
            hasAllTokens = True
            has_pronoun = False
            for match2 in it2:
                tokenID = match2.group(1)
                if tokenID in tmpDocTokenIDsToTokens.keys():
                    cur_token = tmpDocTokenIDsToTokens[tokenID]
                    tmpTokens.append(cur_token)
                    text.append(cur_token.text)
                else:
                    hasAllTokens = False
            # only process Mentions if they adhere to our preferences of
            # using pronouns or not
            # determines if it has a pronoun or not (and if we care)
            if len(text) == 1:
                if text[0] in self.helper.pronouns:
                    has_pronoun = True
            if has_pronoun:
                had_pronoun += 1
                if isPred:
                    num_events_with_pronouns += 1
            else:
                not_had_pronoun += 1
            # possibly add the mention
            use_pronoun = False
            if isPred:
                use_pronoun = self.helper.event_pronouns
            else:
                use_pronoun = self.helper.entity_pronouns
            use_mention = True
            if not use_pronoun and has_pronoun:
                use_mention = False
                #print("* not constructing mention:", text)
                removed_m_ids.add(m_id)
            # we should only have incomplete Mentions for our hand-curated,
            # sample corpus, for we do not want to have all mentions, so we
            # curtail the sentences of tokens
            if hasAllTokens and use_mention:
                curMention = Mention(dirHalf, dir_num, doc_id, tmpTokens, text, isPred, mentionType)
                lm_idToMention[m_id] = curMention
                #tmpSentenceNumToMentions[tmpTokens[0].sentenceNum].append(curMention)
                #corpus.addMention(curMention, "123")
        # reads <relations>
        relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
        regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
        it = tuple(re.finditer(regex, relations))
        for match in it:
            REF = match.group(1)
            regex2 = r"<source m_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(2)))
            # only keep track of REFs for which we have found Mentions
            for match2 in it2:
                m_id = int(match2.group(1))
                if m_id not in lm_idToMention:
                    if m_id not in removed_m_ids:
                        print("*** MISSING MENTION! EXITING 1")
                        exit(1)
                else:
                    #elif lm_idToMention[m_id].isPred:
                    foundMention = lm_idToMention[m_id]
                    if self.onlyEvents and not foundMention.isPred:
                        continue
                    token0 = foundMention.tokens[0]
                    if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                        numMentionsIgnored += 1
                        continue
                    else:
                        corpus.addMention(foundMention, REF)
        if self.args.addIntraDocs:
            # within-document coreference chains get synthetic INTRA<N> REFs
            regex = r"<INTRA_DOC_COREF.*?>(.*?)?</.*?>"
            it = tuple(re.finditer(regex, relations))
            for match in it:
                regex2 = r"<source m_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(1)))
                # only keep track of REFs for which we have found Mentions
                for match2 in it2:
                    m_id = int(match2.group(1))
                    if m_id not in lm_idToMention:
                        print("*** MISSING MENTION! EXITING 2")
                        exit(1)
                    else:
                        foundMention = lm_idToMention[m_id]
                        if self.onlyEvents and not foundMention.isPred:
                            continue
                        token0 = foundMention.tokens[0]
                        if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                            numMentionsIgnored += 1
                            continue
                        else:
                            corpus.addMention(foundMention, "INTRA"+str(intraCount))
                            intraCount += 1
        corpus.addDocPointer(doc_id, curDoc)
        # optionally displays annotations (Mentions clearly designated w/
        # unique REF IDs#)
        if self.printCorpusTokens:
            print("\n------------------\ndoc:",doc_id,"\n------------------")
            sent_num = -1
            oline = ""
            lastMentions = set()
            for t in curDoc.tokens:
                if t.sentenceNum != sent_num and sent_num != -1:
                    # NOTE(review): sent_num is only assigned inside this
                    # branch, whose condition requires sent_num != -1, so it
                    # appears to never leave -1 — confirm intended behavior
                    sent_num = t.sentenceNum
                    print(oline)
                    oline = ""
                added = False
                removed = False
                urefToAdd = -1
                entOrEventToAdd = ""
                for m in t.mentions:
                    if m not in lastMentions:
                        # assign (or reuse) a compact UREF id for this REF
                        if m.REF in REFToUREF.keys():
                            urefToAdd = REFToUREF[m.REF]
                        else:
                            urefToAdd = UREF
                            REFToUREF[m.REF] = UREF
                            UREF += 1
                        if m.isPred:
                            entOrEventToAdd = "v"
                        else:
                            entOrEventToAdd = "ent"
                        added = True
                if len(lastMentions) > 0:
                    for m in lastMentions:
                        if m not in t.mentions:
                            removed = True
                if removed:
                    oline += "] "
                if added:
                    if len(oline) > 0 and oline[-1] != " ":
                        oline += " "
                    oline += str(entOrEventToAdd) + str(urefToAdd) + "["
                if len(oline) > 0 and oline[-1] != " " and oline[-1] != "[":
                    oline += " "
                oline += str(t.text)
                lastMentions = t.mentions
            print(oline)
    corpus.assignGlobalSentenceNums()
    print("numMentionsIgnored:", numMentionsIgnored)
    print("# ECB mentions created:", len(corpus.ecb_mentions))
    num_events = 0
    for m in corpus.ecb_mentions:
        if m.isPred:
            num_events += 1
    print("\t# events:", num_events)
    print("\t\t# of event which had pronouns:", num_events_with_pronouns)
    print("\t# entities:", len(corpus.ecb_mentions) - num_events)
    print("# ECB+ tokens:", len(corpus.corpusTokens))
    print("# mentions that had_pronoun:", had_pronoun)
    print("# mentions that did not had_pronoun:", not_had_pronoun)
    return corpus