from flask import request, jsonify

def list_all_entities():
    """POST request; generates a response listing the details of all entities.

    Args:
        raw text: the user should POST an article (raw text format) to this API.

    Returns:
        json: the details of the entities in the article, in JSON format.
    """
    article = request.data.decode()
    my_doc = Doc(article)  # project-level Doc wrapper around the parsed article
    entities = []
    mapping = my_doc.map_position_start_index()
    for ent in my_doc.get_doc().ents:
        ent_dic = {}
        start_index = ent.start_char
        position = my_doc.get_position(start_index, mapping)
        label = my_doc.get_label(ent)
        ent_dic["entity"] = ent.text
        ent_dic["position"] = position
        ent_dic["label"] = label
        entities.append(ent_dic)
    return jsonify(entities)
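# A minimal client sketch for the endpoint above, assuming the Flask app runs
# locally and this view is mapped to a route such as "/entities" (hypothetical;
# the route decorator is not shown in the source).
import requests

article = "Apple is looking at buying a U.K. startup for $1 billion."
resp = requests.post("http://localhost:5000/entities", data=article.encode())
for ent in resp.json():
    print(ent["entity"], ent["position"], ent["label"])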
def parse_doc(self):
    """Parse a <doc> node in the ast.* instance, if there is any."""
    doc = self.node.find(self.ns(self.NS_CORE, 'doc'))
    if doc is not None:
        # Import Doc here because the nested usage of the Base class
        # would otherwise cause an infinite import loop.
        from Doc import Doc
        self.doc = Doc(self._namespace, doc)
import pandas as pd

def extract_csv(filename):
    """Read documents from a CSV file, combining Title and Text; keep Tag if present."""
    docs = {}
    df = pd.read_csv(filename)
    for i in range(df.shape[0]):
        text = df['Title'].values[i] + " " + df['Text'].values[i]
        if "Tag" in df.columns:
            tag = df['Tag'].values[i]
        else:
            tag = None
        docs[i] = Doc(i, text, tag)
    return docs
import xml.etree.ElementTree as ET

MEDIAWIKI_NS = "{http://www.mediawiki.org/xml/export-0.10/}"

def extract_xml(filename):
    """Read documents from a MediaWiki XML export."""
    tree = ET.parse(filename)
    root = tree.getroot()
    docs = {}
    for page in root.findall(MEDIAWIKI_NS + "page"):
        page_id = int(page.find(MEDIAWIKI_NS + "id").text)
        text = page.find(MEDIAWIKI_NS + "revision") \
                   .find(MEDIAWIKI_NS + "text").text
        docs[page_id] = Doc(page_id, text)
    return docs
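# A usage sketch for the two extractors above; the file paths follow the
# script further below and are placeholders for real data files.
csv_docs = extract_csv("../data/English.csv")
xml_docs = extract_xml("../data/Persian.xml")
print(len(csv_docs), "docs from CSV;", len(xml_docs), "docs from XML")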
import resource
from datetime import datetime

def load_docs(docs_filename, numOfDoc, vocab, model):
    cnt = 0
    docs = [None] * numOfDoc
    len_sum = 0
    for line in open(docs_filename):
        doc = Doc(line, vocab)
        doc.init_varational_parameters(vocab, model)
        len_sum += len(doc)
        docs[cnt] = doc
        if cnt % 1000 == 0:
            print("progress:", cnt,
                  "memory usage:", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000,
                  "time:", datetime.now())
        cnt += 1
        if cnt >= numOfDoc:
            break
    print("ave length of doc:", float(len_sum) / cnt)
    return docs
def __init__(self, docs, gram):
    self.index = {}
    self.docs = docs
    self.gram = gram
    if self.gram == 2:
        # For bigram indexing, re-index the vocabulary itself: each unique word
        # becomes a tiny "document" whose terms are its character bigrams,
        # padded with '#' to mark word boundaries.
        all_words = unique([inner for outer in docs for inner in docs[outer].words])
        new_docs = {}
        for i in range(len(all_words)):
            word = "#" + all_words[i] + "#"
            id = i
            words = [word[j:j + 2] for j in range(len(word) - 1)]
            doc = Doc(id, ' '.join(words))
            doc.words = words
            new_docs[id] = doc
        self.docs = new_docs
    self.create_index()
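# A usage sketch for the bigram branch above, assuming this __init__ belongs to
# the PositionalIndexer used in the script further below, and that Doc(id, text)
# exposes .text and .words as in the extractors above.
docs = {0: Doc(0, "hello world"), 1: Doc(1, "hello help")}
for d in docs.values():
    d.words = d.text.split()
index = PositionalIndexer(docs, 2)  # gram=2 indexes character bigrams of the vocabulary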
from psycopg2.extras import RealDictCursor

def fetch_and_create_doc(connection, name="Monaco"):
    print("=================================================================")
    print("Trying to fetch row and create doc")
    # Parameterized query: avoids the SQL-injection risk of string concatenation.
    sql = """SELECT place_id, osm_id, osm_type, name, address,
                    country_code, housenumber, postcode
             FROM placex
             WHERE name->'name' LIKE %s
             LIMIT 1"""
    cursor = connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(sql, (name,))
    record = cursor.fetchone()
    print(sql, "\n")
    doc = Doc(record)
    print("osm_id:", doc.osm_id)
    print("osm_type:", doc.osm_type)
    print("name tags as dictionary:", doc.name)
    cursor.close()
    return doc
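# A connection sketch for the function above; the database name and credentials
# are placeholders (the placex table suggests a Nominatim database).
import psycopg2

conn = psycopg2.connect(dbname="nominatim", user="postgres", host="localhost")
doc = fetch_and_create_doc(conn, name="Monaco")
conn.close()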
def __init__(self, filename):
    self.docs = []
    self.vocab_size = 0
    self.n_docs = 0
    print("reading data from : " + filename)
    f = open(filename)
    for line in f.readlines():
        parts = line.split()
        doc = Doc()
        for i, part in enumerate(parts):
            if i == 0:
                doc.length = int(part)
            else:
                word, count = map(int, part.split(":"))
                doc.words.append(word)
                doc.word_counts.append(count)
                doc.total += count
                if word >= self.vocab_size:
                    self.vocab_size = word + 1
        self.docs.append(doc)  # bug fix: parsed docs were never stored, so n_docs was always 0
    f.close()
    self.n_docs = len(self.docs)
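# The file parsed above is in LDA-C style: each line is
# "<num_terms> <word_id>:<count> ...". A toy sketch; the class name Corpus is an
# assumption, since the source shows only the __init__ above.
with open("toy.dat", "w") as out:
    out.write("3 0:2 4:1 7:3\n")
    out.write("2 1:1 4:2\n")
corpus = Corpus("toy.dat")
print(corpus.n_docs, corpus.vocab_size)  # expect: 2 8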
def banco():
    # Demo script translated from Portuguese; identifiers kept as-is.
    # Doc here likely models a DOC, a Brazilian interbank transfer.
    c1 = Cliente("Eduardo", 1200.00, 5000.00)
    c2 = Cliente("Carlos", 2000.00, 900.00)
    print("Withdrawals screen:")
    c1.Sacar(6200.00)
    print("Balance of client", c1.getNome(), "is:", c1.checarSaldo())
    c2.Sacar(100.0)
    print("Balance of client", c2.getNome(), "is:", c2.checarSaldo())
    c2.Sacar(3000.0)
    d1 = Doc()
    print("------------------------ DOC transfers screen: -------------------------------")
    print("Initial balance of client", c1.getNome(), "is:", c1.checarSaldo())
    print("Initial balance of client", c2.getNome(), "is:", c2.checarSaldo())
    print("------------------------ Transfer 1: --------------------------")
    d1.transferir(c1, c2, 300.00)
    print("Updated balance of client", c1.getNome(), "is:", c1.checarSaldo())
    print("Updated balance of client", c2.getNome(), "is:", c2.checarSaldo())
    print("------------------------ Transfer 2: --------------------------")
    d1.transferir(c1, c2, 30000.00)
    print("Updated balance of client", c1.getNome(), "is:", c1.checarSaldo())
    print("Updated balance of client", c2.getNome(), "is:", c2.checarSaldo())
    print("------------------------ Transfer 3: --------------------------")
    d1.transferir(c1, c2, 5900.00)
    print("Updated balance of client", c1.getNome(), "is:", c1.checarSaldo())
    print("Updated balance of client", c2.getNome(), "is:", c2.checarSaldo())
import uuid

from flask import request, jsonify
from spacy import displacy

def visualization():
    """POST request; can be made via Postman.

    Notes:
        the body of the POST request must be 'raw text'.

    Args:
        raw text: the body of the POST request should be raw text.

    Returns:
        String: a URL containing the reference number of this POST; the user
        can open the URL in a browser to see the visualized
        entity-extraction result.
    """
    article = request.data.decode()  # str
    my_doc = Doc(article)
    reference_number = str(uuid.uuid4())
    html = displacy.render(my_doc.get_doc(), style="ent")
    catch.add_reference(reference_number, html)  # catch maps reference numbers to rendered HTML
    return jsonify({
        'your reference': f"http://{request.host}/get?reference={reference_number}"
    })
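# A client sketch for the visualization endpoint above, assuming it is mapped
# to a route such as "/visualization" (hypothetical; the decorator is not shown).
import requests

resp = requests.post("http://localhost:5000/visualization",
                     data="Alan Turing was born in London.".encode())
print(resp.json()["your reference"])  # open this URL in a browser to view the HTML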
# The opening `if` of this branch is missing in the source; the condition is
# inferred from the two branches (English CSV vs. Persian XML), and `lang` is
# a placeholder name.
if lang == "English":
    docs = read_docs('../data/English.csv')
    preprocessor = EnglishPreprocessor(docs)
else:
    docs = read_docs('../data/Persian.xml')
    preprocessor = PersianPreprocessor(docs)

for doc in docs.values():
    doc.words = preprocessor.preprocess(doc.text)
print("Preprocess is done!")

index = PositionalIndexer(docs, 1).index
print("Index Created Successfully!")

query = input("Enter Query: ")
q_doc = Doc(0, query)
q_doc.words = preprocessor.preprocess(q_doc.text)

query_tag = input("Enter Tag (1, 2, 3, 4, None): ")
tag = None
if query_tag in ["1", "2", "3", "4"]:
    tag = int(query_tag)
if tag is not None:
    classify(docs)

results = search(q_doc, docs, index, 10, query_tag)
for result in results:
    print(result[1])
    print(result[0].text)
    print()
def parseCorpus(self, docToVerifiedSentences):
    # (method of the corpus-parser class; relies on os, re, fnmatch, and
    # collections.defaultdict being imported at module level)

    # maps the long, original REF names to a small, more readable REF ID
    REFToUREF = {}
    UREF = 0
    print("* parsing ECB corpus:", self.args.corpusPath)
    numMentionsIgnored = 0
    corpus = Corpus()
    files = []
    filteredTrainingDirs = self.helper.trainingDirs[0:self.args.devDir]
    print("filteredTrainingDirs:", filteredTrainingDirs)
    for root, _, filenames in os.walk(self.args.corpusPath):
        for filename in fnmatch.filter(filenames, '*.xml'):
            f = os.path.join(root, filename)
            doc_id = f[f.rfind("/") + 1:]
            dir_num = int(doc_id.split("_")[0])
            if dir_num in self.helper.trainingDirs and dir_num not in filteredTrainingDirs:
                continue
            files.append(os.path.join(root, filename))

    globalSentenceNum = 0
    lastToken_id = -1
    intraCount = 0

    # used for keeping track of how many mentions were pronouns
    had_pronoun = 0
    not_had_pronoun = 0
    num_events_with_pronouns = 0

    for f in sorted(files):
        lm_idToMention = {}  # only used to temporarily store the mentions
        removed_m_ids = set()  # mentions that had pronouns and were removed (if we care to remove them)
        doc_id = f[f.rfind("/") + 1:]
        dir_num = int(doc_id.split("_")[0])
        extension = doc_id[doc_id.find("ecb"):]
        dirHalf = str(dir_num) + extension
        curDoc = Doc(doc_id)
        corpus.ECBDirs[dir_num].docs[doc_id] = curDoc
        corpus.dirHalves[dirHalf].docs[doc_id] = curDoc
        tmpDocTokens = []
        tmpDocTokenIDsToTokens = {}

        # opens the xml file and makes the needed replacements
        with open(f, 'r', encoding="utf-8") as input_file:
            fileContents = input_file.read().replace('\n', ' ')
        for badToken in self.replacementsList:
            fileContents = fileContents.replace(badToken, self.replacements[badToken])

        # reads <token> elements
        it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
        lastSentenceNum = -1
        if self.write_stanford_input:
            tmp_line_to_stanford_input = defaultdict(list)

        # numbers every token within each given sentence (each sentence restarts the count)
        tokenNum = 0
        firstToken = True
        lastTokenText = ""
        for match in it:
            t_id = match.group(1)
            sentenceNum = int(match.group(2))
            hTokenNum = int(match.group(3))  # only used for matching w/ HDDCRP's files
            # tokenText = match.group(4).rstrip()  # use this (no lowercasing) when writing out the corpus for Stanford
            tokenText = match.group(4).lower().rstrip()
            # removes the trailing ":" of tokens (e.g., newspaper:) but leaves the atomic ":" alone
            if len(tokenText) > 1 and tokenText[-1] == ":":
                tokenText = tokenText[:-1]
            if tokenText == "''":
                tokenText = "\""
            elif tokenText == "''bagman\"":
                tokenText = "\"bagman\""
                print("* replaced bagman1")
            elif tokenText == "''bagman":
                tokenText = "\"bagman"
                print("* replaced bagman2")
            if sentenceNum > curDoc.highestSentenceNum:
                curDoc.highestSentenceNum = sentenceNum
            if sentenceNum > 0 or "plus" not in doc_id:
                # writes Stanford input
                if self.write_stanford_input:
                    tmp_line_to_stanford_input[int(sentenceNum)].append(match.group(4).rstrip())
                hSentenceNum = sentenceNum
                if "plus" in doc_id:
                    hSentenceNum = sentenceNum - 1

                # we are starting a new sentence
                if sentenceNum != lastSentenceNum:
                    # we are possibly ending the previous sentence
                    if not firstToken:
                        # if the sentence ended with an atomic ":", change it to a "."
                        if lastTokenText == ":":
                            lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                            lastToken.text = "."
                            tmpDocTokenIDsToTokens[lastToken_id] = lastToken
                        elif lastTokenText not in self.endPunctuation:
                            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, ".")
                            tmpDocTokens.append(endToken)
                        globalSentenceNum = globalSentenceNum + 1
                        tokenNum = 0

                # adds the token
                curToken = Token(t_id, sentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, tokenText)
                tmpDocTokenIDsToTokens[t_id] = curToken
                firstToken = False
                tmpDocTokens.append(curToken)
                tokenNum = tokenNum + 1
                curDoc.globalSentenceNums.add(globalSentenceNum)
                lastSentenceNum = sentenceNum
                lastTokenText = tokenText
                lastToken_id = t_id

        if self.write_stanford_input:
            tmpFOUT = open("../data/stanford_in/" + doc_id, "w")
            for sent_num in sorted(tmp_line_to_stanford_input.keys()):
                tmpFOUT.write(" ".join(tmp_line_to_stanford_input[sent_num]) + "\n")
            tmpFOUT.close()

        # if the last sentence ended with an atomic ":", change it to a "."
        if lastTokenText == ":":
            lastToken = tmpDocTokenIDsToTokens[lastToken_id]
            lastToken.text = "."
            tmpDocTokenIDsToTokens[lastToken_id] = lastToken
        elif lastTokenText not in self.endPunctuation:
            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, -1, -1, ".")
            tmpDocTokens.append(endToken)
        globalSentenceNum = globalSentenceNum + 1

        # reads <Markables> the 1st time (span-completeness check)
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>") + 11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            # gets the token IDs
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            tmpCurrentMentionSpanIDs = []
            hasAllTokens = True
            for match2 in it2:
                tokenID = match2.group(1)
                tmpCurrentMentionSpanIDs.append(int(tokenID))
                if tokenID not in tmpDocTokenIDsToTokens.keys():
                    hasAllTokens = False

        for t in tmpDocTokens:
            corpus.addToken(t)
            curDoc.tokens.append(t)
            corpus.UIDToToken[t.UID] = t

        # reads <Markables> the 2nd time (constructs the Mentions)
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>") + 11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            isPred = False
            mentionType = match.group(1)
            if "ACTION" in mentionType:
                isPred = True
            m_id = int(match.group(2))
            # gets the token IDs
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            tmpTokens = []
            text = []
            hasAllTokens = True
            has_pronoun = False
            for match2 in it2:
                tokenID = match2.group(1)
                if tokenID in tmpDocTokenIDsToTokens.keys():
                    cur_token = tmpDocTokenIDsToTokens[tokenID]
                    tmpTokens.append(cur_token)
                    text.append(cur_token.text)
                else:
                    hasAllTokens = False

            # only process Mentions that adhere to our preferences about pronouns:
            # determines if the mention is a pronoun (and if we care)
            if len(text) == 1 and text[0] in self.helper.pronouns:
                has_pronoun = True
            if has_pronoun:
                had_pronoun += 1
                if isPred:
                    num_events_with_pronouns += 1
            else:
                not_had_pronoun += 1

            # possibly add the mention
            if isPred:
                use_pronoun = self.helper.event_pronouns
            else:
                use_pronoun = self.helper.entity_pronouns
            use_mention = True
            if not use_pronoun and has_pronoun:
                use_mention = False
                removed_m_ids.add(m_id)

            # we should only have incomplete Mentions for our hand-curated sample
            # corpus, for which we do not want all mentions, since we curtail the
            # sentences of tokens
            if hasAllTokens and use_mention:
                curMention = Mention(dirHalf, dir_num, doc_id, tmpTokens, text, isPred, mentionType)
                lm_idToMention[m_id] = curMention

        # reads <Relations>
        relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
        regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
        it = tuple(re.finditer(regex, relations))
        for match in it:
            REF = match.group(1)
            regex2 = r"<source m_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(2)))
            # only keep track of REFs for which we have found Mentions
            for match2 in it2:
                m_id = int(match2.group(1))
                if m_id not in lm_idToMention:
                    if m_id not in removed_m_ids:
                        print("*** MISSING MENTION! EXITING 1")
                        exit(1)
                else:
                    foundMention = lm_idToMention[m_id]
                    if self.onlyEvents and not foundMention.isPred:
                        continue
                    token0 = foundMention.tokens[0]
                    if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                        numMentionsIgnored += 1
                        continue
                    else:
                        corpus.addMention(foundMention, REF)

        if self.args.addIntraDocs:
            regex = r"<INTRA_DOC_COREF.*?>(.*?)?</.*?>"
            it = tuple(re.finditer(regex, relations))
            for match in it:
                regex2 = r"<source m_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(1)))
                # only keep track of REFs for which we have found Mentions
                for match2 in it2:
                    m_id = int(match2.group(1))
                    if m_id not in lm_idToMention:
                        print("*** MISSING MENTION! EXITING 2")
                        exit(1)
                    else:
                        foundMention = lm_idToMention[m_id]
                        if self.onlyEvents and not foundMention.isPred:
                            continue
                        token0 = foundMention.tokens[0]
                        if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                            numMentionsIgnored += 1
                            continue
                        else:
                            corpus.addMention(foundMention, "INTRA" + str(intraCount))
                            intraCount += 1

        corpus.addDocPointer(doc_id, curDoc)

        # optionally displays annotations (Mentions clearly designated w/ unique REF IDs)
        if self.printCorpusTokens:
            print("\n------------------\ndoc:", doc_id, "\n------------------")
            sent_num = -1
            oline = ""
            lastMentions = set()
            for t in curDoc.tokens:
                # flush the line whenever we move to a new sentence
                if t.sentenceNum != sent_num and sent_num != -1:
                    print(oline)
                    oline = ""
                sent_num = t.sentenceNum
                added = False
                removed = False
                urefToAdd = -1
                entOrEventToAdd = ""
                for m in t.mentions:
                    if m not in lastMentions:
                        if m.REF in REFToUREF.keys():
                            urefToAdd = REFToUREF[m.REF]
                        else:
                            urefToAdd = UREF
                            REFToUREF[m.REF] = UREF
                            UREF += 1
                        if m.isPred:
                            entOrEventToAdd = "v"
                        else:
                            entOrEventToAdd = "ent"
                        added = True
                if len(lastMentions) > 0:
                    for m in lastMentions:
                        if m not in t.mentions:
                            removed = True
                if removed:
                    oline += "] "
                if added:
                    if len(oline) > 0 and oline[-1] != " ":
                        oline += " "
                    oline += str(entOrEventToAdd) + str(urefToAdd) + "["
                if len(oline) > 0 and oline[-1] != " " and oline[-1] != "[":
                    oline += " "
                oline += str(t.text)
                lastMentions = t.mentions
            print(oline)

    corpus.assignGlobalSentenceNums()
    print("numMentionsIgnored:", numMentionsIgnored)
    print("# ECB mentions created:", len(corpus.ecb_mentions))
    num_events = 0
    for m in corpus.ecb_mentions:
        if m.isPred:
            num_events += 1
    print("\t# events:", num_events)
    print("\t\t# of events which had pronouns:", num_events_with_pronouns)
    print("\t# entities:", len(corpus.ecb_mentions) - num_events)
    print("# ECB+ tokens:", len(corpus.corpusTokens))
    print("# mentions that had a pronoun:", had_pronoun)
    print("# mentions that did not have a pronoun:", not_had_pronoun)
    return corpus
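# A minimal sketch of the <token> regex used in parseCorpus, applied to one
# ECB+-style token element (the sample XML line is illustrative, not from the corpus):
import re

sample = '<token t_id="7" sentence="0" number="6">Lohan</token>'
m = re.search(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>", sample)
print(m.group(1), m.group(2), m.group(3), m.group(4))  # 7 0 6 Lohan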