def _getEventDetailsFromOverview(self, event, n):
    url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
    title = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_TITLE)
    startStr = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_STARTTIME)
    if not startStr:
        logger.error(f'Cannot find start time on page {url}')
        return None
    startTime = datetime.strptime(startStr, self.FORMAT_EVENT_DATE)
    startTime = utils.normalizeDate(startTime, self.config['defaults']['timezone'])
    rawEvent = RawEvent(self.IDENTIFIER, f'{url}#{n}', title, startTime)
    location = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_LOCATION)
    rawEvent.setLocation(location)
    majors = utils.Soup.getTextAt(event, self.SELECTOR_MAJORS)
    # Guard against a missing majors field before stripping the prefix.
    if majors and majors.lower().startswith('majors'):
        majors = majors[len('majors'):]
    rawEvent.setAudience(majors)
    description, links = utils.Soup.tokenizeElemAt(event, self.SELECTOR_EVENT_DESCRIPTION, base=url)
    links = map(utils.normalizeURL(base=self.URL), links)
    links = set(filter(None, links))
    rawEvent.setDescription(description)
    rawEvent.setLinks(links)
    return rawEvent
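# The scrapers call utils.normalizeURL in two ways: with both base and url it
# returns a normalized absolute URL, and with only base it is passed straight
# to map(), so it must return a one-argument callable. A minimal sketch of
# such a curried helper, assuming urllib.parse semantics; the real
# utils.normalizeURL may differ in what it strips or rejects.
from functools import partial
from urllib.parse import urljoin, urlsplit, urlunsplit

def normalizeURL(url=None, base=None):
    if url is None:
        # Curried form: normalizeURL(base=...) yields a callable for map().
        return partial(normalizeURL, base=base)
    resolved = urljoin(base, url) if base else url
    parts = urlsplit(resolved)
    if parts.scheme not in ('http', 'https'):
        return None  # the filter(None, links) calls above drop these
    # Lower-case scheme and host, drop the fragment.
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path, parts.query, ''))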
def getEventList(self):
    url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
    reqData = self.LIST_REQUEST_DATA.copy()
    backlogTime = datetime.now() - self.config['backlog']
    backlogTime = self.config['defaults']['timezone'].localize(backlogTime)
    reqData['endsAfter'] = backlogTime.replace(microsecond=0).isoformat()
    res = self.requester.fetchURL(url, data=reqData, json=True)
    events = set()
    if not res or 'value' not in res:
        # Defensive guard: fetchURL may fail and return None.
        return events
    for event in res['value']:
        eventURL = utils.normalizeURL(base=self.DOMAIN, url=self.EVENT_URL)
        eventURL = eventURL.format(event['id'])
        startTime = datetime.fromisoformat(event['startsOn'])
        startTime = utils.normalizeDate(startTime, self.config['defaults']['timezone'])
        rawEvent = RawEvent(self.IDENTIFIER, eventURL, event['name'], startTime)
        if 'endsOn' in event:
            endTime = datetime.fromisoformat(event['endsOn'])
            endTime = utils.normalizeDate(endTime, self.config['defaults']['timezone'])
            rawEvent.setEnd(endTime)
        rawEvent.setLocation(event['location'])
        rawEvent.setExtras(', '.join(event.get('benefitNames', ())))
        soup = BeautifulSoup(event['description'], 'html.parser')
        description, links = utils.HTMLToText.tokenizeSoup(soup, base=url, customStyle=self.DESCRIPTION_STYLE)
        links = map(utils.normalizeURL(base=eventURL), links)
        links = set(filter(None, links))
        rawEvent.setDescription(description)
        rawEvent.setLinks(links)
        rawEvent.setStatus(event['status'])
        events.add(rawEvent)
    return events
def appendViews(c, timePickle):
    # Attach the temporal popularity ("tp") of each candidate entity,
    # taken from the page-view pickle when available.
    for cand in c:
        entity = utils.normalizeURL(cand[0])
        view = 0.0
        if entity in timePickle:
            view = timePickle[entity]
        cand[1]['tp'] = view
    return c
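# A hedged usage sketch of the candidate structure consumed above and by
# disambiguateEntity below: each candidate pairs an entity with its feature
# dict, and timePickle maps normalized entity URLs to page-view scores.
# All values here are illustrative.
candidates = [
    ['http://dbpedia.org/resource/Barack_Obama', {'ss': 0.9, 'count': 120}],
    ['http://dbpedia.org/resource/Obama,_Fukui', {'ss': 0.4, 'count': 3}],
]
views = {'http://dbpedia.org/resource/Barack_Obama': 0.7}
candidates = appendViews(candidates, views)  # adds the 'tp' key to each dict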
def disambiguateEntity(candidates, weights, resolvedEntities, factorWeights,
                       maxCount, currentId, limit):
    if not len(candidates):
        return "--NME--", 1.0
    max_score = limit
    aging_factor = 0.01
    best_candidate = None
    # resolvedEntities is keyed by stringified ids, so look up str(currentId)
    # (the original checked the int key but deleted the string key).
    if str(currentId) in resolvedEntities:
        del resolvedEntities[str(currentId)]
    candidates = normalizeTPs(candidates)
    for cand in candidates:
        candidate = cand[0]
        ss = cand[1]["ss"]
        associativeness = cand[1]["count"] / maxCount
        # normalizationFactor = maxCoherence(weights, min(10, len(resolvedEntities)))
        normalizationFactor = 1.0
        coherence = computeCoherence(candidate, resolvedEntities, weights) / normalizationFactor
        lastId = getPreviousOccurrence(utils.normalizeURL(candidate),
                                       resolvedEntities, currentId - 1)
        recency = 0.0
        if lastId > -1:
            age = abs(currentId - lastId)
            recency = (1 - aging_factor)**age
        temporalPopularity = cand[1]["tp"]
        # Weighted sum of the five disambiguation signals.
        score = (factorWeights['wss'] * ss
                 + factorWeights['wc'] * coherence
                 + factorWeights['wa'] * associativeness
                 + factorWeights['wr'] * recency
                 + factorWeights['wt'] * temporalPopularity)
        # Keep the highest-scoring candidate; break ties on the shorter URL.
        if (score > limit
                and (score > max_score
                     or (score == max_score and len(candidate) < len(best_candidate)))
                and not isDisambiguation(candidate)):
            max_score = score
            best_candidate = candidate
    if best_candidate is None:
        # No candidate cleared the score limit.
        return "--NME--", 1.0
    return utils.normalizeURL(best_candidate), max_score
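# Hedged end-to-end sketch of how the pieces fit together; weight values and
# ids below are illustrative, not taken from the original configuration.
factorWeights = {'wss': 0.4, 'wc': 0.2, 'wa': 0.2, 'wr': 0.1, 'wt': 0.1}
weights = {str(d): 0.9**(d - 1) for d in range(1, 11)}  # coherence decay, see computeCoherence
resolvedEntities = {'1': 'Barack_Obama', '2': '--NME--'}  # earlier mentions
candidates, maxCount = generateCandidatesWithLOTUS('Obama')
candidates = appendViews(candidates, timePickle={})
link, score = disambiguateEntity(candidates, weights, resolvedEntities,
                                 factorWeights, maxCount, currentId=3, limit=0.3)
# With the recency decay (1 - 0.01) ** age, an entity last seen 10 mentions
# ago still contributes roughly 0.90 of its full recency weight.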
def generateCandidatesWithLOTUS(mention, minSize=10, maxSize=100):
    # Look up candidate entities for a mention, caching LOTUS results in Redis.
    normalized = utils.normalizeURL(mention)
    fromCache = rds.get("lotus:%s" % normalized)
    if fromCache:
        cands = pickle.loads(fromCache)
    else:
        cands = getCandidatesForLemma(mention, minSize, maxSize)
        cands = cleanRedirects(cands)
        rds.set("lotus:" + normalized, pickle.dumps(cands))
    sortedCands = sorted(cands.items(), key=lambda x: x[1]["count"], reverse=True)
    maxCount = getMaxCount(cands.items())
    return sortedCands, maxCount
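# The caching pattern above (pickle the candidate dict into Redis under a
# "lotus:<normalized mention>" key) generalizes to a small helper. A sketch,
# assuming rds is a redis.Redis client; the ttl argument is an illustrative
# extension, the original code caches without expiry.
import pickle
import redis

rds = redis.Redis()

def cachedCandidates(mention, compute, ttl=None):
    key = "lotus:%s" % utils.normalizeURL(mention)
    raw = rds.get(key)
    if raw is not None:
        return pickle.loads(raw)
    value = compute(mention)
    rds.set(key, pickle.dumps(value), ex=ttl)
    return value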
def computeCoherence(newEntity, previousEntities, w):
    # Sum distance-weighted shortest-path coherence between the new entity
    # and previously resolved entities; w maps stringified mention distances
    # to weights.
    total = 0.0
    current_id = len(previousEntities) + 1
    other_id = current_id - 1
    while other_id > 0 and str(current_id - other_id) in w:
        diff = abs(current_id - other_id)
        weight = w[str(diff)]
        max_score = 0.0
        if diff == 1 or shouldITry(max_score, total, diff, current_id, w):
            # total += computePairCoherence(graph.node[other_id]['eid'], newEntity.replace('http://dbpedia.org/resource/', ''), weight)
            if (str(other_id) in previousEntities
                    and previousEntities[str(other_id)] != '--NME--'):
                total += computeShortestPathCoherence(
                    previousEntities[str(other_id)],
                    utils.normalizeURL(newEntity), weight)
            other_id -= 1
        else:
            break
    return total
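# Hedged sketch of the weight dict consumed above: coherence contributions
# decay with the distance between mention ids, e.g. geometrically. The loop
# walks backwards from the newest mention and stops once w has no entry for
# the distance or shouldITry decides the remaining mass cannot change the
# outcome. Values are illustrative.
w = {str(d): 0.9**(d - 1) for d in range(1, 11)}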
def getEventList(self):
    events = set()
    lastURL = self.DOMAIN
    nextURL = self.URL

    def _getLink(elem, base):
        def _matchesEventLink(link):
            if not link:
                return False
            try:
                parsed = urlparse(link)
            except ValueError:
                return False
            return (parsed.netloc == 'www.chemistry.gatech.edu'
                    and parsed.path.startswith(self.LINK_PREFIX_INCLUDE))

        links = elem.select(self.SELECTOR_EVENT_LINK)
        links = map(utils.Soup.getElemLink, links)
        links = map(utils.normalizeURL(base=base), links)
        links = filter(_matchesEventLink, links)
        return utils.firstOrNone(links)

    while True:
        nextURL = utils.normalizeURL(base=lastURL, url=nextURL)
        overview = self.requester.fetchURL(nextURL)
        if not overview:
            break
        soup = BeautifulSoup(overview, 'html.parser')
        evs = soup.select(self.SELECTOR_EVENTS)
        evs = map(lambda l: _getLink(l, nextURL), evs)
        events |= set(filter(None, evs))
        lastURL = nextURL
        nextURL = utils.Soup.getLinkAt(soup, self.SELECTOR_NEXT_PAGE)
        if not nextURL:
            break
    return events
def _getEventDetails(self, eventURL):
    details = self.requester.fetchURL(eventURL)
    if not details:
        return None
    soup = BeautifulSoup(details, 'html.parser')
    title1 = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE1)
    title2 = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE2)
    if title1 and title2:
        title = '%s: %s' % (title1, title2)
    else:
        title = utils.firstOrNone(filter(None, (title1, title2)))
    singleStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMESINGLE)
    startStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMESTART)
    endStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMEEND)
    startTime, endTime = self._parseEventTime(singleStr, startStr, endStr)
    startTime = utils.normalizeDate(startTime, self.config['defaults']['timezone'])
    event = RawEvent(self.IDENTIFIER, eventURL, title, startTime)
    event.setEnd(utils.normalizeDate(endTime, self.config['defaults']['timezone']))
    event.setLocation(utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_LOCATION))
    description, links = utils.Soup.tokenizeElemAt(
        soup, self.SELECTOR_EVENT_DESCRIPTION, base=eventURL)
    links |= set(utils.Soup.getLinksAt(soup, self.SELECTOR_EVENT_LINKS))
    links = map(utils.normalizeURL(base=eventURL), links)
    links = set(filter(None, links))
    event.setDescription(description)
    event.setLinks(links)
    return event
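# _parseEventTime is not shown in this listing; a minimal hypothetical sketch
# of its contract, assuming strptime-style FORMAT_* patterns (names invented
# for illustration):
def _parseEventTime(self, singleStr, startStr, endStr):
    if singleStr:
        # A single timestamp means no explicit end time.
        return datetime.strptime(singleStr, self.FORMAT_TIMESINGLE), None
    start = datetime.strptime(startStr, self.FORMAT_TIMESTART) if startStr else None
    end = datetime.strptime(endStr, self.FORMAT_TIMEEND) if endStr else None
    return start, end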
if __name__=="__main__": if len(sys.argv)<3: print("Not enough arguments!!!") print("python run_naf.py {CORPUS/PATH} {FILENAME.TSV}") sys.exit(1) corpus=sys.argv[1] myFile=sys.argv[2] if not os.path.isfile(myFile): myConll="" corpus=corpus.strip('/') for file in os.listdir(corpus): if not file.endswith(".xml") and not file.endswith(".naf"): continue print(file) filename=corpus + '/' + file myXml, entities, mentions=utils.naf2inlineEntities(filename, True) da=dis_agdistis.disambiguate(myXml, "agdistis") for agd_entity in da: offset=str(agd_entity["start"]) agd_link=utils.normalizeURL(str(agd_entity["disambiguatedURL"])) goldlink=utils.checkRedirects(utils.normalizeURL(str(entities[offset]))) id=file + offset v1,v2=utils.getRanks(goldlink, agd_link) mention=mentions[offset] myConll+="%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (id, goldlink, agd_link, corpus, v1, v2, mention) w=open(myFile, "w") w.write(myConll) p, r, f1=utils.computeStats(myFile) print("Precision: %f, Recall: %f, F1-value: %f" % (p, r, f1))
def getEventList(self):
    url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
    lastPosted = datetime.now()
    searchBacklog = timedelta(days=self.config['search_backlog'])
    crawlUntil = lastPosted - searchBacklog
    page = 0
    events = set()
    while lastPosted > crawlUntil:
        reqData = self.LIST_REQUEST_DATA.copy()
        reqData['page'] = page
        eventList = self.requester.fetchURL(url, method='POST', data=reqData)
        if not eventList:
            break
        res = json.loads(eventList)
        if (not isinstance(res, list)
                or self._getHTMLInsertEntry(res) is None):
            logger.error(f'Malformed response from {url}: {eventList}')
            break
        res = self._getHTMLInsertEntry(res).get('data', '')
        soup = BeautifulSoup(res, 'html.parser')
        for event in soup.select(self.SELECTOR_EVENT):
            link = utils.Soup.getLinkAt(event, 'a')
            if link:
                normalized = utils.normalizeURL(base=url, url=link)
                if normalized:
                    events.add(normalized)
        posted = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_POSTED)
        if posted:
            try:
                postedTime = datetime.strptime(posted, self.FORMAT_POSTED)
                lastPosted = min(lastPosted, postedTime)
            except ValueError:
                # f-prefix was missing in the original, so the placeholder
                # was logged literally.
                logger.warning(f'Unable to parse posted time {posted}')
                lastPosted += timedelta(days=1)
        if not soup.select(self.SELECTOR_NEXT_PAGE):
            break
        page += 1

    updatesSince = datetime.now() - searchBacklog
    requestTimestamp = int(updatesSince.timestamp())
    url = self.UPDATED_URL.format(requestTimestamp)
    url = utils.normalizeURL(base=self.DOMAIN, url=url)
    updatedEventList = self.requester.fetchURL(url, json=True)
    if updatedEventList is not None:
        for eventId in updatedEventList:
            url = self.EVENT_URL.format(eventId)
            events.add(utils.normalizeURL(base=self.DOMAIN, url=url))
    else:
        logger.warning('Cannot fetch recently updated events!')
    return events
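# _getHTMLInsertEntry is referenced above but not shown. The endpoint looks
# like a Drupal-style AJAX view that returns a list of command objects, one
# of which carries the rendered HTML under 'data'. A hypothetical sketch:
def _getHTMLInsertEntry(self, res):
    for entry in res:
        if isinstance(entry, dict) and entry.get('command') == 'insert':
            return entry
    return None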
def run(corpus, myFile, topic, aggregatedTopics):
    if not os.path.isfile(myFile) or aggregatedTopics:
        entitiesNumber = 0
        with open(corpus, "r") as myCorpus:
            currentArticle = ""
            currentTopic = ""
            myConll = ""
            openEntity = False
            articleEntities = 0
            registeredEntities = 0
            relevant = False
            offset = 0
            tid = 1
            allTokens = {}
            goldEntities = {}
            goldMentions = {}
            testB = False  # guard: referenced below before the first -DOCSTART-
            print("Topic is %s" % topic)
            for line in myCorpus:
                if line.startswith("-DOCSTART-"):
                    if 'testb' in line:
                        if currentArticle != "":
                            if openEntity:
                                openEntity = False
                                allTokens[str(tid - 1)]['text'] += '</entity>'
                                registeredEntities += 1
                            if registeredEntities < articleEntities:
                                print(registeredEntities, articleEntities)
                                sys.exit(0)
                            if not aggregatedTopics:
                                # Per-article mode: disambiguate the finished
                                # article before starting the next one.
                                articleEntities = 0
                                registeredEntities = 0
                                myXml = utils.composeText(allTokens)
                                da = dis_agdistis.disambiguate(myXml, "agdistis")
                                for agd_entity in sorted(da, key=lambda k: k['start']):
                                    offset = str(agd_entity["start"])
                                    agd_link = utils.normalizeURL(agd_entity["disambiguatedURL"])
                                    goldlink = utils.checkRedirects(
                                        utils.normalizeURL(goldEntities[offset]))
                                    id = currentArticle + offset
                                    mention = goldMentions[offset]
                                    v1, v2 = utils.getRanks(goldlink, agd_link)
                                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                                        id, goldlink, agd_link, currentTopic, v1, v2, mention)
                        testB = True
                        line = line.strip()
                        articleInfo = line.split('\t')
                        currentTopic = articleInfo[1]
                        if aggregatedTopics and topic != currentTopic:
                            relevant = False
                            currentArticle = ''
                        else:
                            currentArticle = articleInfo[0]
                            relevant = True
                            print("Article %s has topic %s." % (currentArticle, currentTopic))
                        if not aggregatedTopics:
                            offset = 0
                            tid = 1
                            allTokens = {}
                            goldEntities = {}
                            goldMentions = {}
                    else:
                        testB = False
                elif testB and relevant:
                    tokenInfo = line.split('\t')
                    text = tokenInfo[0]
                    if tokenInfo[1].strip() != 'I' and openEntity is True:
                        openEntity = False
                        allTokens[str(tid - 1)]['text'] += '</entity>'
                        registeredEntities += 1
                    if tokenInfo[1].strip() == 'B':
                        goldMentions[str(offset)] = tokenInfo[2].strip()
                        entitiesNumber += 1
                        articleEntities += 1
                        if tokenInfo[3] == '--NME--':
                            goldEntities[str(offset)] = tokenInfo[3]
                        else:
                            goldEntities[str(offset)] = tokenInfo[4]
                        text = '<entity>' + text
                        # Single-token mentions open and close in one step.
                        if tokenInfo[0].strip() == tokenInfo[2].strip():
                            text += '</entity>'
                            registeredEntities += 1
                        else:
                            openEntity = True
                    allTokens[str(tid)] = {'text': text, 'offset': str(offset)}
                    offset += len(tokenInfo[0]) + 1
                    tid += 1
            if openEntity:
                allTokens[str(tid - 1)]['text'] += '</entity>'
                registeredEntities += 1
            if registeredEntities < articleEntities:
                print(registeredEntities, articleEntities)
                sys.exit(0)
            if currentArticle or aggregatedTopics:
                if aggregatedTopics:
                    currentTopic = topic
                myXml = utils.composeText(allTokens)
                da = dis_agdistis.disambiguate(myXml, "agdistis")
                for agd_entity in sorted(da, key=lambda k: k['start']):
                    offset = str(agd_entity["start"])
                    agd_link = utils.normalizeURL(agd_entity["disambiguatedURL"])
                    goldlink = utils.checkRedirects(utils.normalizeURL(goldEntities[offset]))
                    mention = goldMentions[offset]
                    id = currentArticle + offset
                    v1, v2 = utils.getRanks(goldlink, agd_link)
                    print(v1, v2)
                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                        id, goldlink, agd_link, currentTopic, v1, v2, mention)
            print(entitiesNumber)
        with open(myFile, "a") as w:
            w.write(myConll)
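# Hedged sketch of the corpus lines run() consumes, with columns inferred
# from the indices used above: an article header split into id and topic,
# then token lines of token, B/I flag, mention surface form, link (or
# --NME--), gold URL. The concrete values below are invented:
#   -DOCSTART- (947testb)\tSOCCER
#   Japan\tB\tJapan\t--NME--
#   Japan\tB\tJapan\tJapan\thttp://en.wikipedia.org/wiki/Japan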
def _getEventDetails(self, eventURL):
    details = self.requester.fetchURL(f'{eventURL}/xml', errorOnCode=(403, ))
    if not details:
        return None
    soup = BeautifulSoup(details, 'xml')
    if utils.Soup.getTextAt(soup, self.SELECTOR_TYPE) != 'event':
        return set()
    title = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE)
    location = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_LOCATION)
    description = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_DESCRIPTION)
    links = set()
    if description:
        descSoup = BeautifulSoup(description, 'html.parser')
        description, links = utils.HTMLToText.tokenizeSoup(descSoup, base=eventURL)
    relatedLinks = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_LINKS)
    if relatedLinks:
        links |= set(relatedLinks)
    urls = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_URLS)
    if urls:
        links |= set(urls)
    links = map(utils.normalizeURL(base=eventURL), links)
    links = set(filter(None, links))
    audience = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_AUDIENCE)
    if audience:
        audience = ', '.join(audience)
    extras = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_EXTRAS)
    if extras:
        def _cleanExtra(xtr):
            return xtr.replace('_', ' ').capitalize()
        extras = ', '.join(map(_cleanExtra, extras))
    # Need to fetch the human-readable page because the XML does not contain
    # the workflow status.
    status = None
    details2 = self.requester.fetchURL(eventURL, errorOnCode=(403, ))
    # The HTML tree is sometimes broken, which confuses html.parser; lxml
    # copes better.
    soup2 = BeautifulSoup(details2 or '', 'lxml')
    if details2:
        statusBlock = self._getBlockFromList(
            soup2, self.SELECTOR_EVENT_HTMLMETADATA, 'status')
        if statusBlock:
            statusElem = self._getBlockDetail(statusBlock, 'workflow status')
            status = utils.Soup.getElemText(statusElem)
            if status:  # guard: the detail element may be missing
                status = status.lower()
    # Parse dates from the HTML, because timezones and recurring events are
    # unreliable in the XML: some rrules do not generate the event instances
    # listed in the HTML, e.g. http://hg.gatech.edu/node/623952.
    htmlTimes = soup2.select(self.SELECTOR_EVENT_HTMLTIMES)
    events = set()
    for n, timeEntry in enumerate(htmlTimes):
        startTime, endTime = self._parseHTMLEventTime(timeEntry)
        if not startTime:
            logger.error(f'Cannot parse event time for {eventURL}: '
                         f'{utils.Soup.getElemText(timeEntry)}')
            continue
        rawEvent = RawEvent(self.IDENTIFIER, f'{eventURL}#{n}', title, startTime)
        rawEvent.setEnd(endTime)
        rawEvent.setDescription(description)
        rawEvent.setLinks(links)
        if location:
            rawEvent.setLocation(location)
        if audience:
            rawEvent.setAudience(audience)
        if extras:
            rawEvent.setExtras(extras)
        if status:
            rawEvent.setStatus(status)
        events.add(rawEvent)
    return events