def cacheRedirects():
    """Cache DBpedia wikiPageRedirects pairs in Redis under 'rdr:' keys."""
    g = Graph()
    g.parse(path + redirectsFile, format="n3")
    print("File loaded in rdflib graph")
    for s, p, o in g:
        if str(p) == "http://dbpedia.org/ontology/wikiPageRedirects":
            k = 'rdr:%s' % utils.normalizeURL(str(s))
            v = utils.normalizeURL(str(o))
            rds.set(k, v)
def cacheDisambiguations():
    """Cache DBpedia wikiPageDisambiguates targets in Redis under 'dis:' keys."""
    g = Graph()
    g.parse(path + disambiguationFile, format='n3')
    print("File loaded in rdflib graph")
    predicate = URIRef("http://dbpedia.org/ontology/wikiPageDisambiguates")
    subjects = set(g.subjects(predicate=predicate))
    for subject in subjects:
        # Collect all disambiguation targets for this subject page.
        v = list(map(lambda x: utils.normalizeURL(str(x)),
                     g.objects(subject, predicate)))
        k = 'dis:%s' % utils.normalizeURL(subject)
        rds.set(k, v)
def load_article_from_xml_files(location, collection='msnbc'):
    """
    Load a dataset in XML format.
    """
    news_items = set()
    for filename in glob.glob(location):
        parser = etree.XMLParser(recover=True)
        xml = etree.parse(filename, parser)
        news_item_obj = classes.NewsItem(identifier=filename,
                                         collection=collection)
        for entity_mention in xml.iterfind('/ReferenceInstance'):
            mention = entity_mention.find('SurfaceForm').text.strip()
            offset = int(entity_mention.find('Offset').text.strip())
            length = int(entity_mention.find('Length').text.strip())
            raw_gold = entity_mention.find('ChosenAnnotation').text
            gold_link = utils.getLinkRedirect(utils.normalizeURL(raw_gold))
            if utils.computePR(gold_link) == 0:
                gold_link = None
            entity_obj = classes.EntityMention(begin_index=offset,
                                               end_index=offset + length,
                                               mention=mention,
                                               gold_link=gold_link)
            news_item_obj.entity_mentions.append(entity_obj)
        news_items.add(news_item_obj)
    return news_items
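# Illustration only: the element names below are the ones load_article_from_xml_files
# reads (ReferenceInstance, SurfaceForm, Offset, Length, ChosenAnnotation); the
# enclosing root element and the concrete values are assumptions about the
# MSNBC-style XML files, not taken from the actual dataset.
#
#   <ReferenceProblem>
#     <ReferenceInstance>
#       <SurfaceForm>home depot</SurfaceForm>
#       <Offset>1125</Offset>
#       <Length>10</Length>
#       <ChosenAnnotation>http://en.wikipedia.org/wiki/Home_Depot</ChosenAnnotation>
#     </ReferenceInstance>
#     ...
#   </ReferenceProblem>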
def cachePR():
    """Cache precomputed PageRank scores in Redis under 'pr:' keys."""
    with open(path + pagerankFile, 'r') as lines:
        for line in lines:
            s, o = line.split()
            k = 'pr:%s' % utils.normalizeURL(s)
            v = round(float(o), 4)
            rds.set(k, v)
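# Minimal read-back sketch (assumption): how the 'rdr:' and 'pr:' keys cached above
# can be looked up again. The project's real helpers (utils.getLinkRedirect and
# utils.computePR, used by the loaders) may differ in details such as response
# decoding or default values; these stand-ins only illustrate the key naming scheme.
def _lookup_redirect(link, client=None):
    """Follow a cached 'rdr:' redirect if present, else return the link unchanged."""
    client = client or rds
    target = client.get('rdr:%s' % link)
    return target.decode('utf-8') if target else link


def _lookup_pagerank(link, client=None):
    """Return the cached 'pr:' PageRank score for a link, or 0.0 if unknown."""
    client = client or rds
    score = client.get('pr:%s' % link)
    return float(score) if score else 0.0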
def load_article_from_nif_file(nif_file, limit=1000000, collection='wes2015'):
    """
    Load a dataset in NIF format.
    """
    g = Graph()
    g.parse(nif_file, format="n3")
    news_items = set()
    articles = g.query("""
        SELECT ?articleid ?date ?string
        WHERE {
            ?articleid nif:isString ?string .
            OPTIONAL { ?articleid <http://purl.org/dc/elements/1.1/date> ?date . }
        }
        LIMIT %d""" % limit)
    for article in articles:
        news_item_obj = classes.NewsItem(
            content=article['string'],
            identifier=article['articleid'],  # "http://yovisto.com/resource/dataset/iswc2015/doc/281#char=0,4239"
            dct=article['date'],
            collection=collection)
        query = """
            SELECT ?id ?mention ?start ?end ?gold
            WHERE {
                ?id nif:anchorOf ?mention ;
                    nif:beginIndex ?start ;
                    nif:endIndex ?end ;
                    nif:referenceContext <%s> .
                OPTIONAL { ?id itsrdf:taIdentRef ?gold . }
            }
            ORDER BY ?start""" % str(article['articleid'])
        qres_entities = g.query(query)
        for entity in qres_entities:
            gold_link = utils.getLinkRedirect(
                utils.normalizeURL(str(entity['gold'])))
            if gold_link.startswith('http://aksw.org/notInWiki'):
                gold_link = '--NME--'
            page_rank = utils.computePR(gold_link)
            entity_obj = classes.EntityMention(
                begin_index=int(entity['start']),
                end_index=int(entity['end']),
                mention=str(entity['mention']),
                gold_link=gold_link,
                gold_pr=page_rank)
            news_item_obj.entity_mentions.append(entity_obj)
        news_items.add(news_item_obj)
    return news_items
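# Usage sketch (assumption): the NIF file path below is a placeholder, and the
# attribute names on NewsItem / EntityMention are assumed to mirror the constructor
# arguments used above.
if __name__ == '__main__':
    items = load_article_from_nif_file('wes2015-dataset-nif.ttl',
                                       limit=100,
                                       collection='wes2015')
    for item in items:
        for entity_mention in item.entity_mentions:
            print(entity_mention.mention,
                  entity_mention.gold_link,
                  entity_mention.gold_pr)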