class GerbilNIFCollection:
    def __init__(self, context: str, mention: str):
        self.collection = NIFCollection()
        self.context = context
        self.mention = mention
        self.phrases = self.collection.add_context(self.context, self.mention)

    @property
    def turtle(self) -> str:
        return self.collection.dumps(format='turtle')
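# A minimal usage sketch of the GerbilNIFCollection wrapper above, assuming the
# class and its pynif import are in scope. The context URI and mention text are
# illustrative placeholders, not values from the original code.
doc = GerbilNIFCollection('http://example.org/doc/1', 'Barack Obama visited Berlin.')
print(doc.turtle)  # Turtle serialization of the single-context collection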
def gerbil_handler():
    nif = NIFCollection.loads(request.data.decode('utf-8'))
    hid = request.args['handler_id']

    def extract_string() -> Tuple[str, str]:
        # Return the (context URI, text) pair from the nif:isString triple.
        for triple in nif.triples():
            if 'isString' in triple[1]:
                return str(triple[0]), str(triple[2])

    if 'annotator' not in request.args:
        with lck:
            queries[hid]['test']['context'], queries[hid]['test']['query'] = extract_string()
        a = _wait(lambda: queries[hid]['test']['answer'])
        with lck:
            queries[hid]['test']['answer'] = None
        return a
    else:
        with lck:
            an = queries[hid]['experiment']['annotators'][request.args['annotator']]
            an['context'], an['query'] = extract_string()
        a = _wait(lambda: an['answer'])
        with lck:
            an['answer'] = None
        return a
def main(mode, infile, outfile, format):
    """
    Conversion utility for NIF files.

    This converts the identifiers used to annotate mentions in documents
    across knowledge bases. For instance, the following will convert a NIF
    file with DBpedia identifiers to a NIF file with Wikidata identifiers:

    nifconverter --mode dbr:wd -i dbpedia_nif.ttl -o wikidata_nif.ttl
    """
    converter = registered_converters.get(mode)
    if converter is None:
        raise click.BadParameter('Invalid mode. Supported modes are: ' + get_allowed_modes())
    translator = NIFTranslator(converter)
    with click.open_file(infile) as f:
        nif = NIFCollection.loads(f.read())
    translator.translate_collection(nif)
    with click.open_file(outfile, 'w') as out:
        out.write(nif.dumps())
def train_classifier(collection, bow, pagerank, dataset, output, max_iter):
    """
    Trains a tag classifier on a NIF dataset.
    """
    if output is None:
        output = 'trained_classifier.pkl'
    b = BOWLanguageModel()
    b.load(bow)
    graph = WikidataGraph()
    graph.load_pagerank(pagerank)
    tagger = Tagger(collection, b, graph)
    d = NIFCollection.load(dataset)
    clf = SimpleTagClassifier(tagger)
    max_iter = int(max_iter)

    parameter_grid = []
    for max_distance in [50, 75, 150, 200]:
        for similarity, beta in [('one_step', 0.2), ('one_step', 0.1), ('one_step', 0.3)]:
            for C in [10.0, 1.0, 0.1]:
                for smoothing in [0.8, 0.6, 0.5, 0.4, 0.3]:
                    parameter_grid.append({
                        'nb_steps': 4,
                        'max_similarity_distance': max_distance,
                        'C': C,
                        'similarity': similarity,
                        'beta': beta,
                        'similarity_smoothing': smoothing,
                    })

    best_params = clf.crossfit_model(d, parameter_grid, max_iter=max_iter)
    print('#########')
    print(best_params)
    clf.save(output)
def processQueryNif():
    print("inside")
    content_format = request.headers.get('Content') or 'application/x-turtle'
    nif_body = request.data.decode("utf-8")
    print(nif_body)
    try:
        nif_doc = NIFCollection.loads(nif_body, format='turtle')
        for context in nif_doc.contexts:
            vectors = v.vectorise(context.mention)
            entities = p.link(vectors)
            s = set()
            for idx, entityarr in entities.items():
                for ent in entityarr:
                    s.add(ent[0])
            for entity in s:
                context.add_phrase(
                    beginIndex=0,
                    endIndex=1,
                    taIdentRef='http://www.wikidata.org/entity/' + entity)
        resp = Response(nif_doc.dumps())
        print(nif_doc.dumps())
        resp.headers['content-type'] = content_format
        return resp
    except Exception as e:
        print(e)
        return ''
def main(converter, target, infile, outfile, format):
    """
    Conversion utility for NIF files.

    This converts the identifiers used to annotate mentions in documents
    across knowledge bases. For instance, the following will convert a NIF
    file with DBpedia identifiers to a NIF file with Wikidata identifiers,
    using the default converter (which uses the DBpedia SameThing service):

    nifconverter -i dbpedia_nif.ttl -o wikidata_nif.ttl
    """
    converter_impl = registered_converters.get(converter)
    if converter_impl is None:
        raise click.BadParameter(
            'Invalid converter "{}". Supported converters are: {}'.format(
                converter, get_available_converters()))
    translator = NIFTranslator(converter_impl(target_prefix=target))
    with click.open_file(infile) as f:
        nif = NIFCollection.loads(f.read())
    translator.translate_collection(nif)
    with click.open_file(outfile, 'w') as out:
        out.write(nif.dumps())
def annotation2nif(collection_name, tweet):
    collection = NIFCollection(uri=collection_name)
    context_name = collection_name + str(tweet.idTweet)
    context = collection.add_context(uri=context_name, mention=tweet.text)
    if len(tweet.mentions) > 0:
        for i, mention in enumerate(tweet.mentions):
            if tweet.entities[i] != 'NIL':
                entity = tweet.entities[i].replace('dbr:', 'http://dbpedia.org/resource/')
            else:
                entity = 'http://optic.ufsc.br/resource/NIL/'
            context.add_phrase(beginIndex=int(mention[2]),
                               endIndex=int(mention[3]),
                               annotator='http://optic.ufsc.br',
                               taIdentRef=entity)
    nif = collection.dumps(format='turtle')
    return nif
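# A minimal, hypothetical call to annotation2nif above. The Tweet object is
# stood in by SimpleNamespace; the mention tuple layout (begin offset at index
# 2, end offset at index 3) is inferred from the indexing in the function and
# may differ from the real preprocessing output.
from types import SimpleNamespace

fake_tweet = SimpleNamespace(
    idTweet=42,
    text='Obama visited Berlin',
    mentions=[('Berlin', None, 14, 20)],
    entities=['dbr:Berlin'])
print(annotation2nif('http://optic.ufsc.br/', fake_tweet))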
def nif2json(lang="en"):
    paths = ["./VoxEL/rVoxEL-{}.ttl", "./VoxEL/sVoxEL-{}.ttl"]
    prefix = ["r", "s"]
    for path, p in zip(paths, prefix):
        with open(path.format(lang)) as f:
            data = NIFCollection.loads(f.read(), format='turtle')
        out = nif2dict(data)
        with open("./{}_{}.json".format(p, lang), "w") as f:
            json.dump(out, f, indent=4)
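# nif2dict is not shown above; a plausible sketch, assuming it flattens each
# context into a dict of its text and annotated phrases. Field names are
# illustrative, not the actual VoxEL conversion code.
def nif2dict(collection):
    docs = []
    for context in collection.contexts:
        docs.append({
            'uri': context.uri,
            'text': context.mention,
            'phrases': [{'begin': ph.beginIndex,
                         'end': ph.endIndex,
                         'entity': ph.taIdentRef}
                        for ph in context.phrases],
        })
    return docs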
def nif_2_annotations(nif_collection):
    annotations = defaultdict(list)
    temp_annotations = defaultdict(list)
    keys = []
    parsed_collection = NIFCollection.loads(nif_collection, format='turtle')
    for context in parsed_collection.contexts:
        for phrase in context.phrases:
            id_annotation = phrase.context.rsplit('/', 1)[-1]
            entity = phrase.taIdentRef
            keys.append(int(id_annotation))
            temp_annotations[int(id_annotation)].append(entity)
    keys.sort()
    for key in keys:
        annotations[key] = temp_annotations[key]
    return annotations
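# Hypothetical usage of nif_2_annotations above, assuming the Turtle input uses
# context URIs whose last path segment is a numeric annotation id
# (e.g. .../annotations/17), as the int() conversion requires. The file name is
# a placeholder.
with open('annotations.ttl') as f:
    annotations = nif_2_annotations(f.read())
for id_annotation, entities in annotations.items():
    print(id_annotation, entities)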
def d2kb():
    data = request.data
    data = data.rstrip()
    data = data.lstrip()
    nif_post = NIFCollection.loads(data.decode('utf-8'), format='turtle')
    mentions = []
    for context in nif_post.contexts:
        tweet = Tweet()
        tweet.mentions = []
        tweet.idTweet = context.uri
        tweet.text = context.mention
        try:
            for phrase in context.phrases:
                single_mention = (phrase.mention, phrase.beginIndex, phrase.endIndex)
                mentions.append(single_mention)
        except Exception:
            print('no mentions')

    if len(mentions) > 0:
        if VERBOSE == 'yes':
            print('\n\n:::: PREPROCESSING ::::\n\n')
        start = time.time()
        tweet = preprocessing_d2kb(tweet, mentions, VERBOSE)
        end = time.time()
        if VERBOSE == 'yes':
            print('Running time: {}'.format(end - start))

        if VERBOSE == 'yes':
            print('\n\n:::: ENTITY SELECTION ::::\n\n')
        start = time.time()
        tweet.candidates = select_candidates(tweet, vocab2idx, TYPE, MAX, BOOST, VERBOSE)
        end = time.time()
        if VERBOSE == 'yes':
            print('Running time: {}'.format(end - start))

        if VERBOSE == 'yes':
            print('\n\n:::: DISAMBIGUATION ::::\n\n')
        start = time.time()
        tweet.entities = disambiguate_mentions(tweet, THRESHOLD, model, device,
                                               vocab2idx, WS, EXTRA, VERBOSE)
        end = time.time()
        if VERBOSE == 'yes':
            print('Running time: {}'.format(end - start))

    collection_name = "http://optic.ufsc.br/"
    nif = annotation2nif(collection_name, tweet)
    return nif
def nif_api(*args, **kwargs):
    content_format = request.headers.get('Content') or 'application/x-turtle'
    content_type_to_format = {
        'application/x-turtle': 'turtle',
        'text/turtle': 'turtle',
    }
    nif_body = request.body.read()
    nif_doc = NIFCollection.loads(nif_body)
    for context in nif_doc.contexts:
        logger.debug(context.mention)
        mentions = classifier.create_mentions(context.mention)
        classifier.classify_mentions(mentions)
        for mention in mentions:
            mention.add_phrase_to_nif_context(context)
    response.set_header('content-type', content_format)
    return nif_doc.dumps()
def setUpClass(cls):
    cls.testdir = os.path.dirname(os.path.abspath(__file__))

    # Load dummy bow
    bow_fname = os.path.join(cls.testdir, 'data/sample_bow.pkl')
    cls.bow = BOWLanguageModel()
    cls.bow.load(bow_fname)

    # Load dummy graph
    graph_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.npz')
    pagerank_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.pgrank.npy')
    cls.graph = WikidataGraph()
    cls.graph.load_from_matrix(graph_fname)
    cls.graph.load_pagerank(pagerank_fname)

    # Load dummy profile
    cls.profile = IndexingProfile.load(
        os.path.join(cls.testdir, 'data/all_items_profile.json'))

    # Setup solr index (TODO delete this) and tagger
    cls.tf = TaggerFactory()
    cls.collection_name = 'wd_test_collection'
    try:
        cls.tf.create_collection(cls.collection_name)
    except CollectionAlreadyExists:
        pass
    cls.tf.index_stream(
        cls.collection_name,
        WikidataDumpReader(
            os.path.join(cls.testdir, 'data/sample_wikidata_items.json.bz2')),
        cls.profile)
    cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph)

    # Load NIF dataset
    cls.nif = NIFCollection.load(
        os.path.join(cls.testdir, 'data/five-affiliations.ttl'))

    cls.classifier = SimpleTagClassifier(cls.tagger,
                                         max_similarity_distance=10,
                                         similarity_smoothing=2)
# ******************************** #
# Start OPTIC
count = 0

# Read directory with tweets to be annotated
inputs = set()
for nif_temp in os.listdir(INPUT_PATH):
    # Initially, we work only with the RDF Turtle standard
    if fnmatch.fnmatch(nif_temp, '*.ttl'):
        inputs.add(nif_temp)

for nif_input in inputs:
    nif_file = ''
    with open(INPUT_PATH + nif_input, 'r') as f:
        nif_file = f.read()
    nif_post = NIFCollection.loads(nif_file, format='turtle')
    for context in nif_post.contexts:
        tweet = Tweet()
        tweet.idTweet = context.uri
        tweet.text = context.mention
        tweet.mentions = []
        # A2KB mode
        # TODO
        if MODE == 'a2kb':
            continue
        # D2KB mode
        else:
            mentions = []
            try:
from pynif import NIFCollection
import sys, os, json, re

d = json.loads(open('input/webqsp.test.entities.with_classes.json').read())

collection = NIFCollection(uri="http://sda.tech/webquestions")
for item in d:
    if not item['utterance']:
        continue
    uid = item['question_id']
    context = collection.add_context(uri="http://sda.tech/webquestions/%s" % uid,
                                     mention=item['utterance'])
    beg = 0
    for entity in item['entities']:
        if entity is None:
            continue
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                           beginIndex=beg,
                           endIndex=beg + 1)
        beg += 1

generated_nif = collection.dumps(format='turtle')
f = open('webqsp.test.nif', 'w')
f.write(generated_nif)
f.close()
from pynif import NIFCollection
import sys, os, json, re

d = json.loads(open('test.json').read())

collection = NIFCollection(uri="http://sda.tech/lcquadv2")
for item in d:
    if not item['question']:
        continue
    uid = item['uid']
    context = collection.add_context(uri="http://sda.tech/lcquadv2/%s" % uid,
                                     mention=item['question'])
    entities = re.findall(r'wd:([Q][0-9]*)', item['sparql_wikidata'])
    beg = 0
    for entity in entities:
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                           beginIndex=beg,
                           endIndex=beg + 1)
        beg += 1

generated_nif = collection.dumps(format='turtle')
f = open('lcquad2.0.test.nif', 'w')
f.write(generated_nif)
f.close()
from pynif import NIFCollection
import sys, os, json, re

gold = []
f = open('annotated_wd_data_train.txt')
for line in f.readlines():
    line = line.strip()
    s, p, o, q = line.split('\t')
    gold.append((s, q))

collection = NIFCollection(uri="http://sda.tech/simplequestions")
for idx, item in enumerate(gold):
    question = item[1]
    entity = item[0]
    uid = idx
    context = collection.add_context(uri="http://sda.tech/simplequestions/%s" % uid,
                                     mention=question)
    context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                       beginIndex=0,
                       endIndex=1)

generated_nif = collection.dumps(format='turtle')
f = open('simplequestions.train.nif', 'w')
f.write(generated_nif)
f.close()
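# Optional sanity check, not part of the original script: reload the file just
# written and count its contexts and phrases. It uses only pynif calls that
# already appear in the snippets above (NIFCollection.load, .contexts, .phrases).
reloaded = NIFCollection.load('simplequestions.train.nif')
n_phrases = sum(len(context.phrases) for context in reloaded.contexts)
print('contexts:', len(reloaded.contexts), 'phrases:', n_phrases)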
def setUpClass(cls):
    testdir = os.path.dirname(os.path.abspath(__file__))
    cls.dbpedia_nif = NIFCollection.load(
        os.path.join(testdir, 'data/sample_dbpedia.ttl'))
    cls.wikipedia_nif = NIFCollection.load(
        os.path.join(testdir, 'data/sample_wikipedia.ttl'))
def __init__(self, context: str, mention: str):
    self.collection = NIFCollection()
    self.context = context
    self.mention = mention
    self.phrases = self.collection.add_context(self.context, self.mention)
from pynif import NIFCollection
import sys, os, json, re

d = json.loads(open('input/webqsp.test.entities.with_classes.json').read())

collection = NIFCollection(uri="http://lc-quad2.sda.tech")
for item in d:
    if not item['utterance']:
        continue
    uid = item['question_id']
    context = collection.add_context(uri="http://webqsp.sda.tech/%s" % uid,
                                     mention=item['utterance'])
    for entity in item['entities']:
        if entity is None:
            continue
        print(entity)
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                           beginIndex=0,
                           endIndex=1)

generated_nif = collection.dumps(format='turtle')
f = open('webqsp.test.nif', 'w')
f.write(generated_nif)
f.close()