def get_semantic_associations(fn=None, limit=None):
    if not fn:
        verified_mappings = get_verified_mappings()
        semantic_associations = get_dbpedia_pairs_from_mappings(
            verified_mappings)
        semantic_associations = [URIRefify(p) for p in semantic_associations]
    else:
        semantic_associations = []
        with gzip.open(fn, 'rt') if fn.endswith('.gz') else open(fn) as f:
            # expects a file with one space separated pair of n3 encoded IRIs
            # per line
            r = csv.DictReader(
                f,
                delimiter=' ',
                doublequote=False,
                escapechar=None,
                quoting=csv.QUOTE_NONE,
            )
            assert r.fieldnames == ['source', 'target']
            for i, row in enumerate(r):
                if limit and i >= limit:
                    break
                source = from_n3(row['source'])
                target = from_n3(row['target'])
                semantic_associations.append((source, target))
    return semantic_associations

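# A minimal, hypothetical usage sketch for get_semantic_associations above;
# the file name and IRIs are invented for illustration. The input format is
# a header line "source target" followed by one space-separated pair of
# n3-encoded IRIs per line.
from rdflib import URIRef

def _demo_get_semantic_associations(path='associations.csv'):
    with open(path, 'w') as f:
        f.write('source target\n')
        f.write('<http://example.org/a> <http://example.org/b>\n')
    pairs = get_semantic_associations(path, limit=10)
    assert pairs == [(URIRef('http://example.org/a'),
                      URIRef('http://example.org/b'))]
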
def test_util_from_n3_expectliteralwithdatatypefrombool(self):
    s = 'true'
    res = util.from_n3(s)
    self.assertEqual(res, Literal(True))
    s = 'false'
    res = util.from_n3(s)
    self.assertEqual(res, Literal(False))

def test_util_from_n3_expectpartialidempotencewithn3(self):
    for n3 in ('<http://ex.com/foo>',
               '"foo"@de',
               # '"\\""',  # exception as '\\"' --> '"' by orig parser as well
               '"""multi\n"line"\nstring"""@en'):
        self.assertEqual(util.from_n3(n3).n3(), n3,
                         'from_n3(%(n3e)r).n3() != %(n3e)r' % {'n3e': n3})

def test_util_from_n3_expectsameasn3parser(self):
    def parse_n3(term_n3):
        ''' Disclaimer: Quick and dirty hack using the n3 parser. '''
        prepstr = ("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n"
                   "<urn:no_use> <urn:no_use> %s.\n" % term_n3)
        g = ConjunctiveGraph()
        g.parse(data=prepstr, format='n3')
        return [t for t in g.triples((None, None, None))][0][2]

    for n3 in (
            # "michel",  # won't parse in original parser
            # "_:michel",  # BNodes won't be the same
            '"michel"',
            '<http://example.org/schema>',
            '"michel"@fr',
            # '"michel"@fr^^xsd:fr',  # FIXME: invalid n3, orig parser will prefer datatype
            # '"true"^^xsd:boolean',  # FIXME: orig parser will expand xsd prefix
            '42',
            'true',
            'false',
            '"""multi\nline\nstring"""@en',
            '<http://ex.com/foo>',
            '"foo"@de',
            '"\\""@en',
            '"""multi\n"line"\nstring"""@en'):
        res, exp = util.from_n3(n3), parse_n3(n3)
        self.assertEqual(
            res, exp,
            'from_n3(%(n3e)r): %(res)r != parser.notation3: %(exp)r' % {
                'res': res, 'exp': exp, 'n3e': n3})

def test_util_from_n3_expectsameasn3parser(self):
    def parse_n3(term_n3):
        ''' Disclaimer: Quick and dirty hack using the n3 parser. '''
        prepstr = ("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n"
                   "<urn:no_use> <urn:no_use> %s.\n" % term_n3)
        g = ConjunctiveGraph()
        g.parse(data=prepstr, format='n3')
        return [t for t in g.triples((None, None, None))][0][2]

    for n3 in (
            # "michel",  # won't parse in original parser
            # "_:michel",  # BNodes won't be the same
            '"michel"',
            '<http://example.org/schema>',
            '"michel"@fr',
            # '"michel"@fr^^xsd:fr',  # FIXME: invalid n3, orig parser will prefer datatype
            # '"true"^^xsd:boolean',  # FIXME: orig parser will expand xsd prefix
            '42',
            'true',
            'false',
            '"""multi\nline\nstring"""@en',
            '<http://ex.com/foo>',
            '"foo"@de',
            '"\\""@en',
            '"""multi\n"line"\nstring"""@en'):
        res, exp = util.from_n3(n3), parse_n3(n3)
        self.assertEqual(
            res, exp,
            'from_n3(%(n3e)r): %(res)r != parser.notation3: %(exp)r' % {
                'res': res, 'exp': exp, 'n3e': n3})

def main():
    '''Parse args and handle options.'''
    parser = argparse.ArgumentParser(description='Object lister for Fedora 4.')

    # Path to the repo config (endpoint, relpath, credentials, and WebAC paths)
    parser.add_argument('-r', '--repo',
                        help='Path to repository configuration file.',
                        action='store',
                        required=True)

    # long mode to print more than just the URIs (name modeled after ls -l)
    parser.add_argument('-l', '--long',
                        help='Display additional information besides the URI',
                        action='store_true')

    parser.add_argument('-R', '--recursive',
                        help='List additional objects found by traversing '
                             'the given predicate(s)',
                        action='store')

    args = parser.parse_args()

    # configure logging
    with open('config/logging.yml', 'r') as configfile:
        logging_config = yaml.safe_load(configfile)
    logfile = 'logs/list.py.{0}.log'.format(
        datetime.utcnow().strftime('%Y%m%d%H%M%S'))
    logging_config['handlers']['file']['filename'] = logfile
    logging_config['handlers']['console']['stream'] = 'ext://sys.stderr'
    logging.config.dictConfig(logging_config)

    # Load required repository config file and create repository object
    with open(args.repo, 'r') as repoconfig:
        fcrepo = Repository(yaml.safe_load(repoconfig))
        logger.info('Loaded repo configuration from {0}'.format(args.repo))

    if args.recursive is not None:
        manager = namespaces.get_manager()
        args.predicates = [from_n3(p, nsm=manager)
                           for p in args.recursive.split(',')]
        logger.info(
            'Listing will traverse the following predicates: {0}'.format(
                ', '.join([p.n3() for p in args.predicates])))
    else:
        args.predicates = []

    for item_uri in sys.stdin:
        for (uri, graph) in fcrepo.recursive_get(
                item_uri.rstrip('\n'), traverse=args.predicates):
            if args.long:
                title = get_title_string(graph)
                print("{0} {1}".format(uri, title))
            else:
                print(uri)

def term_to_rdflib(term: str) -> Term:
    """Convert an HDT term into its RDFlib representation."""
    if term.startswith('?'):
        return Variable(term[1:])
    elif term.startswith("\""):
        return from_n3(term)
    else:
        return URIRef(term)

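# A minimal sketch of the three term shapes term_to_rdflib above handles;
# the example terms are invented, not from the original source.
from rdflib import Literal, URIRef, Variable

def _demo_term_to_rdflib():
    assert term_to_rdflib('?s') == Variable('s')
    assert term_to_rdflib('"chat"@fr') == Literal('chat', lang='fr')
    assert term_to_rdflib('http://example.org/a') == URIRef('http://example.org/a')
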
def to_rdflib_term(value):
    """Convert a N3 term to a RDFLib Term"""
    if value.startswith('http'):
        return URIRef(value)
    elif '"^^http' in value:
        index = value.find('"^^http')
        value = "{}<{}>".format(value[0:index + 3], value[index + 3:])
    return from_n3(value)

def ask_LITERAL(self, g, sections, var, prompt):
    answer = self.input(prompt)
    if answer.startswith('"') or answer.startswith("'"):
        return util.from_n3(answer)
    else:
        return Literal(answer, lang=var.langhint, datatype=var.datatypehint)

def lookup_blanks(self, g, bn, conn):
    """Recursively find any relevant blank nodes for the current lookup

    @param g The graph
    @param bn The blank node ID (starting _:)
    @param conn The database connection
    """
    cursor = conn.cursor()
    # select the triples on the special "<BLANK>" page whose subject is this
    # blank node (the subject is assumed to be stored in n3 form, matching
    # bn as given)
    cursor.execute("""select subject, property, object from triples
                      where page="<BLANK>" and subject=?""", (bn,))
    rows = cursor.fetchall()
    if rows:
        for s, p, o in rows:
            g.add((from_n3(s), from_n3(p), from_n3(o)))
            if o.startswith("_:"):
                self.lookup_blanks(g, o, conn)
    cursor.close()

def test_util_from_n3_not_escapes(self) -> None:
    strings = [
        "jörn",
        "j\\xf6rn",
    ]
    for string in strings:
        with self.subTest(f"{string}"):
            literal_str = str(util.from_n3(f'"{string}"'))
            self.assertEqual(literal_str, f"{string}")

def test_util_from_n3_not_escapes_xf(self) -> None:
    strings = [
        "j\\366rn",
        "\\",
        "\\0",
        "\\I",
    ]
    for string in strings:
        with self.subTest(f"{string}"):
            literal_str = str(util.from_n3(f'"{string}"'))
            self.assertEqual(literal_str, f"{string}")

def ask_NODE(self, g, sections, var, prompt):
    answer = self.input(prompt)
    if answer.startswith("c") and var.classhint and var.classhint in sections:
        s = sections[answer[1:].strip()]
        node = s.construct(g, sections, None)
        print("back to {}".format(self.name), file=self.out)
        return node
    elif answer:
        return util.from_n3(answer)
    else:
        return None

def uri_or_curie(arg: str):
    if arg and (arg.startswith('http://') or arg.startswith('https://')):
        # looks like an absolute HTTP URI
        return URIRef(arg)
    try:
        term = from_n3(arg, nsm=namespaces.get_manager())
    except KeyError:
        raise ArgumentTypeError(
            f'"{arg[:arg.index(":") + 1]}" is not a known prefix')
    if not isinstance(term, URIRef):
        raise ArgumentTypeError('must be a URI or CURIE')
    return term

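# Hypothetical argparse wiring for uri_or_curie above: used as an argument
# type, it accepts either an absolute HTTP(S) URI or a CURIE that the
# namespace manager can expand; unknown prefixes become argument errors.
# The flag name is invented.
import argparse

def _demo_uri_or_curie():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rdf-type', type=uri_or_curie)
    args = parser.parse_args(['--rdf-type', 'http://example.org/Thing'])
    return args.rdf_type  # URIRef('http://example.org/Thing')
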
def ask_NODE(self, g, sections, var, prompt):
    answer = self.input(prompt)
    if answer.startswith(
            "c") and var.classhint and var.classhint in sections:
        s = sections[answer[1:].strip()]
        node = s.construct(g, sections, None)
        print("back to {}".format(self.name), file=self.out)
        return node
    elif answer:
        return util.from_n3(answer)
    else:
        return None

def summarize(self, id):
    """Summarize an id

    @param id The id
    @return A RDFlib Graph or None if the ID is not found
    """
    g = ConjunctiveGraph()
    conn = sqlite3.connect(self.db)
    cursor = conn.cursor()
    cursor.execute(
        """select subject, property, object from triples where subject=?""",
        ("<%s%s>" % (BASE_NAME, unicode_escape(id)),))
    rows = cursor.fetchall()
    added = 0
    if rows:
        for s, p, o in rows:
            for f in FACETS:
                if added < 20 and str(p)[1:-1] == f["uri"]:
                    g.add((from_n3(s), from_n3(p), from_n3(o)))
                    added += 1
    conn.close()
    return g

def create_class_from_mapping(mapping, rdf_type=None):
    cls = type('csv', (pcdm.Item, ), {})
    for column, conf in mapping.items():
        if 'predicate' in conf:
            pred_uri = from_n3(conf['predicate'], nsm=nsm)
            if conf.get('uriref', False):
                add_property = rdf.object_property(column, pred_uri)
            else:
                if 'datatype' in conf:
                    datatype = from_n3(conf['datatype'], nsm=nsm)
                else:
                    datatype = None
                add_property = rdf.data_property(column, pred_uri,
                                                 datatype=datatype)
            add_property(cls)
    if rdf_type is not None:
        add_type = rdf.rdf_class(rdf_type)
        add_type(cls)
    return cls

def to_rdflib_term(value: str) -> Union[Literal, URIRef, Variable]:
    """Convert a N3 term to a RDFLib Term.

    Argument: A RDF Term in N3 format.

    Returns: The RDF Term in rdflib format.
    """
    if value.startswith('http'):
        return URIRef(value)
    elif '"^^http' in value:
        index = value.find('"^^http')
        value = f"{value[0:index+3]}<{value[index+3:]}>"
    return from_n3(value)

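# A small sketch of the rewrite to_rdflib_term above performs before
# delegating to from_n3: a typed literal whose datatype IRI lacks angle
# brackets ("42"^^http://...) is turned into valid N3 first. The values
# are illustrative.
from rdflib import Literal, URIRef
from rdflib.namespace import XSD

def _demo_to_rdflib_term():
    assert to_rdflib_term('http://example.org/a') == URIRef('http://example.org/a')
    lit = to_rdflib_term('"42"^^http://www.w3.org/2001/XMLSchema#integer')
    assert lit == Literal('42', datatype=XSD.integer)
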
def test_util_from_n3_expectpartialidempotencewithn3(self):
    for n3 in (
            "<http://ex.com/foo>",
            '"foo"@de',
            "<http://ex.com/漢字>",
            "<http://ex.com/a#あ>",
            # '"\\""',  # exception as '\\"' --> '"' by orig parser as well
            '"""multi\n"line"\nstring"""@en',
    ):
        self.assertEqual(
            util.from_n3(n3).n3(),
            n3,
            "from_n3(%(n3e)r).n3() != %(n3e)r" % {"n3e": n3},
        )

def __init__(self, repo, config):
    self.logger = logging.getLogger(
        __name__ + '.' + self.__class__.__name__)

    # Set configuration properties
    self.collection = pcdm.Collection.from_repository(
        repo, config.collection_uri)

    missing_fields = []
    try:
        self.file_path = os.path.join(config.data_dir,
                                      config.handler_options['FILE_PATH'])
    except KeyError:
        missing_fields.append('FILE_PATH')
    try:
        self.metadata_map = os.path.join(
            config.data_dir, config.handler_options['METADATA_MAP'])
    except KeyError:
        missing_fields.append('METADATA_MAP')

    if missing_fields:
        field_names = ', '.join(missing_fields)
        raise ConfigException(
            f'Missing required HANDLER_OPTIONS in batch configuration: '
            f'{field_names}')

    if 'RDF_TYPE' in config.handler_options:
        self.item_rdf_type = URIRef(
            from_n3(config.handler_options['RDF_TYPE'], nsm=nsm))
    else:
        self.item_rdf_type = None

    # load the metadata map and metadata file
    try:
        with open(self.metadata_map, 'r') as f:
            self.logger.info(
                f'Parsing the metadata map in {self.metadata_map}')
            self.mapping = yaml.safe_load(f)
        with open(config.batch_file, 'r') as f:
            self.logger.info(f'Reading metadata file {config.batch_file}')
            self.rows = [r for r in csv.DictReader(f)]
    except FileNotFoundError as e:
        raise ConfigException(e)

    key_column = get_flagged_column(self.mapping, 'key')
    if key_column is not None:
        self.length = len(set([line[key_column] for line in self.rows]))
    else:
        self.length = len(self.rows)

def get_semantic_associations(fn=None):
    if not fn:
        verified_mappings = get_verified_mappings()
        semantic_associations = get_dbpedia_pairs_from_mappings(
            verified_mappings)
        semantic_associations = [URIRefify(p) for p in semantic_associations]
    else:
        semantic_associations = []
        with open(fn) as f:
            # expects a file with one space separated pair of n3 encoded IRIs
            # per line
            r = csv.DictReader(
                f,
                delimiter=' ',
                doublequote=False,
                escapechar=None,
                quoting=csv.QUOTE_NONE,
            )
            assert r.fieldnames == ['source', 'target']
            for row in r:
                source = from_n3(row['source'])
                target = from_n3(row['target'])
                semantic_associations.append((source, target))
    return semantic_associations

def lookup(self, id):
    """Resolve a single id

    @param id The id
    @return A RDFlib Graph or None if the ID is not found
    """
    g = ConjunctiveGraph()
    g.bind("lemon", "http://lemon-model.net/lemon#")
    g.bind("owl", str(OWL))
    conn = sqlite3.connect(self.db)
    cursor = conn.cursor()
    cursor.execute(
        """select subject, property, object from triples where page=?""",
        (unicode_escape(id),))
    rows = cursor.fetchall()
    if rows:
        for s, p, o in rows:
            g.add((from_n3(s), from_n3(p), from_n3(o)))
            if o.startswith("_:"):
                self.lookup_blanks(g, o, conn)
        conn.close()
        return g
    else:
        return None

def test_util_from_n3_escapes(self) -> None:
    pairs = [
        ("\\t", "\t"),
        ("\\b", "\b"),
        ("\\n", "\n"),
        ("\\r", "\r"),
        ("\\f", "\f"),
        ('\\"', '"'),
        ("\\'", "'"),
        ("\\\\", "\\"),
        ("\\u00F6", "ö"),
        ("\\U000000F6", "ö"),
    ]
    for escaped, raw in pairs:
        with self.subTest(f"{escaped} => {raw}"):
            literal_str = str(util.from_n3(f'"{escaped}"'))
            self.assertEqual(literal_str, f"{raw}")

def list_values(self, offset, limit, prop):
    """
    Produce a list of all possible values for a particular property

    @param offset Where to start listing
    @param limit Number of values to list
    @param prop The property to list for
    @return A tuple consisting of a boolean indicating if there are more
    results and list of values that exist (as N3)
    """
    conn = sqlite3.connect(self.db)
    cursor = conn.cursor()
    if not offset:
        offset = 0
    cursor.execute("""SELECT DISTINCT object, obj_label, count(*)
                      FROM triples WHERE property=? AND head=0
                      GROUP BY oid ORDER BY count(*) DESC
                      LIMIT ? OFFSET ?""",
                   (prop, limit + 1, offset))
    row = cursor.fetchone()
    n = 0
    results = []
    while n < limit and row:
        obj, label, count = row
        n3 = from_n3(obj)
        if type(n3) == Literal:
            results.append({'link': obj,
                            'label': n3.value,
                            'count': count})
        elif type(n3) == URIRef:
            # u = self.unname(str(n3))
            # if u:
            #     s, _ = u
            if label:
                results.append({'link': obj,
                                'label': label,
                                'count': count})
            else:
                # results.append({'link': obj, 'label': s,
                #                 'count': count})
                # else:
                results.append({'link': obj,
                                'label': yuzu.displayer.DISPLAYER.apply(
                                    str(n3)),
                                'count': count})
        n += 1
        row = cursor.fetchone()
    conn.close()
    return n == limit, results

def srtsx_body2(r, vars):
    for v in vars:
        val = from_n3(r[vars.index(v)])
        if isinstance(val, URIRef):
            yield (" <binding name=\"%s\"><uri>%s</uri></binding>"
                   % (v, str(val)))
        elif isinstance(val, BNode):
            yield (" <binding name=\"%s\"><bnode>%s</bnode></binding>"
                   % (v, str(val)))
        elif val.language:
            yield (" <binding name=\"%s\"><literal xml:lang=\"%s\">"
                   "%s</literal></binding>" % (v, val.language, str(val)))
        elif val.datatype:
            yield (" <binding name=\"%s\"><literal datatype=\"%s\">"
                   "%s</literal></binding>" % (v, val.datatype, str(val)))
        else:
            yield (" <binding name=\"%s\"><literal>%s</literal></binding>"
                   % (v, str(val)))

def get_column_value(self, row, column):
    conf = self.mapping[column]
    value = row.get(column, None)
    if value is None:
        # this is a "dummy" column that is not actually in the
        # source CSV file but should be generated, either from
        # a format-string pattern or a static value
        if 'pattern' in conf:
            value = conf['pattern'].format(**row)
        elif 'value' in conf:
            value = conf['value']
    if conf.get('uriref', False):
        try:
            return URIRef(from_n3(value, nsm=nsm))
        except KeyError:
            # prefix not found, assume it is not a prefixed form
            return URIRef(value)
    else:
        return value

def srtsj_body2(r, vars):
    for v in vars:
        val = from_n3(r[vars.index(v)])
        if not val:
            # unbound variable: emit an empty entry and skip the
            # type checks below, which would fail on None
            yield ""
            continue
        if isinstance(val, URIRef):
            yield (" \"%s\": { \"type\": \"uri\", \"value\": \"%s\" }"
                   % (v, str(val)))
        elif isinstance(val, BNode):
            yield (" \"%s\": { \"type\": \"bnode\", \"value\": \"%s\" }"
                   % (v, str(val)))
        elif val.language:
            yield (" \"%s\": { \"type\": \"literal\", \"xml:lang\": "
                   "\"%s\", \"value\": \"%s\" }"
                   % (v, val.language, str(val)))
        elif val.datatype:
            yield (" \"%s\": { \"type\": \"literal\", \"datatype\": "
                   "\"%s\", \"value\": \"%s\" }"
                   % (v, val.datatype, str(val)))
        else:
            yield (" \"%s\": { \"type\": \"literal\", \"value\": \"%s\" }"
                   % (v, str(val)))

def yield_triples(file):
    total = 0
    blocks = []
    block_size = 5000
    parsed = 0
    print('-> starting yielding...')
    to_read = ""
    for cnt, line in enumerate(file):
        try:
            # We read every new line; if a triple spans multiple lines
            # (i.e. we can't find any matches), we keep reading,
            # accumulate the string, and test the accumulated string;
            # on a match we continue with an empty buffer
            if to_read == "":
                triple = SAGE_NTRIPLES_REGEX.findall(line)
                to_read += line
                if len(triple) > 0:
                    triple = triple[0]
                    blocks.append((from_n3(triple[0]), from_n3(triple[1]),
                                   from_n3(triple[2])))
                    parsed += 1
                    to_read = ""
                else:
                    to_read = to_read.replace('\n', '')
            else:
                to_read += line
                triple = SAGE_NTRIPLES_REGEX.findall(to_read)
                if len(triple) > 0:
                    triple = triple[0]
                    blocks.append((from_n3(triple[0]), from_n3(triple[1]),
                                   from_n3(triple[2])))
                    parsed += 1
                    to_read = ""
                else:
                    to_read = to_read.replace('\n', '')
            if cnt % block_size == 0:
                parsed = 0
                for t in blocks:
                    total += 1
                    yield __n3_to_str(t)
                blocks = []
        except Exception as err:
            print(err)
            print(line)
            exit(1)
    if len(blocks) > 0:
        for t in blocks:
            total += 1
            yield __n3_to_str(t)
    print('-> yielded {} triples'.format(total))

def parse_term(term: Union[str, List[str]],
               nsm: NamespaceManager = None) -> Term:
    """Parse a raw RDF term or a list of raw RDF Terms into the rdflib format.

    Args:
      * term: (List of) RDF Term(s) to parse (in n-triples format).
      * nsm: Namespace manager used to expand prefixed URIs.

    Returns: The parsed RDF term in rdflib format.
    """
    # case 1: a single RDF Term
    if type(term) == str:
        # the special keyword "none" is interpreted as "ottr:None"
        if term == "none":
            return OTTR_NONE
        # rdflib tends to see SPARQL variables as blank nodes,
        # so we need to handle them separately
        if term.startswith('?'):
            return Variable(term[1:])
        return from_n3(term, nsm=nsm)
    else:
        # case 2: a list of RDF terms
        return [parse_term(value, nsm=nsm) for value in term]

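# A hedged usage sketch for parse_term above; the terms are illustrative
# and nsm is left at its default. The "none" keyword case is omitted since
# OTTR_NONE is defined elsewhere in the original module.
from rdflib import Literal, Variable

def _demo_parse_term():
    assert parse_term('?x') == Variable('x')
    assert parse_term('"foo"@en') == Literal('foo', lang='en')
    # lists are parsed element-wise
    assert parse_term(['?x', '?y']) == [Variable('x'), Variable('y')]
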
def synset(context, offset, graph=None, extras=False, translate=True):
    """
    Return an RDF graph for a synset given an offset value

    @param context: A WNRDFContext object
    @param offset: The offset value in the database (Int)
    @param graph: If not None add to this graph
    @return The graph passed (or a new graph) containing the triples for
    this synset or None if the synset was not found
    """
    if graph is None:
        graph = make_graph()
    cursor = context.conn.cursor()
    not_translated = True
    if translate:
        c = context.mconn.cursor()
        c.execute("select internal from wn31r where release=?", (offset,))
        row = c.fetchone()
        if row:
            offset, = row
            not_translated = False
        else:
            not_translated = True
    # Read the synset information
    cursor.execute("select pos, lexdomainid, definition from synsets "
                   "where synsetid=?", (offset,))  # no index
    row = cursor.fetchone()
    if row is None:
        return None
    pos, lexdomainid, definition = row
    if not_translated:
        synset_uri = synset_name(context, offset, pos.upper())
    else:
        synset_uri = synset_name(context, offset, pos)
    graph.add((synset_uri, RDF.type, wn_ontology.Synset))
    graph.add((synset_uri, wn_ontology.part_of_speech,
               wn_ontology.term(context.postypes[pos])))
    graph.add((synset_uri, wn_ontology.lexical_domain,
               wn_ontology.term(context.lexdomainid_to_name[lexdomainid])))
    graph.add((synset_uri, wn_ontology.gloss,
               Literal(definition, lang=context.lang)))
    cursor.execute("select lemma, casedwordid from senses inner join words "
                   "on senses.synsetid=? and senses.wordid=words.wordid",
                   (offset,))
    for lemma, casedwordid in cursor.fetchall():
        if casedwordid:
            cursor.execute("select cased from casedwords where casedwordid=?",
                           (casedwordid,))
            cased_lemma, = cursor.fetchone()
            graph.add((synset_uri, RDFS.label,
                       Literal(cased_lemma, lang=context.lang)))
            graph.add((synset_uri, wn_ontology.synset_member,
                       entry_name(cased_lemma, pos)))
        else:
            graph.add((synset_uri, RDFS.label,
                       Literal(lemma, lang=context.lang)))
            graph.add((synset_uri, wn_ontology.synset_member,
                       entry_name(lemma, pos)))
    # Read the phrase type (if it exists)
    cursor.execute("select phrasetype from phrasetypes where synsetid=?",
                   (offset,))  # unindexed
    for phrasetype, in cursor.fetchall():
        graph.add((synset_uri, wn_ontology.phrase_type,
                   wn_ontology.term(phrasetype)))
    # Read the samples
    cursor.execute("select sampleid, sample from samples where synsetid=?",
                   (offset,))
    for sampleid, sample in cursor.fetchall():
        graph.add((synset_uri, wn_ontology.sample,
                   Literal(sample, lang=context.lang)))
    # Read the synset links
    cursor.execute("select synset2id, linkid from semlinks where synset1id=?",
                   (offset,))
    for synsetid2, linkid in cursor.fetchall():
        cursor.execute("select pos from synsets where synsetid=?",
                       (synsetid2,))
        row = cursor.fetchone()
        if row is None:
            sys.stderr.write("Synset %s referred to but not found "
                             % synsetid2)
        else:
            pos2, = row
            synset_uri2 = synset_name(context, synsetid2, pos2)
            graph.add((synset_uri,
                       wn_ontology.term(context.linktypes[linkid]),
                       synset_uri2))
            if extras:
                cursor.execute("select definition from synsets "
                               "where synsetid=?", (synsetid2,))
                def2, = cursor.fetchone()
                graph.add((synset_uri2, wn_ontology.gloss,
                           Literal(def2, lang=context.lang)))
    try:
        cursor.execute("select property, object from synsettriples "
                       "where synsetid=?", (offset,))
        for p, o in cursor.fetchall():
            graph.add((synset_uri, URIRef(p), from_n3(o)))
    except Exception as e:
        print(e)
    return graph

def entry(context, lemma, pos, graph=None):
    """
    Return an RDF graph for a lexical entry given a particular lemma string

    @param context: A WNRDF Context
    @param lemma: The lemma (case-sensitive!)
    @param pos: The part-of-speech (as a 1-letter code)
    @param graph: A graph to add the triples to (or None for a new graph)
    @return The graph containing the entry's triples or None if the entry
    was not found
    """
    # First map the lemma to the internal word id
    if graph is None:
        graph = make_graph()
    cursor = context.conn.cursor()
    if not lemma.islower():
        cased_lemma = lemma
        lemma = lemma.lower()
    else:
        cased_lemma = lemma
    cursor.execute("select * from words where lemma=?", (lemma,))
    row = cursor.fetchone()
    if row is None:
        return None
    word_id, _ = row
    # Add entry description
    entry_uri = entry_name(cased_lemma, pos)
    graph.add((entry_uri, RDF.type, lemon.LexicalEntry))
    graph.add((entry_uri, wn_ontology.part_of_speech,
               wn_ontology.term(context.postypes[pos])))
    canonical_form_uri = entry_name(cased_lemma, pos, "CanonicalForm")
    graph.add((entry_uri, lemon.canonicalForm, canonical_form_uri))
    graph.add((canonical_form_uri, lemon.writtenRep,
               Literal(cased_lemma, lang=context.lang)))
    graph.add((canonical_form_uri, RDF.type, lemon.Form))
    # Search for morphological forms
    cursor.execute("select pos, morphid from morphmaps where wordid=? "
                   "and pos=?", (word_id, pos))  # partially unindexed
    other_forms = 1
    this_pos_found = False
    for pos, morphid in cursor.fetchall():
        cursor.execute("select morph from morphs where morphid=?",
                       (morphid,))  # unindexed
        for morph, in cursor.fetchall():
            other_form_uri = entry_name(cased_lemma, pos,
                                        "Form-%d" % other_forms)
            graph.add((entry_uri, lemon.otherForm, other_form_uri))
            graph.add((other_form_uri, RDF.type, lemon.Form))
            graph.add((other_form_uri, lemon.writtenRep,
                       Literal(morph, lang=context.lang)))
            other_forms += 1
    # Find senses
    if cased_lemma.islower():
        # cursor.execute("select * from senses where wordid=? "
        #                "and casedwordid is NULL", (word_id,))
        cursor.execute("select * from senses where wordid=?", (word_id,))
    else:
        cursor.execute("select casedwordid from casedwords where cased=?",
                       (cased_lemma,))
        row = cursor.fetchone()
        if row is None:
            return None
        casedwordid, = row
        cursor.execute("select * from senses where casedwordid=?",
                       (casedwordid,))
    for (_, casedwordid, synsetid, senseid, sensenum, lexid, tagcount,
            old_sensekey, sensekey) in cursor.fetchall():
        # NB. This could also be achieved by querying "casedwordid is NULL"
        # however this is significantly slower, so we filter in Python,
        # checking we return cased forms only for cased lemmas
        if cased_lemma.islower() == bool(casedwordid):
            continue
        if sensekey[-1] == pos:
            this_pos_found = True
            _, sensekey2 = sensekey.split('#')
            sense_uri = entry_name(cased_lemma, pos, sensekey2)
            graph.add((entry_uri, lemon.sense, sense_uri))
            graph.add((sense_uri, RDF.type, lemon.LexicalSense))
            graph.add((sense_uri, lemon.reference,
                       synset_name(context, synsetid, pos)))
            graph.add((sense_uri, wn_ontology.sense_number,
                       Literal(sensenum)))
            graph.add((sense_uri, wn_ontology.tag_count, Literal(tagcount)))
            graph.add((sense_uri, wn_ontology.lex_id, Literal(lexid)))
            graph.add((sense_uri, wn_ontology.old_sense_key,
                       Literal(old_sensekey)))
            # Now adjective positions
            cursor.execute("select position from adjpositions where "
                           "synsetid=? and wordid=?", (synsetid, word_id))
            rows = cursor.fetchall()
            for position, in rows:
                graph.add((sense_uri, wn_ontology.adjposition,
                           URIRef(wn_ontology.term(quote_plus(
                               context.adjposition_names[position])))))
            # Add definition also to sense
            cursor.execute("select definition from synsets where synsetid=?",
                           (synsetid,))
            for definition, in cursor.fetchall():
                graph.add((sense_uri, wn_ontology.gloss,
                           Literal(definition, lang=context.lang)))
            # Sense links
            cursor.execute("select senseid2, linkid from lexlinks "
                           "where senseid1=?", (senseid,))
            for senseid2, linkid in cursor.fetchall():
                cursor.execute("select sensekey from senses where senseid=?",
                               (senseid2,))
                sensekey3, = cursor.fetchone()
                sense2_lemma, sense2_key = sensekey3.split('#')
                pos2 = sensekey3[-1]
                sense_uri2 = entry_name(sense2_lemma, pos2, sense2_key)
                graph.add((sense_uri,
                           wn_ontology.term(context.linktypes[linkid]),
                           sense_uri2))
            # Verb frames (maybe only if pos=='v'?)
            cursor.execute("select sentenceid from vframesentencemaps "
                           "where synsetid=? and wordid=?",
                           (synsetid, word_id))
            for sentenceid, in cursor.fetchall():
                graph.add((sense_uri, wn_ontology.verb_frame_sentence,
                           Literal(context.vframesentences[sentenceid],
                                   lang=context.lang)))
            # Sense tags
            cursor.execute("select position, senseid from sensetags "
                           "inner join taggedtexts on "
                           "sensetags.sensetagid=taggedtexts.sensetagid "
                           "where new_sensekey=?", (sensekey,))  # unindexed
            for position, senseid in cursor.fetchall():
                cursor.execute("select sensekey from senses where senseid=?",
                               (senseid,))
                for sensekey, in cursor.fetchall():
                    if position:
                        comp_uri = entry_name(
                            sensekey[0:sensekey.index('#')].replace("_", " "),
                            sensekey[-1],
                            'Component-' + str(position + 1))
                        graph.add((sense_uri, wn_ontology.sense_tag,
                                   comp_uri))
            # LexVo Link
            graph.add((sense_uri, OWL.sameAs,
                       translate_to_lexvo(old_sensekey, pos)))
    if not this_pos_found:
        return None
    if pos == "p":
        words = lemma.split(" ")
        node = BNode()
        comp1 = entry_name(lemma, pos, "Component-1")
        graph.add((entry_uri, lemon.decomposition, node))
        graph.add((node, RDF.first, comp1))
        graph.add((comp1, RDFS.label, Literal(words[0], lang=context.lang)))
        graph.add((comp1, RDF.type, lemon.Component))
        for idx in range(1, len(words)):
            node2 = BNode()
            graph.add((node, RDF.rest, node2))
            node = node2
            comp_uri = entry_name(lemma, pos, "Component-" + str(idx + 1))
            graph.add((node, RDF.first, comp_uri))
            graph.add((comp_uri, RDFS.label,
                       Literal(words[idx], lang=context.lang)))
            graph.add((comp_uri, RDF.type, lemon.Component))
        graph.add((node, RDF.rest, RDF.nil))
    try:
        cursor.execute("select fragment, property, object from entrytriples "
                       "where lemma=?", (quote_plus(lemma) + "-" + pos,))
        for f, p, o in cursor.fetchall():
            graph.add((entry_name(lemma, pos, f), from_n3(p), from_n3(o)))
    except:
        pass
    return graph

def test_util_from_n3_expectliteralwithtrailingbackslash(self):
    s = '"trailing\\\\"^^<http://www.w3.org/2001/XMLSchema#string>'
    res = util.from_n3(s)
    self.assertEqual(res, Literal('trailing\\', datatype=XSD['string']))
    self.assertEqual(res.n3(), s)

def test_util_from_n3_expecturiref(self):
    s = '<http://example.org/schema>'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, URIRef))

def test_util_from_n3_expectliteralandlangdtype(self):
    s = '"michel"@fr^^xsd:fr'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, Literal))
    self.assertEqual(res, Literal('michel', datatype=XSD['fr']))

def test_util_from_n3_expecturiref(self):
    s = '<http://example.org/schema>'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, URIRef))

def test_util_from_n3_expectliteralwithescapedquote(self):
    s = '"\\""'
    res = util.from_n3(s, default=None, backend=None)
    # from_n3 unescapes \" to " (see test_util_from_n3_escapes)
    self.assertEqual(res, Literal('"'))

def test_util_from_n3_expectliteralmultiline(self):
    s = '"""multi\nline\nstring"""@en'
    res = util.from_n3(s, default=None, backend=None)
    self.assertEqual(res, Literal('multi\nline\nstring', lang='en'))

def test_util_from_n3_expectliteralwithdatatypefromint(self):
    s = '42'
    res = util.from_n3(s)
    self.assertEqual(res, Literal(42))

def test_util_from_n3_expectliteralanddtype(self):
    s = '"true"^^xsd:boolean'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(res.eq(Literal('true', datatype='xsd:boolean')))

def test_util_from_n3_expectliteralandlangdtype(self):
    s = '"michel"@fr^^xsd:fr'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, Literal))
    self.assertEqual(res, Literal('michel', datatype=URIRef('xsd:fr')))

def test_util_from_n3_expectliteralandlang(self):
    s = '"michel"@fr'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, Literal))

def test_util_from_n3_expectquotedgraph(self):
    s = '{<http://example.com/schema>}'
    res = util.from_n3(s, default=None, backend="IOMemory")
    self.assertTrue(isinstance(res, QuotedGraph))

def main(
        resdir, sparql_endpoint, max_queries, clustering_variant,
        fusion_methods, timeout, max_results, max_target_candidates_per_gp,
        batch_predict, drop_bad_uris,
        **_  # gulp remaining kwargs
):
    from gp_query import calibrate_query_timeout
    from serialization import load_results
    from serialization import find_last_result
    from cluster import cluster_gps_to_reduce_queries
    from gp_learner import init_workers

    # init workers
    init_workers()

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
    timeout = timeout if timeout > 0 else calibrate_query_timeout(sparql)

    # load model
    last_res = find_last_result()
    if not last_res:
        logger.error('cannot find fully trained model in %s', resdir)
        sys.exit(1)
    result_patterns, coverage_counts, gtp_scores = load_results(last_res)
    gps = [gp for gp, _ in result_patterns]
    gps = cluster_gps_to_reduce_queries(
        gps, max_queries, gtp_scores, clustering_variant)

    processed = 0
    start = time.time()
    batch_size = config.BATCH_SIZE if batch_predict else 1

    # main loop
    for lines in chunker(sys.stdin, batch_size):
        batch = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if drop_bad_uris:
                # noinspection PyBroadException
                try:
                    source = from_n3(line)
                    utils.curify(source)
                except Exception:
                    logger.warning(
                        'Warning: Could not curify URI %s! Skip.', line)
                    continue
            if line[0] not in '<"':
                logger.error(
                    'expected inputs to start with < or ", but got: %s',
                    line)
                sys.exit(1)
            source = from_n3(line)
            batch.append(source)
        batch = list(OrderedDict.fromkeys(batch))
        if len(batch) == 0:
            pass
        elif len(batch) == 1:
            res = predict(
                sparql, timeout, gps, batch[0], fusion_methods,
                max_results, max_target_candidates_per_gp)
            print(json.dumps(res))
            logger.info(
                'Predicted %d target candidates for %s',
                res['orig_result_length'], res['source'])
        else:
            res = multi_predict(
                sparql, timeout, gps, batch, fusion_methods,
                max_results, max_target_candidates_per_gp)
            for r in res:
                print(json.dumps(r))
            logger.info('\n'.join([
                'Predicted %d target candidates for %s' % (
                    r['orig_result_length'], r['source'])
                for r in res
            ]))
        processed += len(batch)
        logger.info('Have processed %d URIs now. Took %s sec',
                    processed, time.time() - start)

def fromUnicode(self, str):
    value = from_n3(str)
    self.validate(value)
    return value

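# fromUnicode above looks like a zope.schema-style field hook: it parses an
# N3 string into an rdflib term, runs the field's own validate(), and
# returns the term. A hypothetical call on a field instance:
#
#   field.fromUnicode('<http://example.org/a>')
#   # -> URIRef('http://example.org/a')
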
def test_util_from_n3_expectbnode(self):
    s = "_:michel"
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, BNode))

def test_util_from_n3_expectgraph(self):
    s = '[http://example.com/schema]'
    res = util.from_n3(s, default=None, backend="IOMemory")
    self.assertTrue(isinstance(res, Graph))

def test_util_from_n3_expectliteralandlang(self):
    s = '"michel"@fr'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, Literal))

def from_n3(string):
    term = util.from_n3(string)
    if isinstance(term, Literal):
        if term.datatype is None and term.language is None:
            # promote plain literals to xsd:string
            term = Literal(term, datatype=XSD.string)
    return term

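# A small sketch of the normalization in the from_n3 wrapper above: plain
# literals come back from util.from_n3 with no datatype, and the wrapper
# promotes them to xsd:string so downstream comparisons are uniform.
# Values are illustrative.
from rdflib import Literal
from rdflib.namespace import XSD

def _demo_from_n3_wrapper():
    assert from_n3('"plain"') == Literal('plain', datatype=XSD.string)
    # typed and language-tagged literals pass through unchanged
    assert from_n3('"chat"@fr') == Literal('chat', lang='fr')
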
def test_util_from_n3_expectliteralanddtype(self):
    s = '"true"^^xsd:boolean'
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(res.eq(Literal('true', datatype=XSD['boolean'])))

def test_util_from_n3_expectliteralwithescapedquote(self):
    s = '"\\""'
    res = util.from_n3(s, default=None, backend=None)
    # from_n3 unescapes \" to " (see test_util_from_n3_escapes)
    self.assertEqual(res, Literal('"'))

def test_util_from_n3_expectliteralmultiline(self):
    s = '"""multi\nline\nstring"""@en'
    res = util.from_n3(s, default=None, backend=None)
    self.assertEqual(res, Literal('multi\nline\nstring', lang='en'))

def test_util_from_n3_sisnonenodefault(self):
    s = None
    default = None
    res = util.from_n3(s, default=default, backend=None)
    self.assertEqual(res, default)

def test_util_from_n3_expectliteralwithtrailingbackslash(self):
    s = '"trailing\\\\"^^<http://www.w3.org/2001/XMLSchema#string>'
    res = util.from_n3(s)
    self.assertEqual(res, Literal('trailing\\', datatype=XSD['string']))
    self.assertEqual(res.n3(), s)

def test_util_from_n3_sisnonewithdefault(self):
    s = None
    default = "TestofDefault"
    res = util.from_n3(s, default=default, backend=None)
    self.assertEqual(res, default)

def test_util_from_n3_expectgraph(self):
    s = '[<http://example.com/schema>]'
    res = util.from_n3(s, default=None, backend="IOMemory")
    self.assertTrue(isinstance(res, Graph))

def test_util_from_n3_expectbnode(self):
    s = "_:michel"
    res = util.from_n3(s, default=None, backend=None)
    self.assertTrue(isinstance(res, BNode))
