def transform_for_linked_data(edge):
    """
    Modify an edge (assertion) in place to contain values that are appropriate
    for a Linked Data API.

    Although this code isn't actually responsible for what an API returns
    (see the conceptnet-web repository for that), it helps to deal with what
    edge dictionaries should contain here.

    The relevant changes are:

    - Remove the 'features' list
    - Rename 'uri' to '@id'
    - Make 'start', 'end', and 'rel' into dictionaries with an '@id' and
      'label', removing the separate 'surfaceStart' and 'surfaceEnd'
      attributes
    - All dictionaries should have an '@id'. For the edge itself, it's the
      URI. Without this, we get RDF blank nodes, which are awful.
    - Set '@type' on objects representing edges and sources. (Nodes get their
      @type from the `ld_node` function.)

    The 'features', 'surfaceStart', and 'surfaceEnd' keys are optional: when
    one is absent it is simply skipped, and a missing surface label is passed
    to `ld_node` as None. The 'sources', 'uri', 'start', 'end', and 'rel' keys
    are required, and a KeyError is raised if any of them is missing.

    Returns the same `edge` dictionary that was passed in.
    """
    # 'features' is optional, so remove it without requiring it to exist.
    edge.pop('features', None)

    for source in edge['sources']:
        conj = conjunction_uri(*sorted(source.values()))
        source['@id'] = conj
        source['@type'] = 'Source'

    edge['@id'] = edge.pop('uri')
    edge['@type'] = 'Edge'

    start_uri = edge['start']
    end_uri = edge['end']
    rel_uri = edge['rel']

    # Read and remove the surface labels in one step. Using pop() with a
    # default keeps edges without surface text from failing with a KeyError
    # after the edge has already been partially rewritten.
    start_label = edge.pop('surfaceStart', None)
    end_label = edge.pop('surfaceEnd', None)

    edge['start'] = ld_node(start_uri, start_label)
    edge['end'] = ld_node(end_uri, end_label)
    edge['rel'] = ld_node(rel_uri, None)

    if 'other' in edge:
        # TODO: Find out when we use this, or remove it if we don't use it
        if edge['other'] == start_uri:
            edge['other'] = edge['start']
        elif edge['other'] == end_uri:
            edge['other'] = edge['end']
        else:
            # NOTE(review): this repeats the assignment made above and leaves
            # 'other' untouched; it looks redundant -- confirm the intent.
            edge['rel'] = ld_node(rel_uri, None)
    return edge
def _make_assertion(line_group): """ When a generator of tab-separated lines has been grouped by their assertion URI, this function takes all the lines with the same URI and makes a single assertion out of them. """ lines = [line.rstrip() for line in line_group] lines = [line for line in lines if line] if not lines: return None # FIXME: the steps leading up to this produce URIs that can differ based # on word senses. These don't get merged together, but they should. uri, rel, start, end, _ = lines[0].split('\t') if not (keep_concept(start) and keep_concept(end)): return None info_dicts = [json.loads(line.split('\t')[4]) for line in lines] unscaled_weight = sum(info['weight'] for info in info_dicts) licenses = {info['license'] for info in info_dicts} dataset = info_dicts[0]['dataset'] surface_text = None sources = [] seen_sources = set() for info in info_dicts: if surface_text is None and 'surfaceText' in info: surface_text = info['surfaceText'] for subsource in info['sources']: conjunction = conjunction_uri(*sorted(subsource.values())) if conjunction not in seen_sources: sources.append(subsource) seen_sources.add(conjunction) weight = weight_scale(unscaled_weight) if Licenses.cc_sharealike in licenses: license = Licenses.cc_sharealike else: license = Licenses.cc_attribution return make_edge( rel=rel, start=start, end=end, weight=weight, dataset=dataset, license=license, sources=sources, surfaceText=surface_text, )
def make_assertion(line_group):
    """
    Build one assertion from a group of tab-separated lines.

    Blank lines are skipped. The first remaining line provides the relation
    and the two endpoints; the endpoints are shortened with
    `uri_prefix(..., 4)` before being checked with `keep_concept`. The fifth
    tab-separated field of every line is parsed as JSON and merged: weights
    are summed and rescaled, sources are de-duplicated, and the license is
    share-alike if any line's license is.

    Returns None for a group with no non-blank lines or with a rejected
    endpoint; otherwise returns the result of `make_edge`.
    """
    rows = []
    for raw_line in line_group:
        row = raw_line.rstrip()
        if row:
            rows.append(row)
    if not rows:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    _uri, rel, start, end, _ = rows[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)
    if not keep_concept(start) or not keep_concept(end):
        return None

    info_dicts = [json.loads(row.split('\t')[4]) for row in rows]
    weight_total = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']

    surface_text = None
    merged_sources = []
    known_conjunctions = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction in known_conjunctions:
                continue
            known_conjunctions.add(conjunction)
            merged_sources.append(subsource)

    weight = weight_scale(weight_total)
    edge_license = (
        Licenses.cc_sharealike
        if Licenses.cc_sharealike in licenses
        else Licenses.cc_attribution
    )
    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=dataset,
        license=edge_license,
        sources=merged_sources,
        surfaceText=surface_text,
    )
def make_assertion(line_group):
    """
    Merge the tab-separated lines of one group into a single assertion.

    Blank lines are dropped. The relation and endpoints come from the first
    remaining line, which must have exactly five tab-separated fields. The
    fifth field of each line is a JSON object; the objects' weights are
    summed and passed through `weight_scale`, their sources are collected
    without duplicates, and the first available surface text is used.

    Returns None if nothing remains after dropping blank lines or if
    `keep_concept` rejects the start or end; otherwise returns the result of
    `make_edge`.
    """
    nonblank = [text for text in (line.rstrip() for line in line_group) if text]
    if not nonblank:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    _uri, rel, start, end, _ = nonblank[0].split('\t')
    if not keep_concept(start) or not keep_concept(end):
        return None

    records = [json.loads(text.split('\t')[4]) for text in nonblank]
    summed_weight = sum(record['weight'] for record in records)
    licenses = {record['license'] for record in records}
    dataset = records[0]['dataset']

    surface_text = None
    collected = []
    already_seen = set()
    for record in records:
        if surface_text is None and 'surfaceText' in record:
            surface_text = record['surfaceText']
        for subsource in record['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction in already_seen:
                continue
            already_seen.add(conjunction)
            collected.append(subsource)

    weight = weight_scale(summed_weight)

    # Share-alike takes precedence over plain attribution.
    if Licenses.cc_sharealike in licenses:
        chosen_license = Licenses.cc_sharealike
    else:
        chosen_license = Licenses.cc_attribution

    edge_fields = dict(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=dataset,
        license=chosen_license,
        sources=collected,
        surfaceText=surface_text,
    )
    return make_edge(**edge_fields)
def make_assertion(line_group):
    """
    Turn a group of tab-separated lines into one assertion.

    Lines that are empty after stripping trailing whitespace are ignored.
    The first remaining line supplies the relation, start, and end; start
    and end are shortened with `uri_prefix(..., 4)` and must both pass
    `keep_concept`. The JSON object in the fifth field of each line
    contributes its weight, license, sources, and optional surface text; the
    dataset is taken from the first line's object.

    Returns None when there are no usable lines or an endpoint is rejected;
    otherwise returns the result of `make_edge`.
    """
    entries = [entry for entry in (line.rstrip() for line in line_group) if entry]
    if not entries:
        return None

    first_fields = entries[0].split('\t')
    _uri, rel, start, end, _ = first_fields

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)
    if not keep_concept(start) or not keep_concept(end):
        return None

    infos = [json.loads(entry.split('\t')[4]) for entry in entries]
    raw_weight = sum(info['weight'] for info in infos)
    licenses = {info['license'] for info in infos}
    dataset = infos[0]['dataset']

    surface_text = None
    distinct_sources = []
    seen_keys = set()
    for info in infos:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            source_key = conjunction_uri(*sorted(subsource.values()))
            if source_key in seen_keys:
                continue
            seen_keys.add(source_key)
            distinct_sources.append(subsource)

    weight = weight_scale(raw_weight)
    combined_license = (
        Licenses.cc_sharealike
        if Licenses.cc_sharealike in licenses
        else Licenses.cc_attribution
    )
    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=dataset,
        license=combined_license,
        sources=distinct_sources,
        surfaceText=surface_text,
    )
def make_edge(rel, start, end, dataset, license, sources,
              context='/ctx/all', surfaceText=None, weight=1.0):
    """
    Build the dictionary form of an edge (a justified assertion) from its
    component values.

    `sources` may be a list, which is combined with `conjunction_uri`, or a
    single value, which is used as the source URI as-is and split with
    `parse_possible_compound_uri('or', ...)`. Either way, each resulting
    source is split again on 'and' and the distinct pieces are stored,
    sorted, under 'sources'.

    >>> e = make_edge(rel='/r/HasProperty',
    ...               start='/c/en/fire',
    ...               end='/c/en/hot',
    ...               dataset='/d/conceptnet/4/en',
    ...               license=Licenses.cc_attribution,
    ...               sources='/and/[/.../]',
    ...               surfaceText='[[Fire]] is [[hot]]',
    ...               weight=1.0)
    >>> pprint(e)
    {'context': '/ctx/all',
     'dataset': '/d/conceptnet/4/en',
     'end': '/c/en/hot',
     'features': ['/c/en/fire /r/HasProperty -',
                  '/c/en/fire - /c/en/hot',
                  '- /r/HasProperty /c/en/hot'],
     'id': '/e/ee13e234ee835eabfcf7c906b358cc2229366b42',
     'license': '/l/CC/By',
     'rel': '/r/HasProperty',
     'source_uri': '/and/[/.../]',
     'sources': ['/...'],
     'start': '/c/en/fire',
     'surfaceText': '[[Fire]] is [[hot]]',
     'uri': '/a/[/r/HasProperty/,/c/en/fire/,/c/en/hot/]',
     'weight': 1.0}
    """
    # Each feature string blanks out one slot of the (start, rel, end) triple.
    features = [
        "%s %s -" % (start, rel),
        "%s - %s" % (start, end),
        "- %s %s" % (rel, end),
    ]
    uri = assertion_uri(rel, start, end)

    if isinstance(sources, list):
        source_tree, source_list = conjunction_uri(*sources), sources
    else:
        source_tree = sources
        source_list = parse_possible_compound_uri('or', sources)

    # Break every source into its 'and' components, then keep the distinct
    # components in sorted order.
    components = []
    for source in source_list:
        components.extend(parse_possible_compound_uri('and', source))
    flat_sources = sorted(set(components))

    # Generate a unique ID for the edge. This is the only opaque ID
    # that appears in ConceptNet objects. You can use it as a
    # pseudo-random sort order over edges.
    unique_text = ' '.join((uri, context, source_tree))
    edge_id = '/e/' + sha1(unique_text.encode('utf-8')).hexdigest()

    return {
        'id': edge_id,
        'uri': uri,
        'rel': rel,
        'start': start,
        'end': end,
        'context': context,
        'dataset': dataset,
        'sources': flat_sources,
        'source_uri': source_tree,
        'features': features,
        'license': license,
        'weight': weight,
        'surfaceText': surfaceText,
    }