def usf_node(cueOrResponse: str, pos: str, other: dict = None) -> KgNode:
    return KgNode.legacy(
        datasource=USF_DATASOURCE_ID,
        id=f'{USF_NAMESPACE}:{quote(f"{cueOrResponse}-{pos}")}',
        label=cueOrResponse,
        pos=pos,
        other=other if other else None)
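# Hypothetical usage sketch (not part of the source): the resulting id combines the
# USF_NAMESPACE constant with the quoted "<word>-<pos>" pair; the word, pos, and
# "fsg" key below are illustrative only.
#   usf_node("dog", "n")                      -> node labeled "dog" with pos "n", no "other" payload
#   usf_node("dog", "n", other={"fsg": 0.3})  -> same node with an illustrative "other" payload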
def __convert_normalized_arg_to_node(self, normalized_arg):
    # Create nodes in a custom namespace.
    # Will do sameAs WordNet or Wikipedia nodes in the transform instead of reusing their id's here.
    # Don't include metadata as "other", since the data set contains multiple normalized args with
    # different metadata, which violates our duplicate node id checks.
    metadata = normalized_arg.get("metadata", {})
    if "synset" in metadata:
        synset = metadata["synset"]
        assert synset.startswith("wn.")
        word_net_id = WordNetId.parse(synset[len("wn."):])
        pos = word_net_id.pos
    else:
        pos = None
    label = normalized_arg["normalized"]
    id_ = f"{self.__DATASOURCE}:{quote(label)}"
    if pos is not None:
        id_ += ":" + pos
    return KgNode.legacy(
        datasource=self.__DATASOURCE,
        id=id_,
        label=label,
        pos=pos,
        # other=normalized_arg.get("metadata")
    )
def __webchild_node(self, *, ssid: str, word: str) -> KgNode:
    return KgNode.legacy(
        datasource=self.__DATASOURCE_ID,
        id=self.__webchild_nid(ssid),
        label=word,
        # All subjects/objects are nouns in WebChild part-whole
        pos="n",
    )
def sentic_node(*, id: str, label: str = None, sentic_type: str) -> KgNode:
    if label is None:
        label = id
    return KgNode.legacy(
        datasource=SENTIC_DATASOURCE_ID,
        id=sentic_id(id, sentic_type),
        label=label,
        # other={SENTIC_TYPE_KEY: sentic_type},
    )
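# Hypothetical usage sketch (not part of the source): when no label is passed, the id is
# reused as the label; the sentic_type value here is illustrative only.
#   sentic_node(id="joy", sentic_type="primitive")  -> node with id sentic_id("joy", "primitive") and label "joy"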
def test_map_unqualified_node(concept_net_mapper):
    edges = tuple(
        concept_net_mapper.map(
            KgNode.legacy(id="a", datasource="test", label="a")))
    assert len(edges) == 1
    edge = edges[0]
    assert edge.subject == "a"
    assert edge.object == "/c/en/a"
    assert edge.predicate == mowgli_predicates.SAME_AS
    assert edge.source_ids == ("test", )
def __init__(self, *, label: str, sub_class_of: Optional[Tuple[URIRef, ...]], uri: URIRef):
    self.label = label
    self.sub_class_of = sub_class_of
    self.uri = uri
    self.node = KgNode.legacy(
        datasource=FoodOnTransformer._DATASOURCE,
        id="foodon:" + str(uri)[len(self._URI_PREFIX):],
        label=label
    )
    self.node_yielded = False
def _generator():
    nid_counter = count(1)
    while True:
        nodes = tuple(
            KgNode.legacy(datasource='test_datasource',
                          id=f'test_node_{next(nid_counter)}',
                          label='test node')
            for _ in range(2))
        yield from nodes
        yield KgEdge.legacy(datasource='test_datasource',
                            object=nodes[1].id,
                            predicate='test_predicate',
                            subject=nodes[0].id)
def test_map_node_with_pos(concept_net_mapper):
    edges = tuple(
        concept_net_mapper.map(
            KgNode.legacy(id="nid30", datasource="test", label="30", pos="a")))
    assert len(edges) == 1
    edge = edges[0]
    assert edge.subject == "nid30"
    assert edge.object == "/c/en/30/a/wn"
    assert edge.predicate == mowgli_predicates.SAME_AS
    assert edge.source_ids == ("test", )
def test_swow_node():
    node = swow_node(word="test response",
                     response_counts=Counter(R1=3, R2=2, R3=0))
    expected_node = KgNode.legacy(
        datasource=SWOW_DATASOURCE_ID,
        id=f'{SWOW_NAMESPACE}:{quote("test response")}',
        label="test response",
        other={"response_counts": {
            "R1": 3,
            "R2": 2,
            "R3": 0
        }},
    )
    assert node == expected_node
def swow_node(*, word: str, response_counts: Counter) -> KgNode:
    """
    Create a cskg node from a SWOW cue or response.

    :param word: a SWOW cue or response
    :param response_counts: counts of responses to this word
    """
    assert all(k in SwowResponseType.__members__
               for k in response_counts.keys())
    return KgNode.legacy(
        datasource=SWOW_DATASOURCE_ID,
        id=swow_node_id(word),
        label=word,
        other={
            "response_counts": {
                rt: response_counts[rt]
                for rt in SwowResponseType.__members__.keys()
            }
        },
    )
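# Hypothetical usage sketch (not part of the source): the keys of response_counts must be
# SwowResponseType member names (R1/R2/R3, as exercised by test_swow_node above); members
# missing from the Counter are recorded as 0, since Counter lookups default to zero.
#   swow_node(word="tree", response_counts=Counter(R1=5, R2=1))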
def __transform(self, *, nodes_csv_file: TextIO) -> Generator[KgNode, None, None]:
    csv_reader = csv.DictReader(nodes_csv_file,
                                delimiter="\t",
                                quoting=csv.QUOTE_NONE)
    for csv_row_i, csv_row in enumerate(csv_reader):
        try:
            yield KgNode.legacy(
                aliases=self._get_optional_column(csv_row, "aliases"),
                datasource=self._get_required_column(csv_row, "datasource"),
                id=self._get_required_column(csv_row, "id"),
                label=self._get_optional_column(csv_row, "label"),
                other=self._get_optional_column(csv_row, "other"),
                pos=self._get_optional_column(csv_row, "pos"),
            )
        except ValueError as e:
            self._logger.warning("CSKG nodes CSV row %d %s: %s", csv_row_i, e, csv_row)
def __parse_arg(self, *, arg: str, provenance: str, type_: str) -> Tuple[KgNode, Optional[WordNetId]]:
    # Put the type in the id in case words are reused
    if type_ != "Thing":
        word_net_type = type_.rsplit('_', 1)
        assert len(word_net_type[1]) >= 2
        type_word_net_id = WordNetId(word=word_net_type[0],
                                     pos=word_net_type[1][0],
                                     offset=int(word_net_type[1][1:]))
    else:
        type_word_net_id = None
    node = KgNode.legacy(
        datasource=self.__DATASOURCE,
        id=f"{self.__DATASOURCE}:{type_}:{quote(arg)}",
        label=arg,
        # Assume the part of speech of the arg is the same as the part of speech of the type
        pos=type_word_net_id.pos if type_word_net_id is not None else None,
        other={"provenance": provenance, "type": type_}
    )
    return node, type_word_net_id
def test_write_node(pipeline_storage):
    test_node = KgNode.legacy(
        datasource='test_datasource',
        id='test_nid',
        label='Test KgNode',
        aliases=('t-node', 'KgNode Test'),
        # other={'datasets': ['test_dataset', 'other_test_dataset']},
        pos='N')
    with CskgCsvLoader().open(pipeline_storage) as loader:
        loader.load_kg_node(test_node)
        # 20200310 MG: duplicate removal has been moved to the PipelineWrapper
        # loader.load_kg_node(test_node)
    expected_node_text = (
        _EXPECTED_NODE_HEADER + '\n' +
        'test_nid\tTest KgNode\tt-node KgNode Test\tN\ttest_datasource\t\n')
    with open(pipeline_storage.loaded_data_dir_path / "edges.csv") as f:
        assert f.read() == _EXPECTED_EDGE_HEADER + '\n'
    with open(pipeline_storage.loaded_data_dir_path / "nodes.csv") as f:
        assert f.read() == expected_node_text
def test_eat_transform():
    test_file_dir = pathlib.Path(__file__).parent.absolute()
    test_file_path = os.path.join(test_file_dir, 'sample_eat100.xml')
    transformer = EatTransformer()
    nodes, edges = set(), set()
    for result in transformer.transform(xml_file_path=test_file_path):
        if isinstance(result, KgNode):
            nodes.add(result)
        elif isinstance(result, KgEdge):
            edges.add(result)
    expected_stimulus_nodes = set(
        KgNode.legacy(datasource="eat", id="eat:" + stim_word, label=stim_word)
        for stim_word in ['SPECIAL', 'SET'])
    expected_response_nodes = set(
        KgNode.legacy(datasource="eat", id="eat:" + response_word, label=response_word)
        for response_word in [
            'TRAIN', 'PARTICULAR', 'EXTRA', 'ORDINARY', 'CASE', 'PERSON',
            'BEER', 'CAR', 'CONSTABLE', 'TELEVISION', 'UP', 'OUT', 'TO',
            'DOWN', 'GAME', 'GROUP', 'T.V.', 'TEA'
        ])
    expected_nodes = expected_stimulus_nodes | expected_response_nodes
    expected_edges = set(
        KgEdge.legacy(datasource="eat",
                      object="eat:" + stim_node,
                      predicate="cn:RelatedTo",
                      subject="eat:" + response_node,
                      weight=response_weight)
        for (stim_node, response_node, response_weight) in [
            ('SPECIAL', 'TRAIN', 0.07), ('SPECIAL', 'PARTICULAR', 0.05),
            ('SPECIAL', 'EXTRA', 0.04), ('SPECIAL', 'ORDINARY', 0.04),
            ('SPECIAL', 'CASE', 0.03), ('SPECIAL', 'PERSON', 0.03),
            ('SPECIAL', 'BEER', 0.02), ('SPECIAL', 'CAR', 0.02),
            ('SPECIAL', 'CONSTABLE', 0.02), ('SET', 'TELEVISION', 0.06),
            ('SET', 'UP', 0.05), ('SET', 'OUT', 0.04), ('SET', 'TO', 0.04),
            ('SET', 'DOWN', 0.03), ('SET', 'GAME', 0.03), ('SET', 'GROUP', 0.03),
            ('SET', 'T.V.', 0.03), ('SET', 'TEA', 0.03)
        ])
    assert nodes == expected_nodes
    assert edges == expected_edges
def node():
    return KgNode.legacy(id="testid",
                         label="test label",
                         pos="n",
                         datasource="test",
                         other={"test": 1})
def node() -> KgNode:
    return KgNode.legacy(id="testid",
                         label="test label",
                         pos="n",
                         datasource="test")
class MockPipeline(_Pipeline):
    def __init__(self, node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...]):
        _Pipeline.__init__(self,
                           extractor=NopExtractor(),
                           id=DATASOURCE,
                           transformer=MockTransformer(node_edge_sequence))


def run(node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...],
        pipeline_storage: PipelineStorage):
    return PipelineWrapper(MockPipeline(node_edge_sequence), pipeline_storage).run()


SUBJECT_NODE = KgNode.legacy(id="testid",
                             label="test label",
                             pos="n",
                             datasource=DATASOURCE)
EXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                             label="test label",
                                             pos="n",
                                             datasource=DATASOURCE)
INEXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                               label="test label variation",
                                               pos="n",
                                               datasource=DATASOURCE)
OBJECT_NODE = KgNode.legacy(id="testobject",
                            label="test object",
                            pos="n",
                            datasource=DATASOURCE)
EDGE = KgEdge.legacy(subject=SUBJECT_NODE.id,
                     object=OBJECT_NODE.id,