def test_truthy_dates_generation(self):
    """Run truthy triple generation over ``data/dates.tsv`` and check its outputs.

    Every line of the input that does not start with ``#`` is fed to a
    ``TripleGenerator`` writing to a temporary ``.ttl`` file. The test then
    compares that file with ``data/dates_truthy.ttl`` and checks that the
    warning log is non-empty. Temporary artifacts are removed whether or not
    the assertions pass.
    """
    # to reproduce standard file
    # kgtk generate_wikidata_triples -pf wikidata_properties.tsv -w yes --log-path date_warning.log -n 100 --use-id yes -gt yes < dates.tsv > dates_truthy.ttl
    dates_tsv_file = 'data/dates.tsv'
    wikidata_property_file = 'data/wikidata_properties.tsv'
    tmp_ttl_file = 'data/dates_truthy_tmp.ttl'
    log_file = "data/date_warning.log"
    try:
        # Context managers guarantee both handles are closed even if the
        # generator raises part-way through the input.
        with open(tmp_ttl_file, 'w') as o:
            generator = TripleGenerator(
                prop_file=wikidata_property_file,
                label_set='label',
                alias_set='aliases',
                description_set='descriptions',
                warning=True,
                n=100,
                truthy=True,
                use_id=True,
                dest_fp=o,
                log_path=log_file,
                prop_declaration=False,
                prefix_path="NONE",
            )
            with open(dates_tsv_file) as fp:
                for line_num, edge in enumerate(fp):
                    if not edge.startswith("#"):
                        generator.entry_point(line_num + 1, edge)
                generator.finalize()

        with open('data/dates_truthy.ttl') as f1, open(tmp_ttl_file) as f2:
            expected_lines = f1.readlines()
            generated_lines = f2.readlines()
        # TODO: placeholder until date validation is published; switch this
        # to assertEqual(expected_lines, generated_lines) afterwards.
        self.assertNotEqual(expected_lines, generated_lines)

        # The run is expected to have written something to the warning log.
        self.assertNotEqual(os.stat(log_file).st_size, 0)
    finally:
        # Clean up on failure too, so a broken run does not leave stale
        # artifacts behind for the next one.
        for leftover in (log_file, tmp_ttl_file):
            leftover_path = Path(leftover)
            if leftover_path.exists():
                leftover_path.unlink()
def test_truthy_qnode_triple_generation(self):
    """Run truthy triple generation over ``data/Q57160439.tsv`` and verify it.

    Every line of the input that does not start with ``#`` is fed to a
    ``TripleGenerator`` writing to a temporary ``.ttl`` file. The output must
    match ``data/Q57160439_truthy.ttl`` line for line and the warning log
    must be empty. Temporary artifacts are removed whether or not the
    assertions pass.
    """
    qnode_tsv_file = 'data/Q57160439.tsv'
    wikidata_property_file = 'data/wikidata_properties.tsv'
    tmp_ttl_file = 'data/Q57160439_truthy_tmp.ttl'
    log_file = "data/warning.log"
    try:
        # Context managers guarantee both handles are closed even if the
        # generator raises part-way through the input.
        with open(tmp_ttl_file, 'w') as o:
            generator = TripleGenerator(
                prop_file=wikidata_property_file,
                label_set='label',
                alias_set='aliases',
                description_set='descriptions',
                warning=True,
                n=100,
                truthy=True,
                use_id=True,
                dest_fp=o,
                log_path=log_file,
                prop_declaration=False,
                prefix_path="NONE",
            )
            with open(qnode_tsv_file) as fp:
                for line_num, edge in enumerate(fp):
                    if not edge.startswith("#"):
                        generator.entry_point(line_num + 1, edge)
                generator.finalize()

        with open('data/Q57160439_truthy.ttl') as f1, open(tmp_ttl_file) as f2:
            expected_lines = f1.readlines()
            generated_lines = f2.readlines()
        self.assertEqual(expected_lines, generated_lines)

        # A clean input must not produce any warnings.
        self.assertEqual(os.stat(log_file).st_size, 0)
    finally:
        # Clean up on failure too. NOTE(review): if the generator appends to
        # an existing log, a stale warning.log would break the empty-log
        # check on the next run - confirm against TripleGenerator.
        for leftover in (log_file, tmp_ttl_file):
            leftover_path = Path(leftover)
            if leftover_path.exists():
                leftover_path.unlink()
def test_triple_corrupted_edges(self):
    """Run triple generation over ``data/corrupted_kgtk.tsv`` and verify it.

    Lines starting with ``#`` and lines that are empty apart from newline
    characters are skipped; every other line is fed to a ``TripleGenerator``.
    Both the generated ``.ttl`` file and the warning log must match their
    reference files (``data/corrupted.ttl`` and
    ``data/corrupted_warning.log``) line for line. Temporary artifacts are
    removed whether or not the assertions pass.
    """
    corrupted_kgtk_file = 'data/corrupted_kgtk.tsv'
    wikidata_property_file = 'data/wikidata_properties.tsv'
    tmp_ttl_file = 'data/corrupted_tmp.ttl'
    tmp_log_file = "data/corrupted_warning_tmp.log"
    try:
        # Context managers guarantee both handles are closed even if the
        # generator raises part-way through the input.
        with open(tmp_ttl_file, 'w') as o:
            generator = TripleGenerator(
                prop_file=wikidata_property_file,
                label_set='label',
                alias_set='aliases',
                description_set='descriptions',
                warning=True,
                n=100,
                truthy=True,
                use_id=True,
                dest_fp=o,
                log_path=tmp_log_file,
                prop_declaration=False,
                prefix_path="NONE",
            )
            with open(corrupted_kgtk_file) as fp:
                for line_num, edge in enumerate(fp):
                    if edge.startswith("#") or len(edge.strip("\n")) == 0:
                        continue
                    generator.entry_point(line_num + 1, edge)
                generator.finalize()

        # Compare the generated triples with the reference output.
        with open('data/corrupted.ttl') as f1, open(tmp_ttl_file) as f2:
            expected_triples = f1.readlines()
            generated_triples = f2.readlines()
        self.assertEqual(expected_triples, generated_triples)

        # Compare the produced warnings with the reference log.
        with open("data/corrupted_warning.log") as f1, open(tmp_log_file) as f2:
            expected_warnings = f1.readlines()
            generated_warnings = f2.readlines()
        self.assertEqual(expected_warnings, generated_warnings)
    finally:
        # Clean up on failure too, so a broken run does not leave stale
        # artifacts behind for the next one.
        for leftover in (tmp_log_file, tmp_ttl_file):
            leftover_path = Path(leftover)
            if leftover_path.exists():
                leftover_path.unlink()
def run(
    labels: str,
    aliases: str,
    descriptions: str,
    prop_file: str,
    n: int,
    truthy: bool,
    warning: bool,
    use_gz: bool,
    use_id: bool,
    log_path: str,
    prop_declaration: bool,
    prefix_path: str,
):
    """Read edges from stdin and write generated triples to stdout.

    All arguments except ``use_gz`` are forwarded to ``TripleGenerator``
    (``labels``, ``aliases`` and ``descriptions`` as ``label_set``,
    ``alias_set`` and ``description_set``). ``use_gz`` selects whether stdin
    is read as gzip-compressed text.

    When ``prop_declaration`` is true the input is treated as the same
    stream seen twice in a row: lines up to the second occurrence of the
    first line go to ``read_prop_declaration``, and the lines after it go to
    ``entry_point`` with line numbers rebased to the start of the second
    copy. Otherwise every line that is neither a ``#`` comment nor empty is
    sent straight to ``entry_point``.
    """
    # import modules locally
    import gzip
    from kgtk.generator import TripleGenerator
    import sys

    generator = TripleGenerator(
        prop_file=prop_file,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        n=n,
        warning=warning,
        truthy=truthy,
        use_id=use_id,
        dest_fp=sys.stdout,
        log_path=log_path,
        prop_declaration=prop_declaration,
        prefix_path=prefix_path,
    )

    stream = gzip.open(sys.stdin.buffer, 'rt') if use_gz else sys.stdin

    if prop_declaration:
        first_line = None
        first_copy_length = 0      # number of lines in the first copy of the input
        second_copy_started = False
        for index, row in enumerate(stream):
            if index == 0:
                # The very first line is remembered as the marker for where
                # the second copy begins, and is handed to entry_point once.
                first_line = row
                generator.entry_point(index + 1, row)
                first_copy_length += 1
                continue
            if second_copy_started:
                # Rebase the line number onto the second copy of the input.
                rebased = index - first_copy_length
                generator.entry_point(rebased + 1, row)
                continue
            if row == first_line:
                # Seeing the first line again marks the start of the second copy.
                second_copy_started = True
                continue
            first_copy_length += 1
            generator.read_prop_declaration(index + 1, row)
    else:
        for index, row in enumerate(stream):
            is_comment = row.startswith("#")
            is_blank = not row.strip("\n")
            if not (is_comment or is_blank):
                generator.entry_point(index + 1, row)

    generator.finalize()