def test_get_iri(self):
    """Check that gn.get_iri returns the expected geonames IRI.

    Loads TESTFILE as a turtle graph, looks up the location "Αθήνα" in it
    and verifies both the type and the value of the returned IRI.
    """
    testgraph = rdflib.Graph()
    testgraph.parse(TESTFILE, format="turtle")
    location = "Αθήνα"
    iri = gn.get_iri(location, testgraph)
    # assertIsInstance reports the actual type on failure, whereas
    # assertTrue(isinstance(...)) only says "False is not true".
    self.assertIsInstance(iri, rdflib.URIRef)
    expected = rdflib.URIRef("http://sws.geonames.org/264371/")
    self.assertEqual(iri, expected)
def _resolve_location_iri(location, currentbase, g, verbose):
    """Return the geonames IRI to attach to a job located at *location*.

    The current base is consulted first, then the graph being built in this
    run; if either already holds the location (per gn.is_inside) its IRI is
    reused. Otherwise gn.find_location is called: the first item of its
    result is used as the IRI and the second item is merged into *g*.

    Any gn.NotFoundException raised by the gn calls is left to the caller.
    """
    # TODO changing point when separating geonames subset from current base
    for known in (currentbase, g):
        if gn.is_inside(location, known):
            if verbose:
                print("%s already linked to geonames, reusing..." % location)
            return gn.get_iri(location, known)
    tup = gn.find_location(location)
    iri = URIRef(tup[0])
    # In-place merge: the caller's graph object receives the extra triples.
    g += tup[1]
    return iri


def main(argv):
    """Convert the 'result' entries of the input file into RDF job postings.

    Command-line arguments are parsed by args_process; the attributes read
    here are inputfile, current, outputfile and verbose. Posts whose subject
    is already present in the current base or in this run's graph are
    skipped. The new triples are written to args.outputfile and the current
    base, merged with the new triples, is written back to args.current
    (both as turtle).
    """
    args = args_process(argv)
    # Read the input inside a context manager so the file handle is closed
    # as soon as the document has been parsed.
    with open(args.inputfile) as source:
        soup = BeautifulSoup(source, 'lxml')
    currentbase = Graph()
    # Graph.load() is a deprecated alias that forwards to parse().
    currentbase.parse(args.current, format='turtle')
    g = Graph()
    for item in soup.find_all('result'):
        url = shrink_url(item.url.string.strip())
        subject = URIRef(url)
        # TODO: Check that with URL as subject, deduplication is not
        # needed
        if args.verbose:
            print("Processing post " + url)
        if is_duplicate(currentbase, subject) or is_duplicate(g, subject):
            if args.verbose:
                print(url + " identified as duplicate, skipping...")
            continue
        # URL
        g.add((subject, ns.schema.url, URIRef(url)))
        # Source
        g.add((subject, ns.schema.source, Literal("Indeed")))
        # Title
        g.add((subject, ns.schema.jobTitle,
               Literal(item.jobtitle.string.strip())))
        # Description
        g.add((subject, ns.schema.description,
               Literal(get_description(url))))
        # PubDate
        date = dtp.parse(item.date.string)
        # NOTE(review): XSD.Date (capital D) is kept as-is to avoid changing
        # the stored data, but it does not look like a standard XSD datatype
        # name (xsd:date / xsd:dateTime) - confirm and migrate if needed.
        g.add((subject, ns.schema.datePosted,
               Literal(date.isoformat(), datatype=XSD.Date)))
        # hiringOrganization
        try:
            g.add((subject, ns.schema.hiringOrganization,
                   Literal(item.company.string.strip())))
        except AttributeError:
            # Raised when the company element or its string is missing.
            if args.verbose:
                print("%s has no company in the source data" % subject)
        # location
        location = item.formattedlocation.string.strip()
        g.add((subject, ns.schema.jobLocation, Literal(location)))
        try:
            lociri = _resolve_location_iri(location, currentbase, g,
                                           args.verbose)
            g.add((subject, ns.edsa.Location, lociri))
        except gn.NotFoundException as e:
            # TODO: Redirect to an error file
            print("%s in subject %s" % (e, subject))
            print("problematic location %s" % item.formattedlocation.string)
    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')