def main(argv): #TODO: This an RSS, there should be programatical ways to have an order to limit # duplication args = args_process(argv) soup = BeautifulSoup(open(args.inputfile),'lxml') currentbase = Graph() currentbase.load(args.current, format='turtle') g = Graph() c = 0 for item in soup.find_all('item'): c += 1 subject = URIRef("http://www.edsa-project.eu/jobs/StackOverflow/"+str(c)) #URL url = Literal(item.guid.string) if args.verbose: print "Processing post " + url if is_duplicate(currentbase,url): if args.verbose: print url +" identified as duplicate, skipping..." continue g.add((subject,ns.schema.url,url)) #Title g.add((subject,ns.schema.jobTitle,Literal(item.title.string))) #Description g.add((subject,ns.schema.description,Literal(item.description.string))) #PubDate date = dtp.parse(item.pubdate.string) g.add((subject,ns.schema.datePosted, Literal(date.isoformat(),datatype=XSD.Date))) for org in item.find_all('a10:name'): #hiringOrganization #TODO: Service to OpenCorporates to entity matching # Low priority, maybe can be done with SILK later g.add((subject,ns.schema.hiringOrganization,Literal(org.string))) for cat in item.find_all('category'): #skills skill = URIRef("http://www.edsa-project.eu/skill/"+cat.string) g.add((subject,ns.edsa.requiresSkill,skill)) g.add((skill,ns.edsa.lexicalValue,Literal(cat.string))) if item.location is not None: #location g.add((subject,ns.schema.jobLocation,Literal(item.location.string))) try: tup = gn.find_location(item.location.string) g.add((subject,ns.edsa.Location,URIRef(tup[0]))) g += tup[1] except gn.NotFoundException as e: #TODO: Redirect to an error file print("%s in subject %s" % (e,subject)) print("problematic location %s" % item.location.string) currentbase += g g.serialize(destination=args.outputfile, format='turtle') currentbase.serialize(destination=args.current, format='turtle')
def test_find_location_simple(self):
    """A plain "City, CC" string resolves to a URIRef plus a describing graph."""
    tup = gn.find_location("Southampton, UK")

    # The lookup must hand back (IRI, graph).
    self.assertTrue(isinstance(tup[0], rdflib.URIRef))
    self.assertTrue(isinstance(tup[1], rdflib.Graph))

    # The returned graph should at least carry the place's gn:name.
    name_is_present = prepareQuery(
        """
        ASK { ?iri gn:name ?name }
        """,
        initNs={"gn": ns.geonames},
    )
    result = tup[1].query(
        name_is_present,
        initBindings={"name": rdflib.Literal("Southampton")},
    )
    self.assertTrue(result)
def test_find_location_utf8(self):
    """A non-ASCII location string resolves to an IRI plus a describing graph."""
    location = "Αθήνα, GR"
    tup = gn.find_location(location)
    self.assertTrue(isinstance(tup[0], rdflib.URIRef))
    self.assertTrue(isinstance(tup[1], rdflib.Graph))
    # At least we brought the correct name and country
    askplace = prepareQuery(
        """
        ASK { ?iri gn:name ?name }
        """,
        initNs={"gn": ns.geonames},
    )
    self.assertTrue(tup[1].query(askplace,
                                 initBindings={"name": rdflib.Literal("Athens")}))

def test_find_location_unknown(self):
    """An unresolvable location raises NotFoundException."""
    location = "Unknownlandidfgdfg"
    # Fix: assertRaises is a TestCase method, not a global name — the bare
    # call would raise NameError instead of exercising the lookup.
    with self.assertRaises(gn.NotFoundException):
        gn.find_location(location)
def test_find_location_unknown(self):
    """An unresolvable location raises NotFoundException."""
    location = "Unknownlandidfgdfg"
    # Fix: assertRaises is a TestCase method, not a global name — the bare
    # call would raise NameError instead of exercising the lookup.
    with self.assertRaises(gn.NotFoundException):
        gn.find_location(location)
def main(argv): args = args_process(argv) soup = BeautifulSoup(open(args.inputfile),'lxml') currentbase = Graph() currentbase.load(args.current, format='turtle') g = Graph() for item in soup.find_all('result'): url = shrink_url(item.url.string.strip()) subject = URIRef(url) #TODO: Check that with URL as subject, deduplication is not # needed if args.verbose: print "Processing post " + url if is_duplicate(currentbase,subject) or is_duplicate(g,subject): if args.verbose: print url +" identified as duplicate, skipping..." continue #URL g.add((subject,ns.schema.url,URIRef(url))) #Source g.add((subject,ns.schema.source,Literal("Indeed"))) #Title g.add((subject,ns.schema.jobTitle,Literal(item.jobtitle.string.strip()))) #Description g.add((subject,ns.schema.description,Literal(get_description(url)))) #PubDate date = dtp.parse(item.date.string) g.add((subject,ns.schema.datePosted, Literal(date.isoformat(),datatype=XSD.Date))) #hiringOrganization try: g.add((subject,ns.schema.hiringOrganization, Literal(item.company.string.strip()))) except AttributeError: if args.verbose: print ("%s has no company in the source data" % subject) #location location = item.formattedlocation.string.strip() g.add((subject,ns.schema.jobLocation,Literal(location))) try: #TODO changing point when separating geonames subset from current base if gn.is_inside(location,currentbase): if args.verbose: print ("%s already linked to geonames, reusing..." % location) lociri = gn.get_iri(location,currentbase) g.add((subject,ns.edsa.Location,lociri)) elif gn.is_inside(location,g): if args.verbose: print ("%s already linked to geonames, reusing..." 
% location) lociri = gn.get_iri(location,g) g.add((subject,ns.edsa.Location,lociri)) else: tup = gn.find_location(item.formattedlocation.string.strip()) g.add((subject,ns.edsa.Location,URIRef(tup[0]))) g += tup[1] except gn.NotFoundException as e: #TODO: Redirect to an error file print("%s in subject %s" % (e,subject)) print("problematic location %s" % item.formattedlocation.string) currentbase += g g.serialize(destination=args.outputfile, format='turtle') currentbase.serialize(destination=args.current, format='turtle')