Exemplo n.º 1
0
    def test_get_iri(self):
        """Check that ``gn.get_iri`` resolves a location label to its IRI.

        The graph is parsed from the turtle fixture ``TESTFILE``; the
        lookup must return an ``rdflib.URIRef`` equal to the expected
        GeoNames IRI.
        """
        testgraph = rdflib.Graph()
        testgraph.parse(TESTFILE, format="turtle")

        # Non-ASCII label on purpose: the lookup has to cope with it.
        # NOTE(review): presumably TESTFILE links this label to the IRI
        # below -- the fixture is not visible from here.
        location = "Αθήνα"
        expected = rdflib.URIRef("http://sws.geonames.org/264371/")

        iri = gn.get_iri(location, testgraph)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(iri, rdflib.URIRef)
        self.assertEqual(iri, expected)
Exemplo n.º 2
0
def main(argv):
    """Convert scraped job results into RDF and merge them into the base.

    Reads the file named by ``args.inputfile``, and for every ``result``
    element that is not already a subject in either the current base
    graph or the graph being built, adds triples for URL, source, title,
    description, posting date, hiring organization and location. The
    location is linked to a GeoNames IRI, reusing a link already present
    in one of the two graphs when ``gn.is_inside`` reports one.

    On completion the new triples are written to ``args.outputfile`` and
    the merged graph is written back over ``args.current`` (both turtle).

    Args:
        argv: Command-line arguments, handed to ``args_process``.
    """
    args = args_process(argv)

    # Close the input file deterministically instead of leaking the
    # handle until garbage collection.
    with open(args.inputfile) as infile:
        soup = BeautifulSoup(infile, 'lxml')

    currentbase = Graph()
    # Graph.parse replaces the deprecated Graph.load alias.
    currentbase.parse(args.current, format='turtle')

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    g = Graph()
    for item in soup.find_all('result'):
        url = shrink_url(item.url.string.strip())
        subject = URIRef(url)
        # TODO: Check that with URL as subject, deduplication is not
        # needed
        if args.verbose:
            print("Processing post " + url)

        if is_duplicate(currentbase, subject) or is_duplicate(g, subject):
            if args.verbose:
                print(url + " identified as duplicate, skipping...")
            continue

        # URL
        g.add((subject, ns.schema.url, URIRef(url)))
        # Source
        g.add((subject, ns.schema.source, Literal("Indeed")))
        # Title
        g.add((subject, ns.schema.jobTitle,
               Literal(item.jobtitle.string.strip())))
        # Description
        g.add((subject, ns.schema.description,
               Literal(get_description(url))))

        # PubDate
        # NOTE(review): XSD.Date is kept exactly as before so stored data
        # does not change; XML Schema type names are lowercase-initial
        # (date / dateTime) -- confirm which one was intended.
        date = dtp.parse(item.date.string)
        g.add((subject, ns.schema.datePosted,
               Literal(date.isoformat(), datatype=XSD.Date)))

        # hiringOrganization: optional in the source data, so a missing
        # element (AttributeError on the lookup) is reported, not fatal.
        try:
            company = item.company.string.strip()
        except AttributeError:
            if args.verbose:
                print("%s has no company in the source data" % subject)
        else:
            g.add((subject, ns.schema.hiringOrganization, Literal(company)))

        # location
        location = item.formattedlocation.string.strip()
        g.add((subject, ns.schema.jobLocation, Literal(location)))
        try:
            # TODO changing point when separating geonames subset from
            # current base
            # Prefer an existing link: the current base first, then the
            # graph under construction.
            for known in (currentbase, g):
                if gn.is_inside(location, known):
                    if args.verbose:
                        print("%s already linked to geonames, reusing..."
                              % location)
                    lociri = gn.get_iri(location, known)
                    g.add((subject, ns.edsa.Location, lociri))
                    break
            else:
                # No existing link: look the location up; the result is
                # used as (IRI, triples to merge into g).
                tup = gn.find_location(location)
                g.add((subject, ns.edsa.Location, URIRef(tup[0])))
                g += tup[1]
        except gn.NotFoundException as e:
            # TODO: Redirect to an error file
            print("%s in subject %s" % (e, subject))
            print("problematic location %s" % item.formattedlocation.string)

    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')