def process_organization(self):
    filename = "{0}/data/manual/{1}-organization.csv".format(
        self.global_config["home"], self.local_config["tag"])
    with open(filename) as f:
        csvreader = UnicodeReader(f)
        headers = csvreader.next()
        for row in csvreader:
            if len(row) < len(headers):
                # print "skipping row %s" % row
                continue

            entry = dict(zip(headers, row))
            if len(entry["name"]) == 0:
                # print "skipping empty name row %s" % entry
                continue

            res_organization = self.create_named_entity(
                self.get_namespace(DataIswc.PREFIX_ORG), entry["name"])

            # object properties
            self.create_triple_complex(res_organization, ["homepage", "logo"], entry)

            # role
            self.create_role_to_event(
                entry["role_event"], entry["role_type"], entry["role_label"],
                res_organization)
def load_metadata(self):
    filename_manual = "{0}/data/entity/organisation.csv".format(self.global_config["home"])
    if os.path.exists(filename_manual):
        with open(filename_manual) as f:
            csvreader = UnicodeReader(f)
            headers = csvreader.next()
            for row in csvreader:
                entry = dict(zip(headers, row))
                self.map_name_name[entry["altLabel"]] = {
                    "prefLabel": entry["title"],
                    "dbpediaUri": entry["uri"],
                }
        print "{0} name mappings loaded".format(len(self.map_name_name))
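# Illustrative sketch (not part of the original scripts): load_metadata() above
# expects data/entity/organisation.csv to carry at least the columns
# altLabel, title and uri; the sample row below is made up.
#
#   altLabel,title,uri
#   "Example Org","Example Organisation","http://dbpedia.org/resource/Example"
#
# A lookup against the loaded mapping would then look roughly like this:
#
#   mapping = self.map_name_name.get(raw_name)
#   if mapping is not None:
#       pref_label = mapping["prefLabel"]      # canonical organisation name
#       dbpedia_uri = mapping["dbpediaUri"]    # linked DBpedia resource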
def process_person(self):
    filename = "{0}/data/manual/{1}-person.csv".format(
        self.global_config["home"], self.local_config["tag"])
    with open(filename) as f:
        csvreader = UnicodeReader(f)
        headers = csvreader.next()
        for row in csvreader:
            if len(row) != len(headers):
                # print "skipping mismatch row %s" % row
                continue

            entry = dict(zip(headers, row))
            if len(entry["name"]) == 0:
                # print "skipping empty name row %s" % entry
                continue

            res_person = self.create_named_entity(
                self.get_namespace(DataIswc.PREFIX_PERSON), entry["name"])

            # object properties
            self.create_triple_complex(res_person, ["homepage"], entry)

            # role
            self.create_role_to_event(
                entry["role_event"], entry["role_type"], entry["role_label"], res_person)

            # organization
            if "organization" in entry:
                for org in entry["organization"].split(";"):
                    if len(org) == 0:
                        continue
                    res_organization = self.create_named_entity(
                        self.get_namespace(DataIswc.PREFIX_ORG), org)
                    self.graph.add((res_organization, FOAF.member, res_person))
                    # inverse property
                    self.graph.add((res_person, SWRC.affiliation, res_organization))

            # alt-name
            self.create_triple_complex(res_person, ["alt-name"], entry)

            # email
            if len(entry["email"]) > 0:
                if not entry["email"].startswith("mailto:"):
                    mbox = "mailto:%s" % entry["email"]
                else:
                    mbox = entry["email"]
                mbox_sha1sum = hashlib.sha1(mbox).hexdigest()
                # self.graph.add((res_person, FOAF.mbox, URIRef(mbox)))
                self.graph.add((res_person, FOAF.mbox_sha1sum, Literal(mbox_sha1sum)))
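# Note (illustrative, address made up): FOAF's mbox_sha1sum is the SHA1 hex
# digest of the complete "mailto:" URI, so the email handling above effectively
# computes
#
#   hashlib.sha1("mailto:alice@example.org").hexdigest()
#
# Publishing only the hash (the FOAF.mbox statement is commented out) keeps the
# raw address out of the generated RDF while still allowing identity smushing
# on the hashed mailbox.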
"swws-2001", "iswc-2002", "iswc-2003", "iswc-2004", "iswc-2005", "iswc-2013", ] for tag in list_tag: # process organization table filename = "{0}/data/manual/{1}-organization.csv".format( global_config["home"], tag) with open(filename) as f: csvreader = UnicodeReader(f) headers = csvreader.next() for row in csvreader: if len(row)<len(headers): #print "skipping row %s" % row continue entry = dict(zip(headers, row)) org = entry["name"] if len(org)>0: #print "skipping empty name row %s" % entry continue print u"processing [{0}] in [{1}] ".format(org, tag)
params["filename_manual_csv"] = "{0}/data/manual/{1}.csv".format(global_config['home'], filename_query) params["filename_temp_csv"] = "{0}/local/output/{1}.csv".format(global_config['home'], filename_query) with open(params["filename_query"]) as f: query = f.read() print query command= "curl -H \"Accept: application/sparql-results+json\" \"http://data.semanticweb.org/sparql?query={0}\" > {1}".format(urllib.quote(query), params["filename_result"]) print command #call(command, shell=True) #load manual mapping #name,uri mem_name_uri_mapping ={} if os.path.exists(params["filename_manual_csv"]): with open(params["filename_manual_csv"]) as f: csvreader = UnicodeReader(f) csvreader.next() for row in csvreader: if len(row)<2: continue name = row[0] uri = row[1] mem_name_uri_mapping[name]= uri #write temp csv with open(params["filename_result"]) as f: json_data = json.load(f) counter_name_uri = MyCounterKeyValue() counter_uri_name = MyCounterKeyValue()
def process_proceedings(self):
    # filename = "{0}/data/manual/full_iswc_proceedings.csv".format(
    filename = "{0}/data/manual/iswc-publication-proceedings.csv".format(
        self.global_config["home"])
    counter_paper = MyCounter()
    with open(filename) as f:
        csvreader = UnicodeReader(f)
        headers = csvreader.next()
        for row in csvreader:
            if len(row) != len(headers):
                print "skipping mismatch row %s" % row
                continue

            entry = dict(zip(headers, row))
            if entry["year"] != self.local_config["year"]:
                # skip mismatched year
                continue
            if len(entry["title"]) == 0:
                print "skipping empty title row %s" % entry
                continue
            if len(entry["proceedings_uri"]) == 0:
                print "skipping empty proceedings_uri row %s" % entry
                continue

            uri_proceedings = self.expand_uri(entry["proceedings_uri"])
            uri_proceedings_editor_list = "%s/editor_list" % (uri_proceedings)
            uri_event = self.expand_uri(entry["event_uri"])
            # print json.dumps(entry, indent=4)
            # print uri_proceedings

            res_proceedings = URIRef(uri_proceedings)
            res_event = URIRef(uri_event)
            self.graph.add((res_proceedings, RDF.type, SWRC.Proceedings))

            # relation to event
            self.graph.add((res_proceedings, SWC.relatedToEvent, res_event))
            self.graph.add((res_event, SWRC.hasRelatedDocument, res_proceedings))

            # editor
            if len(entry["editor"]) > 0:
                self.graph.add((res_proceedings, SWRC.listEditor, Literal(entry["editor"])))
                list_res_editor = []
                for editor in entry["editor"].split(","):
                    res_editor = self.create_named_entity(
                        self.get_namespace(DataIswc.PREFIX_PERSON), editor)
                    list_res_editor.append(res_editor)
                    self.graph.add((res_proceedings, SWRC.editor, res_editor))
                    self.graph.add((res_proceedings, FOAF.maker, res_editor))
                    self.graph.add((res_editor, FOAF.made, res_proceedings))
                res_proceedings_editor_list = self.create_container(
                    list_res_editor, RDF.Seq, uri_proceedings_editor_list)
                self.graph.add((res_proceedings, SWC.editorList, res_proceedings_editor_list))

            # simple properties
            self.create_triple_complex(
                res_proceedings,
                [
                    "title",
                    "subtitle",
                    "abstract",
                    "keywords",
                    "year",
                    "pages",
                    "publisher",
                    "series",
                    "volume",
                    "link_open_access",
                    "link_publisher",
                    "depiction",
                ],
                entry,
            )
def process_paper(self):
    # filename = "{0}/data/manual/full_iswc_paper_pdf.csv".format(
    filename = "{0}/data/manual/iswc-publication-paper.csv".format(
        self.global_config["home"])
    counter_paper = MyCounter()
    with open(filename) as f:
        csvreader = UnicodeReader(f)
        headers = csvreader.next()
        for row in csvreader:
            if len(row) != len(headers):
                # print "skipping mismatch row %s" % row
                continue

            entry = dict(zip(headers, row))
            if entry["year"] != self.local_config["year"]:
                # skip mismatched year
                continue
            if len(entry["title"]) == 0:
                print "skipping empty title row %s" % entry
                continue
            if len(entry["proceedings_uri"]) == 0:
                print "skipping empty proceedings row %s" % entry
                continue

            counter_paper.inc(entry["proceedings_uri"])
            id_paper = counter_paper.data[entry["proceedings_uri"]]
            uri_paper = "%s/paper-%02d" % (entry["proceedings_uri"], id_paper)
            uri_paper_author_list = "%s/paper-%02d/author_list" % (
                entry["proceedings_uri"], id_paper)
            # print json.dumps(entry, indent=4)
            # print uri_paper

            res_proceedings = URIRef(entry["proceedings_uri"])
            res_paper = URIRef(uri_paper)
            self.graph.add((res_paper, RDF.type, SWRC.InProceedings))

            # part-of proceedings
            self.graph.add((res_paper, SWC.isPartOf, res_proceedings))
            self.graph.add((res_proceedings, SWC.hasPart, res_paper))

            # author
            self.graph.add((res_paper, SWRC.listAuthor, Literal(entry["author"])))
            list_res_author = []
            for author in entry["author"].split(","):
                res_author = self.create_named_entity(
                    self.get_namespace(DataIswc.PREFIX_PERSON), author)
                self.graph.add((res_author, RDF.type, FOAF.Person))
                list_res_author.append(res_author)
                self.graph.add((res_paper, SWRC.author, res_author))
                self.graph.add((res_paper, FOAF.maker, res_author))
                self.graph.add((res_author, FOAF.made, res_paper))
            res_paper_author_list = self.create_container(
                list_res_author, RDF.Seq, uri_paper_author_list)
            self.graph.add((res_paper, BIBO.authorList, res_paper_author_list))

            # simple properties
            self.create_triple_complex(
                res_paper,
                [
                    "abstract",
                    "keywords",
                    "year",
                    "pages",
                    "title",
                    "category",
                    "link_open_access",
                    "link_publisher",
                ],
                entry,
            )

            # cache
            self.map_name_res[entry["title"]] = res_paper
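# Illustrative note (URIs made up): papers are numbered per proceedings volume
# via MyCounter, so the first two rows sharing
# proceedings_uri = http://example.org/proceedings/2013 would be minted as
#
#   http://example.org/proceedings/2013/paper-01
#   http://example.org/proceedings/2013/paper-02
#
# with each paper's author container published alongside it at .../author_list.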
def process_event(self):
    filename = "{0}/data/manual/{1}-event.csv".format(
        self.global_config["home"], self.local_config["tag"])
    counter_event = MyCounter()
    with open(filename) as f:
        csvreader = UnicodeReader(f)
        headers = csvreader.next()
        for row in csvreader:
            if len(row) != len(headers):
                # print "skipping mismatch row %s" % row
                continue

            entry = dict(zip(headers, row))
            if len(entry["label"]) == 0:
                # print "skipping empty label row %s" % entry
                continue
            if len(entry["event_type"]) == 0:
                # print "skipping empty event_type row %s" % entry
                continue
            if entry["event_uri"].startswith("#"):
                # print "skipping commented row %s" % entry
                continue

            # set default super event
            if len(entry["super_event_uri"]) == 0:
                entry["super_event_uri"] = "[ME]"
            uri_super_event = self.expand_uri(entry["super_event_uri"])
            res_super_event = URIRef(uri_super_event)

            if len(entry["event_uri"]) == 0:
                counter_event.inc(uri_super_event)
                entry["event_uri"] = "%s/event-%02d" % (
                    uri_super_event, counter_event.data[uri_super_event])
            uri_event = self.expand_uri(entry["event_uri"])
            res_event = URIRef(uri_event)

            # event type
            self.graph.add((res_event, RDF.type, SWC[entry["event_type"]]))

            # super event
            self.graph.add((res_event, SWC.isSubEventOf, res_super_event))
            self.graph.add((res_super_event, SWC.isSuperEventOf, res_event))

            # simple properties
            self.create_triple_complex(
                res_event,
                [
                    "label",
                    "acronym",
                    "abstract",
                    "order_in_super_event",
                    "start",
                    "end",
                    "tzid",
                    "room",
                    "address",
                    "homepage",
                    "link_document",
                    "logo",
                ],
                entry,
            )

            # link a talk event to its paper
            if "TalkEvent" == entry["event_type"]:
                if entry["label"] in self.map_name_res:
                    res_paper = self.map_name_res[entry["label"]]
                    self.graph.add((res_event, SWC.hasRelatedDocument, res_paper))
                    self.graph.add((res_paper, SWC.relatedToEvent, res_event))
                else:
                    print "missing paper link " + entry["label"]
                    sys.exit(0)

            # roles - chair and presenter
            for role in ["Chair", "Presenter"]:
                role_lower = role.lower()
                if len(entry[role_lower + "_person"]) > 0:
                    for name in entry[role_lower + "_person"].split(","):
                        if len(name) == 0:
                            continue
                        res_person = self.create_named_entity(
                            self.get_namespace(DataIswc.PREFIX_PERSON), name)
                        self.create_role_to_event(
                            uri_event, "swc:" + role,
                            entry[role_lower + "_label"], res_person)
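# Illustrative note (column names inferred from the code above): each event row
# may carry chair_person / chair_label and presenter_person / presenter_label
# columns. Every comma-separated name in the *_person columns is turned into a
# person entity and attached to the event with a swc:Chair or swc:Presenter
# role via create_role_to_event().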