def _process_event(self, record):
    '''
    Process one row of the events table: extract the event id, name and
    last modification date, then (re)generate the event's RDF data when
    the copy held by the triple store is missing or older than the source.

    @param record: a dict with at least 'Event' (an HTML anchor string)
    and 'Date' (a "%d %b %Y" formatted date)
    '''
    # Strip the anchor markup to get the plain event name, and pull the
    # event id out of the href (dropping the surrounding '/' characters)
    event_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', record['Event'],
                        flags=re.IGNORECASE).replace('\n', '')
    event_id = re.search('href="([^"]*)"', record['Event']).group(1)[1:-1]
    event_last_update_date = datetime.strptime(record['Date'], "%d %b %Y")
    event = Event(event_id)

    # Get the last modification date known to the end point
    server_version_date = self.triple_store.get_last_version_date(event)

    # Guard clause: the stored version is present and up to date
    if server_version_date is not None and \
            (event_last_update_date - server_version_date).days <= 0:
        print('\t[OK] %s - %s' % (event_id, event_name))
        return

    # The event is not in the triple store or needs to be updated
    print('\t[UPD] %s - %s' % (event_id, event_name))

    # Add the topics not already existing
    for t in event.get_topics():
        topic = Topic(t)
        if self.triple_store.get_last_version_date(topic) is None:
            try:
                print('\t\t[UPD-TOPIC] %s' % t)
                topic.load_data()
                self.triple_store.save_rdf_data(topic)
            except Exception:
                # Best effort: it's ok if we miss one
                pass

    # Update the data about all the persons concerned
    for p in event.get_persons():
        try:
            print('\t\t[UPD-PERSON] %s' % p)
            person = Person(p)
            person.load_data()
            self.triple_store.save_rdf_data(person)
        except Exception:
            # Best effort: it's ok if we miss one
            pass

    # Save the RDF data of the event
    event.load_data()
    self.triple_store.save_rdf_data(event)

    # Save the CFP text; the context manager guarantees the file is
    # closed even if the write fails (the original leaked the handle)
    cfp_path = self.data_directory + '/' + event.get_resource_name() + '_cfp.txt'
    with open(cfp_path, 'w') as cfp_file:
        cfp_file.write(event.get_cfp_data())
def _process_data(self, document):
    '''
    Creates the RDF graph describing the person
    (the original docstring wrongly said "topic")

    @param document: the DOM document of the person page
    '''
    # Create the graph and register the namespace prefixes used on output
    graph = ConjunctiveGraph()
    graph.bind('swc', SWC)
    graph.bind('cfp', CFP)
    graph.bind('ical', ICAL)
    graph.bind('foaf', FOAF)
    graph.bind('dct', DCT)
    graph.bind('lode', LODE)

    # Init the person resource
    person_resource = LDES[self.get_resource_name()]
    graph.add((person_resource, RDF.type, FOAF['Person']))

    # Get the name
    name = document.find(id='inner_left').find('h1').text
    graph.add((person_resource, FOAF['name'], Literal(name)))

    # Get the centers of interest ('/t/...' links) and the known persons
    # ('/p/...' links, excluding the person itself)
    for anchor in document.find(id='inner_left').find('p').findAll('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if href[:3] == '/t/':
            try:
                graph.add((person_resource, FOAF['topic_interest'],
                           LDES[Topic(href[1:-1]).get_resource_name()]))
            except Exception:
                # Best effort: skip malformed topic links
                pass
        if href[:3] == '/p/' and href[1:-1] != self.entity_id:
            try:
                graph.add((person_resource, FOAF['knows'],
                           LDES[Person(href[1:-1]).get_resource_name()]))
            except Exception:
                # Best effort: skip malformed person links
                pass

    # Set the last modification date on the named graph
    graph.add((self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

    # Save the data
    self.rdf_data = graph.serialize()
def _process_record(self, record, type):
    '''
    Process a specific record of the table, regenerating its RDF file
    (and CFP text for events) when the stored copy is missing or older
    than the record's date.

    @param record: a dict with a 'Date' column plus a column named after
    the record type containing an HTML anchor
    @param type: either 'Topic' or 'Event'
    '''
    # Compose a name and an ID for the record
    entity_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>',
                         record[type]).replace('\n', '')
    entity_url = re.search('href="([^"]*)"', record[type]).group(1)
    entity_id = entity_url.replace('/', '')
    if type == 'Topic':
        # Expand only the leading 't' marker; a global replace would
        # corrupt any id containing another 't'
        entity_id = entity_id.replace('t', 'topic_', 1)
        entity = Topic(entity_name, entity_id)
    elif type == 'Event':
        entity_id = entity_id.replace('e', 'event_', 1)
        entity = Event(entity_name, entity_id)
    else:
        # Fail loudly instead of hitting an unbound 'entity' NameError
        raise ValueError('Unsupported record type: %r' % type)
    rdf_file = '%s.rdf' % entity_id
    named_graph_resource = entity.named_graph()

    # Open the previously generated RDF file, if any, and read the last
    # modification date recorded in it
    last_modification = None
    if os.path.isfile('data/' + rdf_file):
        g = Graph()
        g.parse('data/' + rdf_file, format='xml')
        for _, _, date in g.triples((named_graph_resource, DCT['modified'], None)):
            last_modification = isodate.parse_datetime(date)

    # Decide whether the description needs to be (re)generated
    generate = False
    if last_modification is None:
        # Not existent yet: generate
        print('\t[GEN] %s - %s' % (entity_id, entity_name))
        generate = True
    else:
        delta = datetime.strptime(record['Date'], "%d %b %Y") - last_modification
        if delta.days > 0:
            # Stored copy is older than the source record: update
            print('\t[UPD] %s - %s' % (entity_id, entity_name))
            generate = True
    if not generate:
        print('\t[OK] %s - %s' % (entity_id, entity_name))
        return

    # Process a topic (generation not implemented yet)
    if type == 'Topic':
        print(record)

    # Process an event
    if type == 'Event':
        entity.parse(record, entity_url)
        # Save the CFP text; 'with' closes the file even on write errors
        with open('data/' + entity_id + '_cfp.txt', 'w') as f:
            f.write(entity.get_cfp_data())

    # Save the RDF data
    with open('data/' + rdf_file, 'w') as f:
        f.write(entity.get_rdf_data())
def _process_record(self, record, type):
    '''
    Process a specific record of the table, regenerating its RDF file
    (and CFP text for events) when the stored copy is missing or older
    than the record's date.

    @param record: a dict with a 'Date' column plus a column named after
    the record type containing an HTML anchor
    @param type: either 'Topic' or 'Event'
    '''
    # Compose a name and an ID for the record
    entity_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>',
                         record[type]).replace('\n', '')
    entity_url = re.search('href="([^"]*)"', record[type]).group(1)
    entity_id = entity_url.replace('/', '')
    if type == 'Topic':
        # Expand only the leading 't' marker; a global replace would
        # corrupt any id containing another 't'
        entity_id = entity_id.replace('t', 'topic_', 1)
        entity = Topic(entity_name, entity_id)
    elif type == 'Event':
        entity_id = entity_id.replace('e', 'event_', 1)
        entity = Event(entity_name, entity_id)
    else:
        # Fail loudly instead of hitting an unbound 'entity' NameError
        raise ValueError('Unsupported record type: %r' % type)
    rdf_file = '%s.rdf' % entity_id
    named_graph_resource = entity.named_graph()

    # Open the previously generated RDF file, if any, and read the last
    # modification date recorded in it
    last_modification = None
    if os.path.isfile('data/' + rdf_file):
        g = Graph()
        g.parse('data/' + rdf_file, format='xml')
        for _, _, date in g.triples((named_graph_resource, DCT['modified'], None)):
            last_modification = isodate.parse_datetime(date)

    # Decide whether the description needs to be (re)generated
    generate = False
    if last_modification is None:
        # Not existent yet: generate
        print('\t[GEN] %s - %s' % (entity_id, entity_name))
        generate = True
    else:
        delta = datetime.strptime(record['Date'], "%d %b %Y") - last_modification
        if delta.days > 0:
            # Stored copy is older than the source record: update
            print('\t[UPD] %s - %s' % (entity_id, entity_name))
            generate = True
    if not generate:
        print('\t[OK] %s - %s' % (entity_id, entity_name))
        return

    # Process a topic (generation not implemented yet)
    if type == 'Topic':
        print(record)

    # Process an event
    if type == 'Event':
        entity.parse(record, entity_url)
        # Save the CFP text; 'with' closes the file even on write errors
        with open('data/' + entity_id + '_cfp.txt', 'w') as f:
            f.write(entity.get_cfp_data())

    # Save the RDF data
    with open('data/' + rdf_file, 'w') as f:
        f.write(entity.get_rdf_data())

    # TODO process the list of cited topics, events and organizations
    #self.zip_container.writestr(rdf_file, entity.get_rdf_data())
def _process_data(self, document):
    '''
    Creates the RDF graph describing the event

    @param document: the DOM document of the event
    '''
    # Create the graph and register the namespace prefixes used on output
    graph = ConjunctiveGraph()
    graph.bind('swc', SWC)
    graph.bind('cfp', CFP)
    graph.bind('ical', ICAL)
    graph.bind('foaf', FOAF)
    graph.bind('dct', DCT)
    graph.bind('lode', LODE)

    # Init the event
    resource_event = LDES[self.get_resource_name()]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

    # Get the title
    if document.find(id='inner_left') is not None:
        title = document.find(id='inner_left').find('h1').text
        graph.add((resource_event, RDFS.label, Literal(title)))

    # Get the location
    if document.find(text='City:') is not None and \
            document.find(text='Country:') is not None:
        city = document.find(text='City:').findParent() \
                       .findNextSibling().renderContents()
        country = document.find(text='Country:').findParent() \
                          .findNextSibling().renderContents()
        location = get_location(city, country)
        if location is None:
            # Geocoding failed: fall back to a plain "city, country" literal
            location = Literal("%s, %s" % (city, country))
        graph.add((resource_event, FOAF['based_near'], location))

    # Get the starting and ending dates, e.g. "June 1-3, 2012"
    if document.find(text='Period:') is not None:
        text = document.find(text='Period:').findParent() \
                       .findNextSibling().renderContents()
        parts = re.search(r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})',
                          text).groupdict()
        if parts['begin'] is not None and parts['year'] is not None:
            (month, start_day) = parts['begin'].split(' ')
            begin_date = datetime.strptime(
                "%s %s %s" % (start_day, month, parts['year']), "%d %B %Y")
            graph.add((resource_event, ICAL['dtstart'], Literal(begin_date)))
            if parts['end'] is not None:
                end_parts = parts['end'].split(' ')
                end_date = None
                if len(end_parts) == 2:
                    # "Month day" form: the event ends in a different month
                    end_date = datetime.strptime(
                        "%s %s %s" % (end_parts[1], end_parts[0], parts['year']),
                        "%d %B %Y")
                elif len(end_parts) == 1:
                    # Day only: reuse the starting month
                    end_date = datetime.strptime(
                        "%s %s %s" % (end_parts[0], month, parts['year']),
                        "%d %B %Y")
                if end_date is not None:
                    graph.add((resource_event, ICAL['dtend'], Literal(end_date)))

    # Get the data for the CFP
    resource_cfp = LDES[self.get_resource_name() + "_cfp"]
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], LDES[self.entity_id]))
    graph.add((resource_cfp, CFP['details'],
               URIRef(BASE + 'data/' + self.get_resource_name() + '_cfp.txt')))

    # Get the deadlines, embedded as a JS object literal in a <script> tag
    deadlines = []
    for script in document.findAll('script'):
        res = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if res is not None:
            txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
            txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt,
                         flags=re.IGNORECASE)
            # Quote the keys so the JS literal becomes parseable JSON
            txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
            deadlines = json.loads(txt)
    for i, deadline in enumerate(deadlines):
        resource_deadline = LDES[self.get_resource_name() + '_deadline_' + str(i)]
        deadline_date = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'], deadline_date))
        graph.add((resource_deadline, ICAL['dtend'], deadline_date))
        graph.add((resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    # Add the topics and persons linked from the CFP content
    if document.find(id='cfp-content') is not None:
        for anchor in document.find(id='cfp-content').findAll('a'):
            href = anchor.get('href')
            if href is None:
                continue
            # Compare the trimmed id against the already-seen sets: the
            # original tested the full href ('/t/...') against a set of
            # trimmed ids ('t/...'), so duplicates were never detected
            if href[:3] == '/t/' and href[1:-1] not in self.topics_set:
                try:
                    graph.add((resource_event, DCT['subject'],
                               LDES[Topic(href[1:-1]).get_resource_name()]))
                    self.topics_set.add(href[1:-1])
                except Exception:
                    # Ignore bad topic links
                    pass
            if href[:3] == '/p/' and href[1:-1] not in self.persons_set:
                try:
                    graph.add((resource_event, LODE['involvedAgent'],
                               LDES[Person(href[1:-1]).get_resource_name()]))
                    self.persons_set.add(href[1:-1])
                except Exception:
                    # Ignore bad person links
                    pass

    # Set the last modification date on the named graph
    graph.add((self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

    # Save the data
    self.rdf_data = graph.serialize()