Example #1
    def _process_event(self, record):
        # Get the event id, its name and its last modification date
        event_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', record['Event'],
                            flags=re.IGNORECASE).replace('\n', '')
        event_id = re.search('href="([^"]*)"', record['Event']).group(1)[1:-1]
        event_last_update_date = datetime.strptime(record['Date'], "%d %b %Y")
        event = Event(event_id)

        # Get the last modification date from the end point
        server_version_date = self.triple_store.get_last_version_date(event)

        # Check the status of the event in the triple store
        if server_version_date is None or (event_last_update_date - server_version_date).days > 0:
            # The event is not in the triple store or needs to be updated
            print '\t[UPD] %s - %s' % (event_id, event_name)

            # Add the topics not already existing
            for t in event.get_topics():
                topic = Topic(t)
                if self.triple_store.get_last_version_date(topic) is None:
                    try:
                        print '\t\t[UPD-TOPIC] %s' % t
                        topic.load_data()
                        self.triple_store.save_rdf_data(topic)
                    except Exception:
                        # It's ok if we miss one
                        pass

            # Update the data about all the persons concerned
            for p in event.get_persons():
                try:
                    print '\t\t[UPD-PERSON] %s' % p
                    person = Person(p)
                    person.load_data()
                    self.triple_store.save_rdf_data(person)
                except Exception:
                    # It's ok if we miss one
                    pass

            # Save the RDF data of the event
            event.load_data()
            self.triple_store.save_rdf_data(event)

            # Save the CFP from the call
            with open(self.data_directory + '/' + event.get_resource_name() + '_cfp.txt', 'w') as cfp_file:
                cfp_file.write(event.get_cfp_data())

        else:
            # The server version is up to date
            print '\t[OK] %s - %s' % (event_id, event_name)
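To see the record parsing in isolation, here is a minimal, self-contained sketch of the name/id extraction and date parsing used at the top of _process_event above. The record value is invented for illustration; only re and datetime from the standard library are needed.

import re
from datetime import datetime

# Invented record, shaped like the 'Event' and 'Date' cells handled above
record = {'Event': '<a href="/e/12345/">Some Workshop 2012</a>', 'Date': '14 Mar 2012'}

# Strip the anchor tag to get the plain name, and pull the id out of the href
event_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', record['Event'], flags=re.IGNORECASE)
event_id = re.search('href="([^"]*)"', record['Event']).group(1)[1:-1]
event_last_update_date = datetime.strptime(record['Date'], "%d %b %Y")

print('%s -> %s (source last updated %s)' % (event_id, event_name, event_last_update_date.date()))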
Example #2
    def _process_data(self, document):
        '''
        Creates the RDF graph describing the person
        @param document: the DOM document of the person
        '''
        # Create the graph
        graph = ConjunctiveGraph()
        graph.bind('swc', SWC)
        graph.bind('cfp', CFP)
        graph.bind('ical', ICAL)
        graph.bind('foaf', FOAF)
        graph.bind('dct', DCT)
        graph.bind('lode', LODE)

        # Init the person
        person_resource = LDES[self.get_resource_name()]
        graph.add((person_resource, RDF.type, FOAF['Person']))

        # Get the name
        name = document.find(id='inner_left').find('h1').text
        graph.add((person_resource, FOAF['name'], Literal(name)))

        # Get the centers of interest and known persons
        for link in document.find(id='inner_left').find('p').findAll('a'):
            link = link.get('href')
            if link is not None and link[:3] == '/t/':
                try:
                    graph.add((person_resource, FOAF['topic_interest'],
                               LDES[Topic(link[1:-1]).get_resource_name()]))
                except Exception:
                    # Ignore malformed topic links
                    pass
            if link is not None and link[:3] == '/p/' and link[1:-1] != self.entity_id:
                try:
                    graph.add((person_resource, FOAF['knows'],
                               LDES[Person(link[1:-1]).get_resource_name()]))
                except Exception:
                    # Ignore malformed person links
                    pass

        # Set the last modification date
        graph.add(
            (self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

        # Save the data
        self.rdf_data = graph.serialize()
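The graph construction above is a standard rdflib pattern: bind prefixes, mint a resource from a namespace, add triples, serialize. A reduced sketch follows; the LDES base URI here is a placeholder for this sketch, while FOAF uses its well-known namespace.

from rdflib import ConjunctiveGraph, Literal, Namespace
from rdflib.namespace import RDF

FOAF = Namespace('http://xmlns.com/foaf/0.1/')
LDES = Namespace('http://example.org/ldes/')  # placeholder base URI for this sketch

graph = ConjunctiveGraph()
graph.bind('foaf', FOAF)

person_resource = LDES['person_42']  # hypothetical resource name
graph.add((person_resource, RDF.type, FOAF['Person']))
graph.add((person_resource, FOAF['name'], Literal('Ada Lovelace')))

print(graph.serialize(format='turtle'))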
Example #3
    def _process_record(self, record, type):
        '''
        Process a specific record
        '''
        # Compose a name and an ID for the record
        entity_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>',
                             record[type]).replace('\n', '')
        entity_url = re.search('href="([^"]*)"', record[type]).group(1)
        entity_id = entity_url.replace('/', '')
        if type == 'Topic':
            entity_id = entity_id.replace('t', 'topic_')
            entity = Topic(entity_name, entity_id)
        elif type == 'Event':
            entity_id = entity_id.replace('e', 'event_')
            entity = Event(entity_name, entity_id)
        rdf_file = '%s.rdf' % entity_id
        named_graph_resource = entity.named_graph()

        # Open the file in the container and get last modification date
        last_modification = None
        if os.path.isfile('data/' +
                          rdf_file):  # in self.zip_container.namelist():
            #data = self.zip_container.read(rdf_file)
            g = Graph()
            g.parse('data/' + rdf_file, format='xml')
            for _, _, date in g.triples(
                (named_graph_resource, DCT['modified'], None)):
                last_modification = isodate.parse_datetime(date)

        # If not existent or out dated, generate
        generate = False
        if last_modification is None:
            # If not existent, generate
            print '\t[GEN] %s - %s' % (entity_id, entity_name)
            generate = True
        else:
            delta = datetime.strptime(record['Date'],
                                      "%d %b %Y") - last_modification
            if delta.days > 0:
                # If updated, update
                print '\t[UPD] %s - %s' % (entity_id, entity_name)
                generate = True

        if not generate:
            print '\t[OK] %s - %s' % (entity_id, entity_name)
            return

        # Process a topic
        if type == 'Topic':
            print record

        # Process an event
        if type == 'Event':
            entity.parse(record, entity_url)
            # Save the CFP
            with open('data/' + entity_id + '_cfp.txt', 'w') as f:
                f.write(entity.get_cfp_data())

        # Save the RDF data
        with open('data/' + rdf_file, 'w') as f:
            f.write(entity.get_rdf_data())
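The freshness check in the middle of the method can be read as a small helper: parse the saved RDF/XML file and pull out the dct:modified literal. A sketch, assuming DCT is the Dublin Core terms namespace; the helper name get_last_modification is ours, not part of the original class.

import isodate
from rdflib import Graph, Namespace

DCT = Namespace('http://purl.org/dc/terms/')  # assumed value of the DCT constant

def get_last_modification(path, named_graph_resource):
    # Parse the saved RDF/XML file and return its dct:modified value, or None
    g = Graph()
    g.parse(path, format='xml')
    for _, _, date in g.triples((named_graph_resource, DCT['modified'], None)):
        return isodate.parse_datetime(str(date))
    return None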
Example #4
    def _process_record(self, record, type):
        '''
        Process a specific record
        '''
        # Compose a name and an ID for the record
        entity_name = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', record[type]).replace('\n', '')
        entity_url = re.search('href="([^"]*)"', record[type]).group(1)
        entity_id = entity_url.replace('/', '')
        if type == 'Topic':
            entity_id = entity_id.replace('t', 'topic_')
            entity = Topic(entity_name, entity_id)
        elif type == 'Event':
            entity_id = entity_id.replace('e', 'event_')
            entity = Event(entity_name, entity_id)
        rdf_file = '%s.rdf' % entity_id
        named_graph_resource = entity.named_graph()
        
        # Open the file in the container and get last modification date
        last_modification = None
        if os.path.isfile('data/' + rdf_file):# in self.zip_container.namelist():
            #data = self.zip_container.read(rdf_file)
            g = Graph()
            g.parse('data/' + rdf_file, format='xml')
            for _, _, date in g.triples((named_graph_resource, DCT['modified'], None)):
                last_modification = isodate.parse_datetime(date)
                
        # If not existent or out dated, generate
        generate = False
        if last_modification is None:
            # If not existent, generate
            print '\t[GEN] %s - %s' % (entity_id, entity_name)
            generate = True
        else:
            delta = datetime.strptime(record['Date'], "%d %b %Y") - last_modification
            if delta.days > 0:
                # If updated, update
                print '\t[UPD] %s - %s' % (entity_id, entity_name)
                generate = True
                        
        if not generate:
            print '\t[OK] %s - %s' % (entity_id, entity_name)
            return
        
        # Process a topic
        if type == 'Topic':
            print record
        
        # Process an event
        if type == 'Event':
            entity.parse(record, entity_url)
            # Save the CFP
            with open('data/' + entity_id + '_cfp.txt', 'w') as f:
                f.write(entity.get_cfp_data())

        # Save the RDF data
        with open('data/' + rdf_file, 'w') as f:
            f.write(entity.get_rdf_data())
        
        # TODO process the list of cited topics, events and organizations
        
        #self.zip_container.writestr(rdf_file, entity.get_rdf_data())
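This variant repeats the same generate/update decision as the previous example, so the rule can be summarised as a tiny predicate; needs_generation is a name of our own, not part of the original class.

from datetime import datetime

def needs_generation(record_date_str, last_modification):
    # record_date_str is the 'Date' cell, e.g. '14 Mar 2012'; last_modification is a datetime or None
    if last_modification is None:
        return True  # nothing stored yet, so generate
    delta = datetime.strptime(record_date_str, "%d %b %Y") - last_modification
    return delta.days > 0  # the source record is newer than the stored copy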
            
Example #5
    def _process_data(self, document):
        '''
        Creates the RDF graph describing the event
        @param document: the DOM document of the event
        '''
        # Create the graph
        graph = ConjunctiveGraph()
        graph.bind('swc', SWC)
        graph.bind('cfp', CFP)
        graph.bind('ical', ICAL)
        graph.bind('foaf', FOAF)
        graph.bind('dct', DCT)
        graph.bind('lode', LODE)

        # Init the event
        resource_event = LDES[self.get_resource_name()]
        graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

        # Get the title
        if document.find(id='inner_left') is not None:
            title = document.find(id='inner_left').find('h1').text
            graph.add((resource_event, RDFS.label, Literal(title)))

        # Get the location
        if document.find(text='City:') is not None and document.find(
                text='Country:') is not None:
            city = document.find(
                text='City:').findParent().findNextSibling().renderContents()
            country = document.find(text='Country:').findParent(
            ).findNextSibling().renderContents()
            location = get_location(city, country)
            if location is None:
                location = Literal("%s, %s" % (city, country))
            graph.add((resource_event, FOAF['based_near'], location))

        # Get the starting and ending dates
        if document.find(text='Period:') is not None:
            text = document.find(text='Period:').findParent().findNextSibling(
            ).renderContents()
            parts = re.search(
                r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})',
                text).groupdict()
            if parts['begin'] is not None and parts['year'] is not None:
                (month, start_day) = parts['begin'].split(' ')
                begin_date = datetime.strptime(
                    "%s %s %s" % (start_day, month, parts['year']), "%d %B %Y")
                graph.add((resource_event, ICAL['dtstart'], Literal(begin_date)))
                if parts['end'] is not None:
                    end_parts = parts['end'].split(' ')
                    end_date = None
                    if len(end_parts) == 2:
                        end_date = datetime.strptime(
                            "%s %s %s" % (end_parts[1], end_parts[0], parts['year']),
                            "%d %B %Y")
                    elif len(end_parts) == 1:
                        end_date = datetime.strptime(
                            "%s %s %s" % (end_parts[0], month, parts['year']),
                            "%d %B %Y")
                    if end_date is not None:
                        graph.add((resource_event, ICAL['dtend'], Literal(end_date)))

        # Get the data for the CFP
        resource_cfp = LDES[self.get_resource_name() + "_cfp"]
        graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
        graph.add((resource_cfp, CFP['for'], LDES[self.entity_id]))
        graph.add(
            (resource_cfp, CFP['details'],
             URIRef(BASE + 'data/' + self.get_resource_name() + '_cfp.txt')))

        # Get the deadlines
        deadlines = []
        for a in document.findAll('script'):
            res = re.search('var deadlineList = ([^;]*);', a.renderContents())
            if res is not None:
                txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
                txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt,
                             flags=re.IGNORECASE)
                txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
                deadlines = json.loads(txt)
        for i, deadline in enumerate(deadlines):
            resource_deadline = LDES[self.get_resource_name() + '_deadline_' + str(i)]
            graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
            graph.add((resource_deadline, ICAL['dtstart'],
                       Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
            graph.add((resource_deadline, ICAL['dtend'],
                       Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
            graph.add((resource_deadline, ICAL['summary'],
                       Literal(deadline['Title'])))
            graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

        # Add the topics and persons
        if document.find(id='cfp-content') is not None:
            for link in document.find(id='cfp-content').findAll('a'):
                link = link.get('href')
                if link is not None:
                    if link[:3] == '/t/' and link[1:-1] not in self.topics_set:
                        try:
                            graph.add(
                                (resource_event, DCT['subject'],
                                 LDES[Topic(link[1:-1]).get_resource_name()]))
                            self.topics_set.add(link[1:-1])
                        except Exception:
                            # Ignore bad topic links
                            pass
                    if link[:3] == '/p/' and link[1:-1] not in self.persons_set:
                        try:
                            graph.add(
                                (resource_event, LODE['involvedAgent'],
                                 LDES[Person(link[1:-1]).get_resource_name()]))
                            self.persons_set.add(link[1:-1])
                        except Exception:
                            # Ignore bad person links
                            pass

        # Set the last modification date
        graph.add(
            (self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

        # Save the data
        self.rdf_data = graph.serialize()
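The deadline scraping near the end is worth a standalone illustration: the page stores its deadlines as a JavaScript array literal, which the code converts into valid JSON before loading it. A minimal sketch with an invented script_text value.

import json
import re

# Invented script content, shaped like the embedded deadline list parsed above
script_text = "var deadlineList = [{Date: '01 Mar 2012', Title: 'Paper submission'}];"

deadlines = []
res = re.search('var deadlineList = ([^;]*);', script_text)
if res is not None:
    txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
    txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
    deadlines = json.loads(txt)

print(deadlines)  # prints the parsed list of deadline dicts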