Exemplo n.º 1
0
def event2voorstelling(event):
    """
        Build an internal :class:`dedoelen.core.models.Voorstelling` from an
        :class:`icalendar.Event`.

        The summary, start, end, location and sequence of the event are copied
        onto the new instance. The serialized description is split in two: its
        last line becomes the link, everything before it becomes the
        description (with the ``\\n``, ``\\,`` and ``\\;`` escape sequences
        replaced by the plain characters).

        :param event: event in the calendar
        :type event: :class:`icalendar.Event`
        :returns: event in De Doelen
        :rtype: :class:`dedoelen.core.models.Voorstelling`
    """
    result = Voorstelling()
    result.title = str(event['summary'])
    result.tstart = event['dtstart'].dt
    result.tend = event['dtend'].dt
    result.room = str(event['location'])

    # NOTE(review): the string operations below assume to_ical() returns a
    # text string -- confirm against the icalendar version in use.
    raw = event['description'].to_ical()

    # The link is the last line, whether lines are separated by real newlines
    # or by the escaped two-character sequence backslash + 'n'.
    last_real_line = raw.split('\n')[-1]
    result.link = last_real_line.split('\\n')[-1]

    # Everything before the last line is the description; escaped newlines
    # take precedence as separator when they are present.
    separator = '\\n' if '\\n' in raw else '\n'
    text = '\n'.join(raw.split(separator)[:-1])
    for escaped, plain in (('\\n', '\n'), ('\\,', ','), ('\\;', ';')):
        text = text.replace(escaped, plain)
    result.description = text

    result.sequence = int(event['sequence'])
    return result
Exemplo n.º 2
0
def html2voorstelling(page):
    """
        Convert a scraped html page to a
        :class:`dedoelen.core.models.Voorstelling` instance.

        Pages that can not be parsed properly are ignored, and None is
        returned. This is the case when the page contains 'Concert niet
        gevonden', when no valid date can be read, or when no valid start time
        can be read. Text entries are converted to ascii, and special
        characters are ignored.

        When the page has no end time, the event is assumed to last three
        hours. An end time earlier than the start time is taken to be on the
        next day.

        :param page: tuple of the page url and the page html
        :type page: tuple
        :returns: representation of the event, or None if the page is ignored
        :rtype: :class:`dedoelen.core.models.Voorstelling`
    """
    link, html = page

    # Cheap string check first, so "not found" pages are never parsed.
    if 'Concert niet gevonden' in html:
        logger.warning('Concert niet gevonden: %s', link)
        return None

    soup = BeautifulSoup(html, "html.parser")

    v = Voorstelling()
    v.link = str(link)

    # get title
    title_tags = soup.find_all(attrs={'class': 'title'})
    try:
        v.title = title_tags[0].find_all(attrs={'class': 'main'})[0].text
        v.title = str(v.title)
    except IndexError:
        logger.error('IndexError for link: %s', link)
    except UnicodeEncodeError:
        # v.title still holds the raw text here; strip what ascii can't hold.
        v.title = unicodedata.normalize('NFKD', v.title).encode('ascii',
                'ignore')

    # get date. Without a usable date no start time can be built, so the page
    # is ignored instead of crashing further down.
    try:
        date_text = soup.find('dt', text="Datum").parent.findNext(
                "dd").contents[0]
    except (AttributeError, IndexError):
        logger.error("Geen datum gevonden: %s", link)
        return None
    try:
        # NOTE(review): %A and %B are locale dependent; presumably the process
        # runs with a Dutch locale -- confirm.
        event_date = datetime.strptime(date_text, "%A %d %B %Y").date()
    except (TypeError, ValueError):
        logger.error("Ongeldige datum (%s) voor voorstelling %s", date_text,
                v.title)
        return None

    # get start time
    start_t = None
    start_tag = soup.find('dt', text='Aanvang')
    if start_tag is not None:
        start_time = start_tag.findNextSiblings("dd")[0].contents[0]
        try:
            start_t = datetime.strptime(start_time, "%H.%M uur").time()
            v.tstart = settings.AMSTERDAM.localize(
                    datetime.combine(event_date, start_t))
        except ValueError:
            logger.error("Ongeldige aanvangstijd (%s) voor voorstelling %s",
                    start_time, v.title)

    # Events without a start time are ignored. Returning here also guarantees
    # that start_t and v.tstart are usable in the end time handling below.
    if start_t is None or v.tstart is None:
        return None

    # get end tag. Correct for events that end after 0:00.
    end_tag = soup.find('dt', text='Eind')
    if end_tag is not None:
        end_time = end_tag.findNextSiblings("dd")[0].contents[0]
        try:
            end_t = datetime.strptime(end_time, "%H.%M uur").time()
            end_date = event_date
            if end_t < start_t:
                end_date = event_date + timedelta(days=1)
            v.tend = settings.AMSTERDAM.localize(
                    datetime.combine(end_date, end_t))
        except ValueError:
            logger.error("Ongeldige eindtijd (%s) voor voorstelling %s",
                    end_time, v.title)
    else:
        # No end time on the page: assume the event lasts three hours.
        v.tend = v.tstart + timedelta(hours=3)

    room_tag = soup.find("dt", text="Zaal")
    if room_tag is not None:
        v.room = room_tag.findNextSiblings("dd")[0].contents[0]
        v.room = unicodedata.normalize('NFKD', v.room).encode('ascii', 'ignore')

    # programme, performers, description and related events are each converted
    # to markdown using html2text. They are then combined and converted to
    # ascii.
    prog_tag = soup.find(id="programme")
    perf_tag = soup.find(id="performers")
    desc_tag = soup.find(id="description")
    other_tag = soup.find(id="crossVoorstellingen")
    if prog_tag is not None:
        prog = html2text(prog_tag.encode("ascii"))
    else:
        logger.info("No programme found at link: %s", link)
        prog = ""
    if perf_tag is not None:
        perf = html2text(perf_tag.encode("ascii"))
    else:
        logger.info("No performers found at link: %s", link)
        perf = ""
    if desc_tag is not None:
        # Literal backslash-n sequences in the markup become real newlines.
        desc = html2text(desc_tag.encode("ascii").replace('\\n', '\n'))
    else:
        logger.info("No description found at link: %s", link)
        desc = ""
    if other_tag is not None:
        other = html2text(other_tag.encode("ascii"))
    else:
        other = ""

    v.description = '\n'.join([prog, perf, desc, other])
    v.description = unicodedata.normalize('NFKD', v.description).encode('ascii',
            'ignore')

    return v