Example #1
def events(table=dyn_upload.DEV_EVENTS_TABLE):
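    """Scrape theskint.com's ongoing-events page for free / pay-what-you-wish listings."""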
    ret = []
    response = requests.get('https://theskint.com/ongoing-events/')

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find(class_='entry-content')
    entries = content.find_all('p')
    for entry in entries:
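        # keep only entries whose text mentions "free "/"Free " or " pay-what-you-wish"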
        if re.search('[fF]ree | pay-what-you-wish', entry.text):
            if entry.b.text and len(entry.b.text) > 3:

                d = {
                    'name': entry.b.text,
                    'website': entry.a.get('href'),
                    'description': entry.text[:-3],
                    'source': 'The Skint',
                    'rsvp': 'rsvp' in entry.text
                }

                stripped = strip_for_location_description(entry.text)
                description = location_description(stripped)
                if description:
                    d['location_description'] = description
                    d['host'] = description

                if not dyn_upload.is_uploaded(d, table):
                    ret.append(d)

    return ret
Example #2
def events(table=dyn_upload.DEV_EVENTS_TABLE):
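    """Build Carnegie Hall Citywide events from a search-API response plus each event's detail page.

    `timeformat` is a helper defined elsewhere in the module.
    """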
    # `search_url` is a placeholder for the Carnegie Hall event-search endpoint,
    # which is not shown in this snippet; its JSON response carries the
    # 'results'/'hits' payload iterated below.
    response = requests.get(search_url)
    results = []
    for hit in response.json()['results'][0]['hits']:

        info = {
            'name': hit['title'].replace('<BR>', ' '),
            # TODO: datetime is just set as UTC..
            'dates': [str(datetime.strptime(hit['date'], '%A, %b %d, %Y').date())],
            'times': [str(timeformat(hit['time']))],
            'website': 'https://carnegiehall.org' + hit['url'],
            'photos': [hit['image']['src']],
            'host': 'Carnegie Hall',
            'source': 'Carnegie Hall Citywide',
            'location_description': hit['facility'],
            'rsvp': True # TODO: currently assuming the worst
        }

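        # fetch the event's own page to pull the facility notes and hero description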
        more_info_response = requests.get(info['website'])
        soup = BeautifulSoup(more_info_response.text, 'html.parser')

        location_text = soup.find_all(class_='ch-event-facilityNotes')[0].text
        description_text = soup.find_all(class_='ch-page-hero-block__content')[0].text

        location_text = ' '.join(location_text.strip().split('\n')[2:-1])
        description_text = description_text.strip()
        info['description'] = description_text
        if not dyn_upload.is_uploaded(info, table):
            results.append(info)
    return results
Example #3
def events(table=dyn_upload.DEV_EVENTS_TABLE):
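    """Scrape the UCB Theatre Hells Kitchen calendar for free performances.

    `url` and `base` are module-level constants defined outside this snippet.
    """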
    events = []

    cal = BeautifulSoup(requests.get(url).text, 'html.parser')
    for day in cal.find_all('td'):
        for cell in day.find_all('a', class_='calendar_link'):
            cost = cell.find('span', class_='pull-right').text
            if 'free' in cost.lower():
                info = {
                    'dates': [str(parse(
                        day.find('div', class_='day_title').div.text).date())],
                    'website': base + cell.get('href'),
                    'location_description': '555 W 42nd Street, New York, NY',
                    'rsvp': False,  # They only sell tickets for charged events, it seems
                    'source': base,
                    'name': cell.find(class_='performance_title').text,
                    'host': 'UCB Theatre Hells Kitchen',
                    'types': ['comedy']
                }
                timestr = cell.find('span', class_='pull-left').text
                if 'midnight' in timestr.lower():
                    info['times'] = ['23:59']
                else:
                    info['times'] = [str(parse(timestr).time())]

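                # pull the performance page to grab the cast/host blurb and a thumbnail, if present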
                site = BeautifulSoup(
                    requests.get(info['website']).text, 'html.parser')
                content = site.find('div', id='content_container').find(
                    'div', class_='clearfix')

                strs = list(content.stripped_strings)
                if 'Cast' in strs:
                    i = len(strs) - strs[::-1].index('Cast')
                    strs = strs[i:]
                elif 'Host' in strs:
                    i = len(strs) - strs[::-1].index('Host')
                    strs = strs[i:]
                else:
                    print('neither "cast" nor "host" in website')
                info['description'] = " ".join(strs)

                # info['description'] = max(list(content.stripped_strings))
                if content.find('img', class_='img-thumbnail'):
                    info['photos'] = [
                        content.find('img', class_='img-thumbnail').get('src')
                    ]

                if not dyn_upload.is_uploaded(info, table):
                    events.append(info)
    return events
Example #4
def events(table=dyn_upload.DEV_EVENTS_TABLE):
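    """Scrape The National Arts Club weekly calendar, keeping events whose pages read as free.

    `base` and `payload` are module-level request settings defined outside this snippet.
    """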
    soup = BeautifulSoup(
        requests.get(base, params=payload).text, 'html.parser')
    events_table = soup.find('table',
                             id='masterPageUC_MPCA399840_ctl06_listTable')

    last_date = str(date.today())
    events = []

    for row in events_table.find_all('tr'):
        if row.find(class_='modCalWeekDayHeader') and row.find(
                class_='modCalWeekDayHeader').text:
            last_date = str(
                parse(row.find(class_='modCalWeekDayHeader').text).date())
        if row.find(class_='modCalWeekRow') and row.find(
                class_='modCalWeekRow').text:
            is_free_event = True
            event = {
                'source': 'The National Arts Club',
                'location_description': '15 Gramercy Park S, New York, NY',
                'host': 'The National Arts Club',
                'rsvp': True,
                'dates': [last_date],
                'times': [],
            }
            for td in row.find_all('td'):
                if td.text.strip():
                    # td is either a time or a description
                    if ' PM' in td.text or ' AM' in td.text:
                        if '-' in td.text:
                            times = td.text.split('-')
                            for t in times:
                                event['times'].append(str(parse(t).time()))
                        else:
                            event['times'] = [str(parse(td.text).time())]
                    else:
                        if not td.a:
                            print(td.text)
                        event['website'] = td.a.get('href')
                        event['name'] = td.a.text

                        # The event page is a mess.
                        event_soup = BeautifulSoup(
                            requests.get(event['website']).text, 'html.parser')
                        event_soup = event_soup.find(
                            'div', id='eventData').find('div', id='eventData')
                        for script in event_soup(['script', 'style']):
                            script.decompose()

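                        # keep the event only if its page mentions both "rsvp" and "free"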
                        event_text = event_soup.text.lower()
                        if ' rsvp' not in event_text or ' free ' not in event_text:
                            is_free_event = False
                        if event_soup.img:
                            event['photos'] = [event_soup.img.get('src')]
            if is_free_event and not dyn_upload.is_uploaded(event, table):
                events.append(event)
    return events
Example #5
def events(table=dyn_upload.DEV_EVENTS_TABLE):
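    """Pull Flushing Town Hall events from their JSON API, then scrape each event page for date/time.

    `api_base`, `api_page`, `base`, and `payload` are module-level settings defined outside this snippet.
    """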
    r = requests.get(api_base + api_page, params=payload).json()
    events = []
    venues = {}
    for venue in r['venues']:
        venues[venue['venueID']] = venue['name']
    for entry in r['events']:
        if entry['cancelled']:
            continue
        event = {
            'name': entry['title'],
            'description': entry['description'],
            'website': base + '/event/' + entry['eventID'],
            'host': 'Flushing Town Hall',
            'rsvp': False,
            'source': base,
        }
        if 'cancel' in event['name'].lower():
            continue
        if 'showbillPath' in entry:
            event['photos'] = [api_base + entry['showbillPath']]

        if 'flushing town hall' in venues[entry['venueID']].lower():
            event['location_description'] = '137-35 Northern Boulevard, Flushing, NY'
        else:
            event['location_description'] = venues[entry['venueID']]
        # TODO: could also look for '.. at ..' in event title
        if 'queens center mall' in event['name'].lower():
            event['location_description'] = 'Queens Center Mall'

        if entry['ticketTypesAvailable']:
            event['rsvp'] = True

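        # parse the date and start time out of the pipe-separated subheader on the event page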
        event_soup = BeautifulSoup(
            requests.get(event['website']).text, 'html.parser')
        subheader = event_soup.find(
            'span', itemprop='description').strong.text.split('|')
        subheader = (''.join(subheader[:3]))
        if '-' in subheader:
            am = False
            if 'am' in subheader.lower():
                am = True
            subheader = subheader.split('-')[0]
            if am:
                subheader += 'AM'
            else:
                subheader += 'PM'
        dt = parse(subheader)
        event['dates'] = [str(dt.date())]
        event['times'] = [str(dt.time())]

        if not dyn_upload.is_uploaded(event, table=table):
            events.append(event)
    return events
Example #6
def events(table=dyn_upload.DEV_EVENTS_TABLE):
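    """Scrape the Bluestockings events calendar (ai1ec markup) for upcoming events.

    `url` is a module-level constant defined outside this snippet.
    """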
    ddb_names = dict([(x['name'].lower(), x['dates'])
                      for x in dyn_upload.get_all_items_from_table(table)])
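    # NOTE: ddb_names is collected but not used below; duplicates are filtered via is_uploaded()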
    events = []
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    sections = soup.find_all("div", class_="ai1ec-event")
    for sec in sections:
        if sec.find(class_="ai1ec-read-more"):  # sometimes doesn't find this? not sure why
            info = {
                'name': sec.find(class_="ai1ec-event-title").text.strip(),
                'website':
                sec.find(class_="ai1ec-read-more").get('href').strip(),
                'location_description': "172 Allen St, New York, NY",
                'host': "Bluestockings",
                'rsvp': True,  # TODO: update scraper for this
                'description': sec.find(class_="ai1ec-event-description").text,
                'source': 'bluestockings.com',
            }
            e_soup = BeautifulSoup(
                requests.get(info['website']).text, 'html.parser')

            # TODO: some events are repeating: we should check if they have the same dates
            info['description'] = ""
            for p in e_soup.find("div", class_="entry-content").find_all("p"):
                info['description'] += p.text + '\n'
            # just doing start time, meh
            d, t = re.search("(.*) @ (.*) –",
                             e_soup.find("div",
                                         class_="dt-duration").text).groups()
            info['dates'] = [str(parse(d).date())]
            info['times'] = [str(parse(t).time())]

            if not dyn_upload.is_uploaded(info, table):
                events.append(info)
    return events