def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape theskint.com's ongoing-events page for free / pay-what-you-wish events.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: event records not yet present in ``table``.
    """
    ret = []
    response = requests.get('https://theskint.com/ongoing-events/')
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find(class_='entry-content')
    entries = content.find_all('p')
    for entry in entries:
        # Keep only listings that advertise themselves as free (or PWYW).
        if re.search('[fF]ree | pay-what-you-wish', entry.text):
            # BUG FIX: the original dereferenced entry.b.text unconditionally
            # and raised AttributeError on paragraphs without a <b> title tag.
            if entry.b is not None and entry.b.text and len(entry.b.text) > 3:
                d = {
                    'name': entry.b.text,
                    # Guard against paragraphs without an <a>; presumably the
                    # first anchor is the event link — TODO confirm.
                    'website': entry.a.get('href') if entry.a else None,
                    'description': entry.text[:-3],
                    'source': 'The Skint',
                    'rsvp': 'rsvp' in entry.text
                }
                stripped = strip_for_location_description(entry.text)
                description = location_description(stripped)
                if description:
                    d['location_description'] = description
                    # NOTE(review): host is set to the location description —
                    # looks deliberate, but verify against downstream consumers.
                    d['host'] = description
                if not dyn_upload.is_uploaded(d, table):
                    ret.append(d)
    return ret
def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape Carnegie Hall Citywide search results for free events.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: event records not yet present in ``table``.
    """
    # NOTE(review): eval() with no arguments raises TypeError immediately —
    # the original API request expression appears to have been stripped or
    # redacted here. Restore the real search-API call before using this
    # function; as written it cannot run.
    response = eval()
    i = []
    # The response is expected to be an Algolia-style payload:
    # results[0].hits — TODO confirm against the live API.
    for hit in response.json()['results'][0]['hits']:
        info = {
            'name': hit['title'].replace('<BR>', ' '),
            # TODO: datetime is just set as UTC..
            'dates': [str(datetime.strptime(hit['date'], '%A, %b %d, %Y').date())],
            'times': [str(timeformat(hit['time']))],
            'website': 'https://carnegiehall.org' + hit['url'],
            'photos': [hit['image']['src']],
            'host': 'Carnegie Hall',
            'source': 'Carnegie Hall Citywide',
            'location_description': hit['facility'],
            'rsvp': True  # TODO: currently assuming the worst
        }
        # Fetch the event's own page for a fuller description.
        more_info_response = requests.get(info['website'])
        soup = BeautifulSoup(more_info_response.text, 'html.parser')
        location_text = soup.find_all(class_='ch-event-facilityNotes')[0].text
        description_text = soup.find_all(class_='ch-page-hero-block__content')[0].text
        # Drop the first two and last line of the facility notes; presumably
        # boilerplate — TODO confirm against the page markup.
        location_text = ' '.join(location_text.strip().split('\n')[2:-1])
        description_text = description_text.strip()
        info['description'] = description_text
        if not dyn_upload.is_uploaded(info, table):
            i.append(info)
    return i
def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape the UCB Hell's Kitchen calendar for free shows.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: free-show records not yet present in ``table``.
    """
    found = []
    calendar = BeautifulSoup(requests.get(url).text, 'html.parser')
    for day_cell in calendar.find_all('td'):
        for link in day_cell.find_all('a', class_='calendar_link'):
            price_text = link.find('span', class_='pull-right').text
            if 'free' not in price_text.lower():
                continue
            day_label = day_cell.find('div', class_='day_title').div.text
            show = {
                'dates': [str(parse(day_label).date())],
                'website': base + link.get('href'),
                'location_description': '555 W 42nd Street, New York, NY',
                'rsvp': False,  # They only sell tickets for charged events, it seems
                'source': base,
                'name': link.find(class_='performance_title').text,
                'host': 'UCB Theatre Hells Kitchen',
                'types': ['comedy']
            }
            raw_time = link.find('span', class_='pull-left').text
            if 'midnight' in raw_time.lower():
                show['times'] = ['23:59']
            else:
                show['times'] = [str(parse(raw_time).time())]
            detail = BeautifulSoup(
                requests.get(show['website']).text, 'html.parser')
            body = detail.find('div', id='content_container').find(
                'div', class_='clearfix')
            pieces = list(body.stripped_strings)
            for marker in ('Cast', 'Host'):
                if marker in pieces:
                    # Keep only the text after the LAST occurrence of the marker.
                    cut = len(pieces) - pieces[::-1].index(marker)
                    pieces = pieces[cut:]
                    break
            else:
                print('neither "cast" nor "host" in website')
            show['description'] = " ".join(pieces)
            thumbnail = body.find('img', class_='img-thumbnail')
            if thumbnail:
                show['photos'] = [thumbnail.get('src')]
            if not dyn_upload.is_uploaded(show, table):
                found.append(show)
    return found
def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape The National Arts Club weekly calendar for free events.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: free-event records not yet present in ``table``.
    """
    soup = BeautifulSoup(
        requests.get(base, params=payload).text, 'html.parser')
    events_table = soup.find('table', id='masterPageUC_MPCA399840_ctl06_listTable')
    # Day-header rows set the date; event rows inherit the most recent one.
    last_date = str(date.today())
    events = []
    for row in events_table.find_all('tr'):
        if row.find(class_='modCalWeekDayHeader') and row.find(
                class_='modCalWeekDayHeader').text:
            last_date = str(
                parse(row.find(class_='modCalWeekDayHeader').text).date())
        if row.find(class_='modCalWeekRow') and row.find(
                class_='modCalWeekRow').text:
            is_free_event = True
            event = {
                'source': 'The National Arts Club',
                'location_description': '15 Gramercy Park S, New York, NY',
                'host': 'The National Arts Club',
                'rsvp': True,
                'dates': [last_date],
                'times': [],
            }
            for td in row.find_all('td'):
                if td.text.strip():
                    # td is either a time or a description
                    if ' PM' in td.text or ' AM' in td.text:
                        if '-' in td.text:
                            # A start-end range: record both endpoints.
                            times = td.text.split('-')
                            for t in times:
                                event['times'].append(str(parse(t).time()))
                        else:
                            event['times'] = [str(parse(td.text).time())]
                    else:
                        if not td.a:
                            # BUG FIX: the original printed and then still
                            # dereferenced td.a.get(), raising AttributeError
                            # for anchor-less cells; skip such cells instead.
                            print(td.text)
                            continue
                        event['website'] = td.a.get('href')
                        event['name'] = td.a.text
                        # The event page is a mess.
                        event_soup = BeautifulSoup(
                            requests.get(event['website']).text, 'html.parser')
                        # NOTE(review): the page appears to nest two divs with
                        # the same id='eventData' — confirm this double find is
                        # intentional against the live markup.
                        event_soup = event_soup.find(
                            'div', id='eventData').find('div', id='eventData')
                        # Strip script/style tags so the free/rsvp keyword scan
                        # only sees visible text.
                        for script in event_soup(['script', 'style']):
                            script.decompose()
                        event_text = event_soup.text.lower()
                        if ' rsvp' not in event_text or ' free ' not in event_text:
                            is_free_event = False
                        if event_soup.img:
                            event['photos'] = [event_soup.img.get('src')]
            if is_free_event and not dyn_upload.is_uploaded(event, table):
                events.append(event)
    return events
def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape the Flushing Town Hall events API for upcoming events.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: event records not yet present in ``table``.
    """
    api_json = requests.get(api_base + api_page, params=payload).json()
    results = []
    # Map venueID -> human-readable venue name.
    venue_names = {v['venueID']: v['name'] for v in api_json['venues']}
    for entry in api_json['events']:
        if entry['cancelled']:
            continue
        event = {
            'name': entry['title'],
            'description': entry['description'],
            'website': base + '/event/' + entry['eventID'],
            'host': 'Flushing Town Hall',
            'rsvp': False,
            'source': base,
        }
        if 'cancel' in event['name'].lower():
            continue
        if 'showbillPath' in entry:
            event['photos'] = [api_base + entry['showbillPath']]
        venue = venue_names[entry['venueID']]
        if 'flushing town hall' in venue.lower():
            event[
                'location_description'] = '137-35 Northern Boulevard, Flushing, NY'
        else:
            event['location_description'] = venue
        # TODO: could also look for '.. at ..' in event title
        if 'queens center mall' in event['name'].lower():
            event['location_description'] = 'Queens Center Mall'
        if entry['ticketTypesAvailable']:
            event['rsvp'] = True
        detail_soup = BeautifulSoup(
            requests.get(event['website']).text, 'html.parser')
        parts = detail_soup.find(
            'span', itemprop='description').strong.text.split('|')
        header = ''.join(parts[:3])
        if '-' in header:
            # Time ranges drop their AM/PM on the start time; re-attach it
            # based on whether the full header mentioned 'am'.
            is_morning = 'am' in header.lower()
            header = header.split('-')[0]
            header += 'AM' if is_morning else 'PM'
        when = parse(header)
        event['dates'] = [str(when.date())]
        event['times'] = [str(when.time())]
        if not dyn_upload.is_uploaded(event, table=table):
            results.append(event)
    return results
def events(table=dyn_upload.DEV_EVENTS_TABLE):
    """Scrape bluestockings.com's calendar for events.

    Args:
        table: dyn_upload events table used to filter out already-uploaded entries.

    Returns:
        list[dict]: event records not yet present in ``table``.
    """
    # CLEANUP: removed the unused ``ddb_names`` local — it performed a full
    # get_all_items_from_table() scan whose result was never read.
    events = []
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    sections = soup.find_all("div", class_="ai1ec-event")
    for sec in sections:
        if sec.find(class_="ai1ec-read-more"
                    ):  # sometimes doesn't find this? not sure why
            info = {
                'name': sec.find(class_="ai1ec-event-title").text.strip(),
                'website':
                sec.find(class_="ai1ec-read-more").get('href').strip(),
                'location_description': "172 Allen St, New York, NY",
                'host': "Bluestockings",
                'rsvp': True,  # TODO: update scraper for this
                'source': 'bluestockings.com',
            }
            e_soup = BeautifulSoup(
                requests.get(info['website']).text, 'html.parser')
            # TODO: some events are repeating: we should check if they have the same dates
            # Build the description from the event page itself (the listing
            # blurb is truncated); each <p> contributes one newline-terminated
            # chunk. CLEANUP: removed a stray no-op ``e_soup.find()`` call.
            info['description'] = ""
            for p in e_soup.find("div", class_="entry-content").find_all("p"):
                info['description'] += p.text + '\n'
            # just doing start time, meh
            d, t = re.search("(.*) @ (.*) –",
                             e_soup.find("div",
                                         class_="dt-duration").text).groups()
            info['dates'] = [str(parse(d).date())]
            info['times'] = [str(parse(t).time())]
            if not dyn_upload.is_uploaded(info, table):
                events.append(info)
    return events