def scrape_dance_cal(keys_from_spreadsheet):
    """Scrape dancecal.com and register every event that is not yet known.

    Each ``DCListEvent`` div on the listing page becomes one ``Event`` whose
    fields are filled from the div's ``<span>`` children: name, url, start
    date, location, dance styles, teachers, description and bands.  Events
    whose lower-cased name is already in ``event_name_list`` are skipped.

    The site does not publish an end date, so it is guessed as the Sunday on
    or after the start date (e.g. an event starting on a Friday is assumed to
    end two days later).

    Args:
        keys_from_spreadsheet: passed unchanged to ``append_to_event_list``
            together with each new event and its key.

    Returns:
        Whatever the last ``append_to_event_list`` call returned.
    """
    soup = get_soup(URL_DC)
    for event_div in soup.findAll('div', {'class': 'DCListEvent'}):
        name = None
        event = Event()
        for span in event_div.findAll('span'):
            # .get() rather than span['class']: a span without a class
            # attribute would otherwise raise KeyError and abort the scrape.
            classes = span.get('class', [])

            if 'DCListName' in classes:
                name = span.text.strip()
                print(name)

            # Ignore spans seen before the name span, and every span of an
            # event whose name already exists in the instance list.
            if name is None or name.lower() in event_name_list:
                continue

            if 'DCListName' in classes:
                event.name = name
                for a_tag in span.findAll('a', href=True):
                    event.url = a_tag['href']

            if 'DCEventInfoDate' in classes:
                event.start_date = parse(span.text)
                # weekday() is 0 for Monday .. 6 for Sunday, so this is the
                # number of days until the Sunday on or after the start.
                days_to_sunday = 6 - event.start_date.weekday()
                event.end_date = (
                    event.start_date + datetime.timedelta(days=days_to_sunday))

            if 'DCEventInfoWhere' in classes:
                # The first element is the text before the ':' label
                # separator; the remaining ones are the comma-separated
                # location parts.
                location_list = span.text.replace(':', ',').split(',')
                if len(location_list) == 3:
                    event.country = location_list[2].strip()
                    event.city = location_list[1].strip()
                elif len(location_list) == 4:
                    event.country = location_list[3].strip()
                    event.state = location_list[2].strip()
                    event.city = location_list[1].strip()

            if 'DCEventInfoDances' in classes:
                event.dance_styles = span.text.split(': ')[1].lower().strip()

            if 'DCEventInfoTeachers' in classes:
                # Work on the raw markup so <br/> can act as a separator
                # between teachers; the first and last pieces are dropped.
                event.teachers = (
                    str(span)
                    .replace('<br/>', '$')
                    .replace(':', '$')
                    .replace('</i>', '$')
                    .replace('|', 'and')
                    .split('$')[1:-1])

            if 'DCEventInfoDesc' in classes:
                event.details = span.text.strip()

            if 'DCEventInfoBands' in classes:
                event.bands = span.text.split(':')[1].strip()

        if event.name is not None:
            event.key = create_key(event)
            event_list = append_to_event_list(
                event, event.key, keys_from_spreadsheet)

    # NOTE(review): event_list is only bound once at least one event has been
    # handed to append_to_event_list; if the page yields none, this return
    # raises UnboundLocalError.  The right fallback depends on what
    # append_to_event_list returns -- confirm before adding a default.
    return event_list
def scrape_swing_planit(keys_from_spreadsheet):
    """Scrape swingplanit.com and register every event linked from its list.

    For each link inside a ``color-shape`` list item the event page is
    fetched and turned into an ``Event``: the page title becomes the name,
    the first paragraph the details and the third paragraph the teachers.
    Every ``<li>`` whose text contains one of ``SPLITTERS`` is split once
    into a label and a value; the labels ``when``, ``country``, ``town``,
    ``website`` and ``styles`` (case-insensitive) fill the matching fields.
    The lower-cased event name is appended to ``event_name_list``.

    Args:
        keys_from_spreadsheet: passed unchanged to ``append_to_event_list``
            together with each event and its key.

    Returns:
        Whatever the last ``append_to_event_list`` call returned.
    """
    soup = get_soup(URL_SP)
    for event_list_item in soup.findAll('li', {'class': 'color-shape'}):
        for a_tag in event_list_item.findAll('a', href=True):
            event_soup = get_soup(a_tag['href'])
            event = Event()
            event.name = event_soup.title.text
            event_name_list.append(event.name.lower())

            paragraphs = event_soup.findAll('p')
            event.details = paragraphs[0].text
            event.teachers = paragraphs[2].text

            for li in event_soup.findAll('li'):
                li_text = li.get_text()
                for splitter in SPLITTERS:
                    if splitter not in li_text:
                        continue
                    # The splitter is present, so a single split always
                    # yields exactly a label and a value.
                    label, value = li_text.split(splitter, 1)
                    print(event.name + label + ': ' + value)

                    label = label.lower()
                    if label == 'when':
                        date_range = value.strip().split(' - ')
                        event.start_date = parse(date_range[0])
                        # A value without ' - ' is a single date; treat it
                        # as a one-day event instead of raising IndexError.
                        if len(date_range) > 1:
                            event.end_date = parse(date_range[1])
                        else:
                            event.end_date = event.start_date
                    elif label == 'country':
                        event.country = value.strip()
                    elif label == 'town':
                        event.city = value.strip()
                    elif label == 'website':
                        event.url = value.strip()
                    elif label == 'styles':
                        event.dance_styles = value.lower().strip()

            event.key = create_key(event)
            event_list = append_to_event_list(
                event, event.key, keys_from_spreadsheet)

    # NOTE(review): event_list is only bound once at least one event page has
    # been processed; if the listing yields no links, this return raises
    # UnboundLocalError.  The right fallback depends on what
    # append_to_event_list returns -- confirm before adding a default.
    return event_list
def create_event_list():
    """Build the list of current events from worksheet 'Sheet1'.

    Rows are read through ``utils.iter_worksheet`` with ``header_row=1``.
    A row is skipped when its key is empty, its ``obsolete`` column is '1',
    or its status is 'past'.  Every remaining row is copied into a new
    ``Event``; an empty currency defaults to 'USD' and an empty driving time
    to 99999.

    Returns:
        A list with one ``Event`` per retained row, in worksheet order.
    """
    events = []
    for record in utils.iter_worksheet(spreadsheet, 'Sheet1', header_row=1):
        # Guard clause: drop blank, obsolete and past rows up front.
        if (record['key'] == ''
                or record['obsolete'] == '1'
                or record['status'] == 'past'):
            continue

        entry = Event()

        # Identity and schedule.
        entry.key = record['key']
        entry.name = record['name']
        entry.start_date = parse(record['start date'])
        entry.end_date = parse(record['end date'])

        # Location.
        entry.city = record['city']
        entry.state = record['state']
        entry.country = record['country']

        # Descriptive columns copied verbatim.
        entry.dance_styles = record['dance styles']
        entry.status = record['status']
        entry.url = record['url']
        entry.teachers = record['teachers']
        entry.bands = record['bands']
        entry.details = record['details']
        entry.obsolete = record['obsolete']

        # Costs and travel figures.
        entry.workshop_cost = get_cost(record, 'workshop cost')
        entry.party_pass_cost = get_cost(record, 'party pass cost')
        entry.distance = int(record['distance'])
        entry.flight_cost = int(record['flight cost'])
        entry.event_type = record['type']

        # Blank cells fall back to fixed defaults.
        entry.currency = (
            'USD' if record['currency'] == '' else record['currency'])
        entry.driving_time = (
            99999 if record['driving time'] == ''
            else int(record['driving time']))

        events.append(entry)
    return events