Пример #1
0
def scrape_dance_cal(keys_from_spreadsheet):
    """Scrape dancecal.com and collect the events found on the listing page.

    Every ``div.DCListEvent`` on the page fetched from ``URL_DC`` is turned
    into one ``Event``.  Its name, url, start date, location, dance styles,
    teachers, details and bands are read from the ``span`` elements inside
    the div.  The site does not publish an end date, so one is guessed: the
    event is assumed to end on the Sunday of the week it starts in.

    Spans of an event whose lower-cased name is already in
    ``event_name_list`` are ignored, so that event never receives a name
    and is not appended (this relies on a fresh ``Event`` having
    ``name`` set to ``None`` -- TODO confirm against the Event class).

    Args:
        keys_from_spreadsheet: handed unchanged to ``append_to_event_list``
            together with each scraped event and its key.

    Returns:
        Whatever the last ``append_to_event_list`` call returned, or an
        empty list when no event was appended.
    """
    # Bound up front: previously the final ``return`` raised
    # UnboundLocalError whenever no event was appended.
    event_list = []
    soup = get_soup(URL_DC)
    for event_div in soup.findAll('div', {'class': 'DCListEvent'}):
        name = None
        event = Event()
        for span in event_div.findAll('span'):
            # .get() instead of [] so a span without a class attribute is
            # simply not matched rather than raising KeyError.
            span_class = span.get('class') or []

            if 'DCListName' in span_class:
                name = span.text.strip()
            print(name)
            # Skip spans seen before the name span, and every span of an
            # event that already exists in the instance list.
            if name is None or name.lower() in event_name_list:
                continue

            if 'DCListName' in span_class:
                event.name = name
                for a_tag in span.findAll('a', href=True):
                    event.url = a_tag['href']
            if 'DCEventInfoDate' in span_class:
                event.start_date = parse(span.text)
                # weekday() is 0 for Monday .. 6 for Sunday, so this is the
                # number of days left until Sunday (0 if it starts on one).
                days_until_sunday = 6 - event.start_date.weekday()
                event.end_date = event.start_date + datetime.timedelta(
                    days=days_until_sunday)
            if 'DCEventInfoWhere' in span_class:
                # Element 0 is the text before the first ':' (the label).
                location_list = span.text.replace(':', ',').split(',')
                if len(location_list) == 3:
                    event.country = location_list[2].strip()
                    event.city = location_list[1].strip()
                if len(location_list) == 4:
                    event.country = location_list[3].strip()
                    event.state = location_list[2].strip()
                    event.city = location_list[1].strip()
            if 'DCEventInfoDances' in span_class:
                dance_parts = span.text.split(': ')
                # Leave the field untouched when the separator is missing
                # instead of aborting the whole scrape with IndexError.
                if len(dance_parts) > 1:
                    event.dance_styles = dance_parts[1].lower().strip()
            if 'DCEventInfoTeachers' in span_class:
                # Work on the raw markup: turn the separators into '$',
                # split on it and drop the first and last chunk.
                teacher_markup = (
                    str(span)
                    .replace('<br/>', '$')
                    .replace(':', '$')
                    .replace('</i>', '$')
                    .replace('|', 'and')
                )
                event.teachers = teacher_markup.split('$')[1:-1]
            if 'DCEventInfoDesc' in span_class:
                event.details = span.text.strip()
            if 'DCEventInfoBands' in span_class:
                band_parts = span.text.split(':')
                if len(band_parts) > 1:
                    event.bands = band_parts[1].strip()

        if event.name is not None:
            event.key = create_key(event)
            event_list = append_to_event_list(
                event, event.key, keys_from_spreadsheet)
    return event_list
Пример #2
0
def scrape_swing_planit(keys_from_spreadsheet):
    """Scrape swingplanit.com and collect the events linked from its listing.

    For every ``li.color-shape`` on the page fetched from ``URL_SP`` the
    linked event page is downloaded and turned into an ``Event``: the name
    comes from the page title, details and teachers from the first and
    third ``<p>``, and dates, country, town, website and styles from the
    ``<li>`` lines that contain one of ``SPLITTERS``.  Each event's
    lower-cased name is also added to ``event_name_list``.

    Args:
        keys_from_spreadsheet: handed unchanged to ``append_to_event_list``
            together with each scraped event and its key.

    Returns:
        Whatever the last ``append_to_event_list`` call returned, or an
        empty list when no event was appended.
    """
    # Bound up front so the final ``return`` cannot raise UnboundLocalError
    # when the listing contains no matching items.
    event_list = []
    soup = get_soup(URL_SP)
    for event_list_item in soup.findAll('li', {'class': 'color-shape'}):
        # Reset per list item: otherwise an item without a link would
        # re-append the previous item's event (or fail on the first item).
        event = None
        for a_tag in event_list_item.findAll('a', href=True):
            event_soup = get_soup(a_tag['href'])
            event = Event()
            event.name = event_soup.title.text
            event_name_list.append(event.name.lower())
            paragraphs = event_soup.findAll('p')
            event.details = paragraphs[0].text
            event.teachers = paragraphs[2].text
            for li in event_soup.findAll('li'):
                li_text = li.get_text()
                for splitter in SPLITTERS:
                    if splitter not in li_text:
                        continue
                    # Split once; the splitter is known to be present, so
                    # this always yields exactly a label and a value.
                    label, value = li_text.split(splitter, 1)
                    print(event.name + label + ': ' + value)
                    field = label.lower()
                    if field == 'when':
                        date_range = value.strip().split(' - ')
                        event.start_date = parse(date_range[0])
                        if len(date_range) > 1:
                            event.end_date = parse(date_range[1])
                        else:
                            # No ' - ' in the text: assume a one-day event
                            # rather than failing with IndexError.
                            event.end_date = event.start_date
                    elif field == 'country':
                        event.country = value.strip()
                    elif field == 'town':
                        event.city = value.strip()
                    elif field == 'website':
                        event.url = value.strip()
                    elif field == 'styles':
                        event.dance_styles = value.lower().strip()
        if event is None:
            # The list item had no link, so nothing was scraped for it.
            continue
        # As before, only the event built from the item's last link is
        # keyed and appended.
        event.key = create_key(event)
        event_list = append_to_event_list(
            event, event.key, keys_from_spreadsheet)
    return event_list
Пример #3
0
def create_event_list():
    """Build ``Event`` objects from the rows of worksheet 'Sheet1'.

    Rows are read through ``utils.iter_worksheet`` from the module-level
    ``spreadsheet``.  A row is left out when its key is empty, when its
    'obsolete' column is '1', or when its status is 'past'.  An empty
    currency becomes 'USD' and an empty driving time becomes 99999.

    Returns:
        A list with one ``Event`` per remaining row, in worksheet order.
    """
    # Columns copied verbatim onto the attribute of the (almost) same name.
    plain_columns = (
        ('city', 'city'),
        ('state', 'state'),
        ('country', 'country'),
        ('dance_styles', 'dance styles'),
        ('status', 'status'),
        ('url', 'url'),
        ('teachers', 'teachers'),
        ('bands', 'bands'),
        ('details', 'details'),
        ('obsolete', 'obsolete'),
    )

    events = []
    rows = utils.iter_worksheet(spreadsheet, 'Sheet1', header_row=1)
    for row in rows:
        if (row['key'] == '' or row['obsolete'] == '1'
                or row['status'] == 'past'):
            continue

        entry = Event()
        entry.key = row['key']
        entry.name = row['name']
        entry.start_date = parse(row['start date'])
        entry.end_date = parse(row['end date'])
        for attribute, column in plain_columns:
            setattr(entry, attribute, row[column])
        entry.workshop_cost = get_cost(row, 'workshop cost')
        entry.party_pass_cost = get_cost(row, 'party pass cost')
        entry.distance = int(row['distance'])
        entry.flight_cost = int(row['flight cost'])
        entry.event_type = row['type']

        raw_currency = row['currency']
        entry.currency = 'USD' if raw_currency == '' else raw_currency

        raw_driving_time = row['driving time']
        entry.driving_time = (
            99999 if raw_driving_time == '' else int(raw_driving_time))

        events.append(entry)
    return events