Exemplo n.º 1
0
	def _extract_feed(self):
		"""Extract the stories found in ``self.soup``.

		Every ``<div class="post">`` is treated as one story. The link and
		title come from the anchor inside the post's ``<h1>``; the author and
		date are picked out of the children of its ``<small>`` element; the
		contents are every other child of the post serialized back to text.

		Returns:
			A list of dicts with the keys 'uri', 'title', 'date', 'author'
			and 'contents'. 'date' and 'author' are None when they could
			not be found. A post that raises while being processed is
			printed and skipped.
		"""
		posts = self.soup.findAll('div', attrs={'class':'post'})

		stories = []
		for p in posts:
			try:
				heading = p.find('h1')
				details = p.find('small')

				date = None
				author = None

				uri = heading.a['href']
				title = heading.a.text

				# Extract date and author. The children of <small> are a mix
				# of tags and bare strings, so each probe is best-effort: a
				# node that does not support the operation is simply not a
				# match. (If there is no <small> at all, iterating None
				# raises and the whole post is skipped below.)
				for node in details:
					try:
						if node.name == 'a' and node['title'].startswith('Posts'):
							author = node.text
							continue
					except Exception:
						pass

					try:
						text = node.replace('|', '').strip()
						# remove_ordinal is defined elsewhere; presumably it
						# strips "1st"/"2nd"-style suffixes so that %d can
						# parse the day -- TODO confirm.
						text = remove_ordinal(text)
						date = datetime.strptime(text, '%B %d, %Y')
					except Exception:
						pass

				# Extract contents: the story body has no wrapper of its own,
				# it is every child of the post except the heading, the
				# details line and separators.
				chunks = []
				for child in p:
					try:
						if child.name in ['h1', 'small', 'hr']:
							continue
					except Exception:
						# Node without a usable .name: keep it as content.
						pass
					chunks.append(unicode(child).strip())
				contents = ''.join(chunks)

				stories.append({
						'uri': uri,
						'title': title,
						'date': date,
						'author': author,
						'contents': contents
					})

			except Exception as e:
				# Best effort: report the malformed post and move on.
				print(e)
				continue

		return stories
Exemplo n.º 2
0
	def _extract_feed(self):
		"""Extract the stories found in ``self.soup``.

		Every ``<div class="storyblock">`` is treated as one story. The title
		and link come from the anchor inside its ``<h1>``, the date from
		``<p class="storydate">`` and the contents from the children of
		``<div class="story">``.

		Returns:
			A list of dicts with the keys 'uri', 'title', 'date' and
			'contents'.

		Raises:
			Any exception raised while processing a post propagates to the
			caller (for example AttributeError when an expected element is
			missing, or ValueError when the date does not match the format).
		"""
		posts = self.soup.findAll('div', attrs={'class':'storyblock'})

		stories = []
		for p in posts:
			# No try/except here: the previous handler only re-raised the
			# exception, so letting it propagate directly is equivalent and
			# keeps the original traceback.
			heading = p.find('h1')
			date_tag = p.find('p', attrs={'class':'storydate'})
			body = p.find('div', attrs={'class':'story'})

			title = heading.a.text
			link = 'http://techdirt.com' + heading.a['href']
			# remove_ordinal is defined elsewhere; presumably it strips
			# "1st"/"2nd"-style suffixes so that %d can parse -- TODO confirm.
			date = remove_ordinal(date_tag.text)
			date = datetime.strptime(date, "%a, %b %d %Y %I:%M%p")

			chunks = []
			for child in body.contents:
				try:
					# Skip headings and any element carrying a non-empty
					# inline style attribute.
					if child.name in ['h1', 'h3']:
						continue
					if child['style']:
						continue
				except Exception:
					# No .name or no 'style' attribute: keep the node.
					pass
				chunks.append(unicode(child).strip())
			contents = ''.join(chunks)

			stories.append({
					'uri': link,
					'title': title,
					'date': date,
					'contents': contents
				})

		return stories
Exemplo n.º 3
0
	def _extract_feed(self):
		"""Extract the stories found in ``self.soup``.

		Every ``<div>`` whose id matches ``post-<digits>`` is treated as one
		story. The title and link come from the anchor whose ``rel`` matches
		'bookmark', the date and time from the string children of
		``<div class="post-details">`` and the contents from all ``<p>``
		elements of the post.

		Returns:
			A list of dicts with the keys 'uri', 'title', 'date' and
			'contents'. 'date' is the parsed date combined with the parsed
			time when both were found; otherwise it is the date alone, or
			None when no date was found. A post that raises while being
			processed is printed and skipped.
		"""
		posts = self.soup.findAll('div', id=re.compile(r'post-\d+'))

		stories = []
		for p in posts:
			try:
				anchor = p.find('a', attrs={'rel':re.compile('bookmark')})
				details = p.find('div', attrs={'class':'post-details'})
				paragraphs = p.findAll('p')

				# Lack of semantics is why we can't have nice things: date
				# and time are loose strings among the children of the
				# details div, so every string is tried against both formats.
				date = None
				time_part = None
				for node in details:
					try:
						text = node.strip()
					except Exception:
						# Not a string-like node.
						continue

					text = text.replace('|', '').strip()

					# TODO: Really need to write a date parser module
					# remove_ordinal is defined elsewhere; presumably it
					# strips "1st"/"2nd"-style suffixes -- TODO confirm.
					text = remove_ordinal(text)
					try:
						date = datetime.strptime(text, "%A, %B %d, %Y")
					except (TypeError, ValueError):
						pass
					try:
						time_part = datetime.strptime(text, "%I:%M %p")
					except (TypeError, ValueError):
						pass

				title = anchor.text
				# NOTE(review): item access is used instead of reading the
				# tag's attrMap directly; assumed to resolve to the same
				# attribute value -- confirm against the BeautifulSoup
				# version in use.
				link = anchor['href']
				if date is not None and time_part is not None:
					date = datetime.combine(date, time_part.time())

				# extraction of text contents, again *semantics*
				contents = ''.join([unicode(t).strip() for t in paragraphs])

				stories.append({
						'uri': link,
						'title': title,
						'date': date,
						'contents': contents
					})

			except Exception as e:
				# Best effort: report the malformed post and move on.
				print(e)
				continue

		return stories