Example No. 1
def parse_obituary(url,category):
	"""
	Extracts the necessary information from a single obituary page
	"""
	page = requests.get(url)
	soup = Soup(page.text)
	try:
		date = select(soup, 'p strong')[0].contents[0]
		date = date[date.rfind('died ')+5:].strip()
		cal = pdt.Calendar()
		print >> sys.stderr, 'parsing',date
		date = cal.parseDateText(date)
	except:
		print >> sys.stderr, 'failed to parse'
		return
	date = str('%s/%s/%s' % (date[2],date[1],date[0]))
	publisher = 'Telegraph'
	type = 'obituaries'
	name = select(soup, '.storyHead h1')[0].contents[0]
	content = ''
	for para in select(soup, '#mainBodyArea p'):
		if len(para.contents) > 0:
			content = content + para.contents[0]

	content = content.strip().replace('"','\'')		
	content = content.strip().replace('\n','')
	
	print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (date.encode("UTF-8"),
													publisher.encode("UTF-8"),
													type.encode("UTF-8"),
													name.encode("UTF-8"),
													content.encode("UTF-8"),
													category.encode("UTF-8"))
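None of the snippets in this listing carries its imports, so here is a minimal, self-contained sketch of the pattern they all share: soupselect's select() helper applied to a BeautifulSoup tree. The URL and selector are placeholders, and the BeautifulSoup 3 import is an assumption; with bs4 you would call soup.select() directly.

# Minimal sketch of the shared setup (assumed, not taken from any single example).
import requests
from BeautifulSoup import BeautifulSoup as Soup   # BeautifulSoup 3.x, as in Example No. 1
from soupselect import select

page = requests.get('http://example.com/')        # placeholder URL
soup = Soup(page.text)                            # build the parse tree
for tag in select(soup, 'div.story h1'):          # CSS selector -> list of Tag objects
    print tag.contents[0]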
Example No. 2
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Example No. 3
	def expandDocument(self,header,content,config=None):
		raise "obsolete"
		part = self.partDocument(header["document"],config)
		soup = part.expandSoup(content)
		header = part.get_collapsed_header(header=header)
		stateful_doc = "stateful" in header and header["stateful"] is True

		if stateful_doc:
			script = part.statefulConfigScript()
			if script:
				script_tag = soup.new_tag("script")
				script_tag["type"] = "application/config"
				script_tag.string = script
				soup.body.append(script_tag)

		# fill in meta tags
		self._applyMetaAndTitle(soup,header,config)

		if config["appcache"] == False:
			for h in select(soup,"html"):
				del h["manifest"]
		elif "manifest" in header:
			for h in select(soup,"html"):
				h["manifest"] = header["manifest"]

		if "Content-Language" in header:
			for h in select(soup,"html"):
				h["lang"] = header["Content-Language"]

		# offline markers
		lists = {
			"offline": self._getOfflineList(soup,header),
		}

		return soup.prettify(), lists
Example No. 4
def Loop_Through_Messages(i): #i = start ID - 1
    
    while i < MaxMSG:
        i += 1
        
        Humanize(2) #Humanize the program by sleeping 0-2 seconds
        
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))

            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')

            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ',map(unicode,msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            
            if MSG_Title == '': MSG_Title = '(none)'
            if MSG_Body == '': MSG_Body = '(none)'
            
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
Example No. 5
 def getClassTable(self):
     br = mechanize.Browser()
     br.set_handle_robots(False)
     #directly open the U of I login page
     br.open(
         "https://eas.admin.uillinois.edu/eas/servlet/EasLogin?redirect=https://webprod.admin.uillinois.edu/ssa/servlet/SelfServiceLogin?appName=edu.uillinois.aits.SelfServiceLogin&dad=BANPROD1"
     )
     br.select_form(name="easForm")
     br["inputEnterpriseId"] = self.username  #self.username
     br["password"] = self.password  #self.password
     br.submit()
     br.open(
         "https://ui2web1.apps.uillinois.edu/BANPROD1/bwskcrse.P_CrseSchdDetl"
     )
     try:
         br.select_form(nr=1)
     except:
         return None
     resp = br.submit()
     soup = BeautifulSoup(resp.read())
     br.close()
     sem_info_row = BeautifulSoup(
         str(select(soup, "div.pagetitlediv table tr td")[2]))
     #get course metadata and append it to courses data
     course_sch_table = BeautifulSoup(
         str(select(soup, "div.pagebodydiv table")[-2]))
     courses = self.parseSchTable(course_sch_table)
     return courses
Example No. 6
    def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
        """ Tests for the presence of a specified element on the current page...

        self.alice.sees_an_element(doc, id="element_id")
        self.alice.sees_an_element(doc, "element")
        self.alice.sees_an_element(doc, "div", "element_css_class")
        self.alice.sees_an_element(doc, selector="#myid element.bar")
        """
        selector = "any"
        if id:
            displayed_element = doc.find(id=id)
            selector = id
        elif css_selector:
            displayed_elements = select(doc, css_selector)
            displayed_element = displayed_elements[0] if displayed_elements else None
            selector = css_selector
        else:
            if css_class:
                selector = "%s.%s" % (element, css_class)
                displayed_element = select(doc, selector)
            else:
                displayed_element = doc.find(element)
                selector = element
        self.failUnless(displayed_element, "Could not find %s" % (selector))
        return displayed_element
Example No. 7
def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass

    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, { 'appid': appid, 'status': request.code})

    soup = BeautifulSoup(request)

    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])

    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url

    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])

    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url

    return positive_count, negative_count
Example No. 8
def scrapeBlog(blog):
	global completed
	blogurl = blog['postUrl']
	blogData = {}
	try:
		soup = Soup(urllib2.urlopen(blogurl))
		post = select(soup, 'div.post-body')

		title = select(soup, 'h1.title')
		titleNoTags = Soup(str(title))
		rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
		#print rawTitle

		noScript = Soup(str(post))
		rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
		#print raw_text

		blogData['source'] = str(rawTitle)
		blogData['title'] = blog['titleNoFormatting']
		blogData['content'] = str(rawText)
		blogData['date'] = blog['publishedDate']
		blogData['url'] = str(blogurl)

	except Exception:
		pass
	with dataLock:
		data.append(blogData)
		completed += 1
Example No. 9
  def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
      cells = select(player_data, 'td')
      if len(cells) == 13:
        # This order should match the boxscore table on espn
        (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
            to, pf, pts) = [
          cell.text for cell in cells
        ]

        if not player_name:
          continue

        fgm, fga = fgma.split('-')
        tpm, tpa = tpma.split('-')
        ftm, fta = ftma.split('-')

        (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
            pf, pts) = map(int, [
          minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
              pf, pts
        ])

        boxscore_data.append({
          'name': player_name, 'minutes': minutes, 'fgm': fgm, 'fga': fga,
          'tpm': tpm, 'tpa': tpa, 'ftm': ftm, 'fta': fta,
          'oreb': oreb, 'reb': reb,
          'ast': ast, 'stl': stl, 'blk': blk, 'to': to, 'pf': pf, 'pts': pts,
        })

    return boxscore_data
Example No. 10
	def _extract_predictions(self, html):
		if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
			return None
		else:
			predictions = []
			soup = BeautifulSoup(html)	

			# get the primary/imminent prediction		
			try:
				minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
			except:
				return None
			if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
				predictions.append(0)
			else:
				predictions.append(int(minutes))

			# get the other predictions
			for m in select(soup, '.predictionNumberForOtherPreds'):
				m = self._clean_prediction_html(m)
				try:
					predictions.append(int(m))
				except:
					pass

			return predictions
Example No. 11
File: app.py Project: dahabit/scrap
def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()] 
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"

        obj = {
                'title': courseTitle,
                'number': courseNumber,
                'time': courseTime
                }
        result.append(obj)

    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
    if subresults:
        result.extend(subresults) 
    return result
Example No. 12
	def expand(self,header,content,markup=None,config=None):
		"""
		General header/content expansion replacing expandDocument and expandScss
		"""
		lists = {
			"offline": [],
		}

		if "charset" not in header and markup is not None:
			header["charset"] = config["charset"]
		parent_doc = None
		if "document" in header:
			parent_doc = self.partDocument(header["document"],config)
			header = parent_doc.get_collapsed_header(header=header)

		if markup == "scss":
			content = self.expandScss(header,content,config=config)
		elif markup in ("text","xml"):
			pass #TODO consider what to do
		elif markup == "html":
			soup = None
			if parent_doc:
				soup = parent_doc.expandSoup(content)
			else:
				soup = BeautifulSoup(content,"html5lib")

			if "lang" in header:
				pass #TODO mark html element

			# print soup.head
			stateful_doc = "stateful" in header and header["stateful"] is True

			if stateful_doc:
				script = parent_doc.statefulConfigScript()
				if script:
					script_tag = soup.new_tag("script")
					script_tag["type"] = "application/config"
					script_tag.string = script
					soup.body.append(script_tag)

			# fill in meta tags
			self._applyMetaAndTitle(soup,header,config)

			if config["appcache"] == False:
				for h in select(soup,"html"):
					del h["manifest"]
			elif "manifest" in header:
				for h in select(soup,"html"):
					h["manifest"] = header["manifest"]

			if "Content-Language" in header:
				for h in select(soup,"html"):
					h["lang"] = header["Content-Language"]

			# offline markers
			lists["offline"] = self._getOfflineList(soup,header)
			content = soup.encode()

		return header, content, lists
Example No. 13
def get_info_from_title(soup, name):
    stats = select(soup, "dt.stat-title")

    for stat in stats:

        stat_name = select(stat, "span.title")
        if stat_name:
            if stat_name[0].text == name:
                return select(stat, "span.stat-point")[0].text
Example No. 14
def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None

    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)

    result = []

    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()

            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("&#36;"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                logging.error("Price parse error for %s: '%s'" % (name, price))
                price = None
        else:
            price = None

        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None

        result.append(Game(id=id, name=name, price=price, metascore=metascore))

    return result
Example No. 15
def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
Example No. 16
 def get_resources(self, doc):
     resources = []
     for a in select(doc, 'a'):
         url = a.get('href')
         img = select(a, 'img[src]')[0]
         src = img.get('src')
         f_type = REG_URL_FILE.search(src).group(1).lower()
         resources.append((url, f_type))
     return resources
Example No. 17
def get_info_from_description(soup, desc_name):
    stats = select(soup, "dd.stat-description")

    for stat in stats:

        stat_name_decs = select(stat, "li")
        if stat_name_decs:
            for stat_name in stat_name_decs:
                if select(stat_name, "span.title")[0].text == desc_name:
                    return select(stat_name, "span.stat-point")[0].text
Example No. 18
 def parse(self):
     if not self.soup:
         return
     out = []
     for tr in select(self.soup, '#content table tr'):
         td = select(tr, 'td')
         if len(td) != 3:
             continue
         name = select(td[1], 'strong')[0].string
         msg = urlizetrunc(striptags(select(td[2], 'div')[0].renderContents()), 30)
         out.append((name, msg))
     self.data = out[:]
Example No. 19
def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])

    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i+1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])

    return footnote_anchors, footnotes
Example No. 20
def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])

    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i + 1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])

    return footnote_anchors, footnotes
Example No. 21
def getLinks(cat, sponsor=True):
    _links = []
    r = s.get(cat)
    soup = soupify(r)
    table = select(soup, 'table.categories')[0] if page != 1 or sponsor==False else select(soup, 'table.categories')[1]

    tr = select(table, 'tr')
    for t in tr:
        link = select(t, 'h3 a')
        if link:
            _links.append(str(dict(link[0].attrs)['href']))

    return _links
Example No. 22
def process(d, i=None):
    ''' function to process one entry of the table '''
    # print a small progress indicator so we can see this is still working
    if i:
        print '%s' % i
    else:
        print '.'

    # extraction of the link of interest
    link = d['penalty_notice_link']

    # if we haven't downloaded the link yet, fetch it and keep it as an html file in the temp folder
    if not os.path.exists('./temp/%s.html' % hash(link)):
        r = requests.get(link)
        with open('./temp/%s.html' % hash(link), 'w') as h:
            h.write(r.text.encode('utf-8'))

    # load the html markup
    with open('./temp/%s.html' % hash(link), 'r') as h:
        source = h.read()

    # if we haven't previously extracted the info, we do it now
    if not os.path.exists('./temp/%s.pickle' % hash(link)):

        # to extract info it's usually the same way:
        #   - use BeautifulSoup to create the soup of the source
        #   - use select and some css classes/ids to extract info
        # => it's exaclty what is down below

        soup = BeautifulSoup(source)
        div = select(soup, 'div.cim_content')[0]
        table = select(div, 'table')[0]
        rows = select(table, 'tr')

        address = str(select(rows[2], 'td')[-1].contents[0])
        offence_code = str(select(rows[5], 'td')[-1].contents[0])
        nature = str(select(rows[6], 'td')[-1].contents[0])
        amount = str(select(rows[7], 'td')[-1].contents[0])
        data_penalty = str(select(rows[9], 'td')[-1].contents[0])
        issued_by = str(select(rows[10], 'td')[-1].contents[0])

        d['address'] = address
        d['offence_code'] = offence_code
        d['nature'] = nature
        d['amount'] = amount
        d['data_penalty'] = data_penalty
        d['issued_by'] = issued_by

        with open('./temp/%s.pickle' % hash(link), 'w') as h:
            pickle.dump(d, h)
    else:
        # we have previously extracted the info, so we simply load it and avoid the extra work
        with open('./temp/%s.pickle' % hash(link), 'r') as h:
            d = pickle.load(h)

    return d
Example No. 23
def mine_city_weather(city_name, airport_name, filename_to_save):
    with open(filename_to_save) as data_file:
        weather_storage = json.load(data_file)

    for single_date in daterange(start_date, end_date):
        dict_key = single_date.strftime("%Y-%m-%d")

        year = single_date.strftime("%Y")
        month = single_date.strftime("%m")
        day = single_date.strftime("%d")
        if (dict_key not in weather_storage.keys()):
            print dict_key

            soup = Soup(
                urllib2.urlopen(
                    'https://www.wunderground.com/history/airport/' +
                    airport_name + '/' + year + '/' + month + '/' + day +
                    '/DailyHistory.html?req_city=' + city_name +
                    '&req_statename=Denmark'))
            tekst = str(select(soup, '#observations_details'))
            column_counter = 0
            weather_conditions = []
            for column in select(soup, '#obsTable tbody tr'):
                #print column.text.split(';')[-1]
                time_clock = column.text.split(';')[0].split(' ')[0].split(
                    ':')[0]
                time_clock = int(time_clock)
                am_pm = column.text.split(';')[0].split(' ')[1]
                if ('AM' in am_pm):
                    am_pm = 'AM'
                else:
                    am_pm = 'PM'

                if (am_pm == 'AM' and time_clock > 6 and time_clock != 12):
                    weather_conditions.append(column.text.split(';')[-1])
                elif (am_pm == 'PM' and time_clock <= 10):
                    weather_conditions.append(column.text.split(';')[-1])
                #if(column_counter % 13 == 12):
                #print '-------------------'
                #    print column.text
                #    weather_conditions.append(column.text)
                #print '-------------------'
                #column_counter += 1

            weather_storage[dict_key] = weather_conditions

            time.sleep(1)
            with open(filename_to_save, 'w') as outfile:
                json.dump(weather_storage, outfile)
Example No. 24
def html_cleanup(html, remove_list=(), encoding=None, log=False):
	"""
	Returns (str cleaned_html, bool changes)
	``remove_list`` is a list of selectors; currently only attribute and class selectors are supported,
	e.g. ['p.[lang]', u'p.список-western', '[orphaned-attribute]', '.orphaned-class-name']
	``encoding`` is the html encoding, autodetected if not passed
	"""

	soup = BeautifulSoup(html, fromEncoding=encoding)

	changes = False

	for selector in remove_list:
		m = REC_ATTR.match(selector)
		if m:
			attr, = m.groups()
			for element in select(soup, selector):
				if log:
					print "removing %s[%s]" % (element.name, attr)
				element.attrs = [item for item in element.attrs if item[0] != attr]
				changes = True

		else:
			m = REC_CLASS.match(selector)
			if m:
				tag, cls = m.groups()
				selector = (tag or '') + u'[class]'

				for element in select(soup, selector):

					for i, (attr, value) in enumerate(element.attrs):
						if attr == u'class':
							class_index = i

					classes = filter(None, element.attrs[class_index][1].split(' '))
					try:
						classes.remove(cls)
					except ValueError:	# not in list
						pass
					else:
						if log:
							print "removing %s.%s" % (element.name, cls)
						element.attrs[class_index] = (u'class', ' '.join(classes))
						changes = True

	if changes:
		return soup.prettify(encoding=soup.fromEncoding or soup.originalEncoding), changes
	else:
		return html, changes
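A brief usage sketch for html_cleanup above. The markup and selectors are invented for illustration, and the exact selector syntax accepted depends on the module's REC_ATTR and REC_CLASS regexes, which are not shown here:

# Hypothetical call, following the docstring's selector forms:
# an attribute selector ('[lang]') and a tag.class selector ('p.intro').
sample = u'<p lang="en" class="intro note">Hello</p><div lang="de">Welt</div>'
cleaned, changed = html_cleanup(sample, remove_list=['[lang]', 'p.intro'], log=True)
print changed   # True if any attribute or class was stripped
print cleaned   # prettified markup without lang attributes or the "intro" class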
Example No. 25
 def parse_base(self):
     for page in range(self.pages):
         base_url = "http://www.kissfm.ua/news.html?p=%s" % str(page)
         doc = BeautifulSoup(urlopen(base_url))
         for comm in select(doc, "div.news-item-content"):
             elem = {}
             for item in select(comm, "a.main-item-title"):
                 elem["link"] = item["href"]
                 elem["title"] = item.string
             for item in select(comm, "img"):
                 elem["thumb"] = item["src"]
             for item in select(comm, "div.news-block-item-date"):
                 elem["date"] = item.string.strip()
                         
             self.structure.append(elem)
Example No. 26
def parseStance(stance):
	issue = select(stance, "div.issue div.issuetext")[0].text
	e = select(stance, "div.quotelink")[0]
	if e.text:
		attrs = map(attrSplit, e.text.split("\" quote"))
		attrMap = {}
		for attr in attrs:
			if len(attr) == 2: attrMap[attr[0]] = attr[1]
		value = attrMap["stand"]
		source = attrMap["src"]
	else:
		value = e["quotestand"]
		source = e["quotesrc"]
	value = value == "colgreencheckmark"
	return [issue, value, source]
Example No. 27
 def test_items_in_id(self):
     els = select(self.soup, 'div#inner p')
     self.assertEqual(len(els), 3)
     for el in els:
         self.assertEqual(el.name, 'p')
     self.assertEqual(els[1]['class'], [u'onep'])
     self.assert_(not els[0].has_key('class'))
Example No. 28
    def verifyHtml(self,html):
        #reg = '((class|id)=\")[a-zA-Z0-9\-\_\s]*({})[a-zA-Z0-9\-\_\s]*(\")'

        for tag in self.tagObjects:
            if tag.found:
                continue
            for i,t in enumerate(tag.tags):
                if t.find('*')!=-1:
                    tag.found = True
                    continue
                if t.find(':')!=-1:
                    tag.found = True
                    continue

                #print 'finding matches for :',t
                matches = []
                try:
                    matches = select(html,t)
                except IndexError as e:
                    #print 'Error finding matches',e
                    tag.found = True
                    tag.tagsFound[i] = True 

                if len(matches)>0:
                    tag.found = True
                    tag.tagsFound[i] = True 
                    #print 'Found Match(s)'
                else:
                    pass
Example No. 29
def legendary_necklace_stage(soup):
    necklace = select(soup, "div.necklace")

    necklace = select(necklace[0], "img")

    if necklace:
        necklace = necklace[0]
    else:
        return False

    data_parts = necklace['item-data'].split(".")

    if data_parts[0] == "3300981":
        return data_parts[1]

    return False
Example No. 30
def get_statement_urls(browser, base_url, is_old_statement):
  """Finds the URLs for each other statement, given a base URL for a statement."""

  STATEMENT_SELECT_ID = '#ctl00_ContentInfo_HeaderInformation_drpStatementDates option'

  # There is a <select> on the page with the other statements linked.
  response = browser.open(base_url)

  response_data = response.read()

  soup = BeautifulSoup(response_data)
  statement_dates = select(soup, STATEMENT_SELECT_ID)
  if len(statement_dates) == 0:
    print "Couldn't find statement selector at %s" % base_url
    sys.exit(1)

  statement_urls = []
  if is_old_statement:
    statement_base_url = OLD_URL_STATEMENT
  else:
    statement_base_url = NEW_URL_STATEMENT

  for statement_date in statement_dates:
    url = statement_base_url % urllib.quote(statement_date['value'].split(' ')[0], '')
    statement_urls.append(url)

  time.sleep(CRAWL_DELAY)
  return statement_urls
Example No. 31
def parse_statement_flights(soup, opts=None):
  """Takes a BeautifulSoup instance and finds what flights, if any, it contains."""

  notes = select(soup, "span.Notes")
  if len(notes) % 11 == 0:
    delta = 11
  else:
    delta = 10

  # Every 10 or 11 "Notes" is one entry. 
  entries = [notes[i:i+delta] for i in range(0, len(notes), delta)]

  trips = []
  for entry in entries:
    values = map(lambda x: x.text.strip(), entry)
    num_empty_values = values.count('')
    
    # Entries with lots of blanks are mile transfers or car rentals,
    # so don't include them unless they are desired.
    if num_empty_values > 3:
      if opts and opts.include_non_flights:
        trips.append(values)
    else: 
      # For flights, also try to look up IATA codes since United doesn't provide them.
      if not (opts and opts.skip_iata_codes):
        parser = united_utils.UnitedAirportParser()
        try:
          codes = parser.get_iata_codes(values[2])
          values.extend(codes)
        except ValueError:
          values.extend(('', ''))
      trips.append(values)
  return trips
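The slicing idiom above, which groups the flat list of span.Notes cells into 10- or 11-item entries, is easy to read past; here is the same idiom on plain integers, purely for illustration:

# Standalone illustration of the chunking used in parse_statement_flights.
notes = range(22)                              # stand-in for 22 "Notes" spans
delta = 11 if len(notes) % 11 == 0 else 10
entries = [notes[i:i + delta] for i in range(0, len(notes), delta)]
print len(entries), [len(e) for e in entries]  # -> 2 [11, 11]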
Example No. 32
def add_footnotes_to_sections(sections_strings, footnote_anchors, footnotes):
    j = 0
    for i, ss in enumerate(sections_strings):
        ssoup = BeautifulSoup(ss)
        selector = '.sdfootnoteanc'
        footnote_anchors = select(ssoup, selector)
        #print 'For section ', i, ' found footnotes ' \
        #       '\n'.join([str(anc) for anc in footnote_anchors])

        for k in range(len(footnote_anchors)):
            anchor = footnote_anchors[k]
            footnote = footnotes[j]

            if '#' + anchor['name'] == footnote.p.a['href'] \
                    and '#' + footnote.p.a['name'] == anchor['href']:
                #print 'Found match for footnote', j, \
                #       ' in section ', i, ' anchor ', k
                pass
            else:
                print 'ERROR: wrong match for footnote', j, \
                        ' with anchor ', k, ' from section ', i

            sections_strings[i] = sections_strings[i] + str(footnote)

            j += 1
Example No. 33
 def test_items_in_id(self):
     els = select(self.soup, "div#inner p")
     self.assertEqual(len(els), 3)
     for el in els:
         self.assertEqual(el.name, "p")
     self.assertEqual(els[1]["class"], "onep")
     self.assert_(not els[0].has_key("class"))
Example No. 34
 def get_resources(self, doc):
     resources = []
     for a in select(doc, 'a'):
         url = a.get('href')
         title = a.get('title').lower()
         resources.append((url, title))
     return resources
Example No. 35
def get_photos(race, bibs, path_prefix=''):
    for bib in bibs:
        bib = bib.strip()
        bib_url = FINISHERPIX_URL % (race, bib)
        photo_list_html = requests.get(bib_url)
        soup = Soup(photo_list_html.text)
        photo_list = select(
            soup,
            '#photocommerce-photos-gallery .photocommerce-gallery-photo img')
        for photo in photo_list:
            photo_url = photo['src']
            photo_filename = photo_url.split('/')[-1]
            bib_dir_path = os.path.join(path_prefix, race, bib)
            if path_prefix and not os.path.exists(path_prefix):
                os.makedirs(path_prefix)
            if not os.path.exists(os.path.join(path_prefix, race)):
                os.makedirs(os.path.join(path_prefix, race))
            if not os.path.exists(bib_dir_path):
                os.makedirs(bib_dir_path)
            r = requests.get(photo_url, stream=True)
            if r.status_code == 200:
                with open(os.path.join(bib_dir_path, photo_filename), 'wb') \
                        as f:
                    for chunk in r.iter_content(1024):
                        f.write(chunk)
                    print('Downloaded http:%s to %s' %
                          (photo_url, os.path.join(bib_dir_path,
                                                   photo_filename)))
        return bib_dir_path
Example No. 36
def isohunt_search(q):
    #Query the isohunt search engine and get the results HTML
    q = urllib.quote(q)
    soup = Soup(open_url('http://isohunt.com/torrents/?ihq=%s' % q),
                convertEntities='html',
                markupMassage=hexentityMassage)
    anchors = select(soup, 'a[id^link]')
    anchors = filter(lambda a: a.parent.name == 'td', anchors)
    results = {}
    for a in anchors:
        if str(a.contents[0]) != '0':
            a = Soup(a.renderContents().split("<br />").pop())
            result = ' '.join([
                unicode(node.renderContents())
                if type(node) != NavigableString else unicode(node)
                for node in a.contents
            ])
            result = scene_cleanup(result)
            if result not in results.keys():
                results[result] = 1
            else:
                results[result] += 1

    results = sorted(results.iteritems(), key=operator.itemgetter(1))
    res = []
    for r in results:
        res = [r[0]] + res
    return res
Example No. 37
 def assertSelects(self, selector, expected_ids):
     el_ids = [el['id'] for el in select(self.soup, selector)]
     el_ids.sort()
     expected_ids.sort()
     self.assertEqual(expected_ids, el_ids,
             "Selector %r, expected %r, got %r" % (selector, expected_ids,
                 el_ids))
Example No. 38
def get_statement_urls(browser, base_url, is_old_statement):
    """Finds the URLs for each other statement, given a base URL for a statement."""

    STATEMENT_SELECT_ID = '#ctl00_ContentInfo_HeaderInformation_drpStatementDates option'

    # There is a <select> on the page with the other statements linked.
    response = browser.open(base_url)

    response_data = response.read()

    soup = BeautifulSoup(response_data)
    statement_dates = select(soup, STATEMENT_SELECT_ID)
    if len(statement_dates) == 0:
        print "Couldn't find statement selector at %s" % base_url
        sys.exit(1)

    statement_urls = []
    if is_old_statement:
        statement_base_url = OLD_URL_STATEMENT
    else:
        statement_base_url = NEW_URL_STATEMENT

    for statement_date in statement_dates:
        url = statement_base_url % urllib.quote(
            statement_date['value'].split(' ')[0], '')
        statement_urls.append(url)

    time.sleep(CRAWL_DELAY)
    return statement_urls
Example No. 39
    def _apply_styles(self):
        """Steps through CSS rules and applies each to all the proper elements
        as @style attributes prepending any current @style attributes.
        """
        rules = self.stylesheet.cssRules.rulesOfType(1)
        elem_prop_map = {}
        elem_style_map = {}

        # build up a property list for every styled element
        for rule in rules:
            # select elements for every selector
            selectors = map(lambda s: s.strip(), rule.selectorText.split(','))
            elements = []

            for selector in selectors:
                try:
                    elements += select(self.soup, selector)
                except SelectorNotSupportedException, ex:
                    if self.ingore_unsupported_selectors:
                        pass
                    else:
                        raise

            # build prop_list for each selected element
            for elem in elements:
                if elem not in elem_prop_map:
                    elem_prop_map[elem] = []
                elem_prop_map[elem].append({
                    'specificity': self._get_rule_specificity(rule),
                    'props': rule.style.getProperties(),
                })
Example No. 40
 def get_resources(self, doc):
     resources = []
     for a in select(doc, 'a'):
         url = a.get('href')
         title = a.get('title').lower()
         resources.append((url, title))
     return resources
Example No. 41
 def assertSelects(self, selector, expected_ids):
     el_ids = [el['id'] for el in select(self.soup, selector)]
     el_ids.sort()
     expected_ids.sort()
     self.assertEqual(
         expected_ids, el_ids, "Selector %s, expected [%s], got [%s]" %
         (selector, ', '.join(expected_ids), ', '.join(el_ids)))
Example No. 42
def add_footnotes_to_sections(sections_strings, footnote_anchors, footnotes):
    j = 0
    for i, ss in enumerate(sections_strings):
        ssoup = BeautifulSoup(ss)
        selector = '.sdfootnoteanc'
        footnote_anchors = select(ssoup, selector)
        #print 'For section ', i, ' found footnotes ' \
        #       '\n'.join([str(anc) for anc in footnote_anchors])

        for k in range(len(footnote_anchors)):
            anchor = footnote_anchors[k]
            footnote = footnotes[j]

            if '#' + anchor['name'] == footnote.p.a['href'] \
                    and '#' + footnote.p.a['name'] == anchor['href']:
                #print 'Found match for footnote', j, \
                #       ' in section ', i, ' anchor ', k
                pass
            else:
                print 'ERROR: wrong match for footnote', j, \
                        ' with anchor ', k, ' from section ', i

            sections_strings[i] = sections_strings[i] + str(footnote)

            j += 1
Example No. 43
 def test_items_in_id(self):
     els = select(self.soup, 'div#inner p')
     self.assertEqual(len(els), 3)
     for el in els:
         self.assertEqual(el.name, 'p')
     self.assertEqual(els[1]['class'], 'onep')
     self.assert_(not els[0].has_key('class'))
Example No. 44
def parse_statement_flights(soup, opts=None):
    """Takes a BeautifulSoup instance and finds what flights, if any, it contains."""

    notes = select(soup, "span.Notes")
    if len(notes) % 11 == 0:
        delta = 11
    else:
        delta = 10

    # Every 10 or 11 "Notes" is one entry.
    entries = [notes[i:i + delta] for i in range(0, len(notes), delta)]

    trips = []
    for entry in entries:
        values = map(lambda x: x.text.strip(), entry)
        num_empty_values = values.count('')

        # Entries with lots of blanks are mile transfers or car rentals,
        # so don't include them unless they are desired.
        if num_empty_values > 3:
            if opts and opts.include_non_flights:
                trips.append(values)
        else:
            # For flights, also try to look up IATA codes since United doesn't provide them.
            if not (opts and opts.skip_iata_codes):
                parser = united_utils.UnitedAirportParser()
                try:
                    codes = parser.get_iata_codes(values[2])
                    values.extend(codes)
                except ValueError:
                    values.extend(('', ''))
            trips.append(values)
    return trips
Example No. 45
def imdb_movie(url):
    response = open_url(url, html=False)
    soup = Soup(response.read(),
                convertEntities='html',
                markupMassage=hexentityMassage)

    #Parse the HTML, fetch movie names and their corresponding page URLs
    h1 = select(soup, 'h1.header')
    return {'url': url, 'title': h1[0].contents[0].strip()}
Example No. 46
    def get_content(url, num):
        if num[0] >= max_num:
            return
        response = urllib2.urlopen(url)
        msg = response.read()
        soup = BS(''.join(msg))

        news_body = select(soup, 'div.blkContainerSblk')
        if len(news_body) == 1:
            num[0] = num[0] + 1
            title = select(soup, 'h1#artibodyTitle')
            time = select(soup, 'span#pub_date')
            content = select(soup, 'div#artibody.blkContainerSblkCon')
            founds.append({u'url': url, u'timestamp': datetime.now()})

        links = [a['href'] for a in select(soup, 'a') if a.get('href')]
        for link in links:
            if link.find("news.sina.com.cn") != -1:
                get_content(link, num)
Example No. 47
def search(query):
    query = query.replace(' ', '%20')
    query = query.replace('&', '')  # don't want & in get query parameters
    url = base_url + query
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    products = []
    food_descriptions = soupselect.select(soup, 'div.food_description')
    for food_description in food_descriptions:
        try:
            anchors = soupselect.select(food_description, 'a')
            match = re.search(">.+<\/a>", str(anchors[0]))
            name = match.group(0).replace('</a>', '').replace('>', '')
            match = re.search(">.+<\/a>", str(anchors[1]))
            brand = match.group(0).replace('</a>', '').replace('>', '')
            product = {}
            product['name'] = name
            product['brand'] = brand
            products.append(product)
        except:
            pass

    index = 0
    nutritional_info = soupselect.select(soup, 'div.nutritional_info')
    nutritional_data = soupselect.select(soup, 'div.nutritional_facts')
    # print(requests.get('https://www.myfitnesspal.com/food/search?search=chick%20fil%20a%20nuggets').text)
    for nutrition in nutritional_info:
        try:
            product = products[index]
            matches = re.finditer("<label>.+<\/label>.+(,|[\s|\t|\n]*<\/div>)",
                                  str(nutrition))
            for match in matches:
                s = match.group(0)
                label = s[:s.index('</label>')]
                label = label.replace('<label>', '').replace(':', '').strip()
                label_value = s[s.index("</label>"):]
                label_value = label_value.replace('</label>', '').replace(
                    ',', '').replace('</div>', '').strip()
                product[label] = label_value
            index += 1
        except:
            pass
    return products[:5]
Example No. 48
def scrape(postcode,page=1):
	if page == 1:	print "*************** %s *******************"%postcode	
	
	post_url = 'http://finddrivinginstructor.direct.gov.uk/DSAFindNearestWebApp/findNearest.form?postcode=%s&pageNumber=%s'%(postcode,page)
	resp = requests.get(post_url).text
	
	if len(resp) < 8000:
		print "Invalid post codes"
		return 'Invalid'

	soup = BeautifulSoup(resp)
	results_list = select(soup, 'ul.results-list li')
	
	if len(results_list) == 0:		
		print "No more pages left."
		return "no pages left"
	
	print "Page %s"%page

	for i in results_list:
		name = select(i,'h3')[0].get_text()
		detail1 = select(i,'div.instructor-details')[0]
		mail = select(detail1,'a')[0].get('href').split(":")[-1]
		phone = select(detail1,'span')[0].get_text()
		detail2 = select(i,'div.instructor-details')[1]
		
		try:
			select(detail2,'span.cpd')[0]
			cpd = True
		except IndexError:
			cpd = False
			pass
		try:
			select(detail2,'span.cop')[0]
			cop = True
		except IndexError:
			cop = False
		print name,mail,phone,cpd,cop
		with open("go.txt", "a") as myfile1:
			myfile1.write("%s|%s|%s|%s|%s|%s\n"%(postcode,name,mail,phone,cpd,cop))

	return scrape(postcode,page+1)
Example No. 49
def parse_row(tr):
    if not tr:
        return None
    address = select(tr, '.address-tag a')
    if len(address) != 3:
        warning(tr)
        return None
    tx_id = address[0].contents[0]
    tx_from = address[1].contents[0]
    tx_to = address[2].contents[0]
    tds = select(tr, 'td')
    val = tds[6].text
    num_val = float(val.replace('Ether', '').replace(
        ' ', '')) if 'Ether' in val else 0
    return {
        'tx_id': strip(tx_id),
        'from': strip(tx_from),
        'to': strip(tx_to),
        'value': num_val
    }
Example No. 50
def download_subtitle_file_from_episode(url, file_name):
    # first extract the download url
    r2 = requests.get(url)
    requestText = str(r2.text.encode("utf-8"))
    notUnicode = requestText.decode('unicode-escape')
    notUnicodeStr = str(notUnicode.encode("utf-8"))
    #print notUnicodeStr
    soup = BeautifulSoup(notUnicodeStr, 'html.parser')
    download_url = 'https://subscene.com' + select(soup, '#downloadButton')[0]['href']
    print download_url
    urllib.urlretrieve(download_url, file_name + ".zip")
Example No. 51
def get_family(character_id):
    # print character_id
    file_name = character_id.split("/")[-1]
    wikia = BeautifulSoup(
        open("data/wikia/characters/{0}".format(file_name), "r"),
        "html.parser")
    family_element = [
        tag for tag in select(wikia, 'h3') if tag.text == "Family"
    ]
    if len(family_element) > 0:
        family = family_element[0].next_sibling.next_sibling
        collapsed = select(family, "div.mw-collapsed")

        if len(collapsed) > 0:
            return extract_houses(
                select(family, "div.mw-collapsed")[0].contents)
        else:
            return extract_houses(family.contents)
    else:
        return []
Example No. 52
 def parse(self):
     if not self.soup:
         return
     book = {}
     for a in select(self.soup, 'ul.detailsList li h3 a'):
         link = self.url.replace('/cz/', '') + a['href']
         xa = str(select(a, 'span.given-name')[0].string)
         xb = str(select(a, 'span.family-name')[0].string)
         name = u"%s %s" % (
             # select(a, 'span.given-name')[0].string.capitalize(),
             # select(a, 'span.family-name')[0].string.capitalize()
             xa.decode('utf-8'),
             xb.decode('utf-8')
         )
         key = name.split(u' ')
         key.reverse()
         key = slugify(u" ".join(key))
         book[key] = (name, link)
     keys = sorted(book.keys())
     self.data = [book[k] for k in keys]
Example No. 53
def remove_footnotes_from_last_section(sections_strings):
    last_ss = sections_strings[-1]
    last_ssoup = BeautifulSoup(last_ss)
    footnote_anchors = find_footnotes_and_anchors(last_ssoup)

    last_ssfootnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i + 1)
        last_ssfootnotes.extend(select(last_ssoup, selector))

    for f in last_ssfootnotes:
        f.extract()
Example No. 54
def download_subtitles():
    #seasons = ['https://subscene.com/subtitles/game-of-thrones-first-season', 'https://subscene.com/subtitles/game-of-thrones-second-season', 'https://subscene.com/subtitles/game-of-thrones-third-season', 'https://subscene.com/subtitles/game-of-thrones-fourth-season', 'https://subscene.com/subtitles/game-of-thrones-fifth-season-2015']
    seasons = ['https://subscene.com/subtitles/game-of-thrones-fourth-season']
    season_counter = 4
    for season in seasons:
        r2 = requests.get(season)
        requestText = str(r2.text.encode("utf-8"))
        notUnicode = requestText.decode('unicode-escape')
        notUnicodeStr = str(notUnicode.encode("utf-8"))
        #print notUnicodeStr
        soup = BeautifulSoup(notUnicodeStr, 'html.parser')
        rows = select(soup, '.a1')

        for episode_number in range(1,2):
            if(episode_number < 10):
                file_name = 'S0' + str(season_counter) + 'E0' + str(episode_number)
            else:
                file_name = 'S0' + str(season_counter) + 'E' + str(episode_number)

            print file_name

            for row in rows:
                #print row
                soup = BeautifulSoup(str(row.encode("utf-8")), 'html.parser')
                spans = select(soup, 'a span')
                language = spans[0].text.strip()
                title = spans[1].text.strip()
                if(language == 'English'):
                    if(file_name in title):

                        print title
                        #print 
                        aTag = select(soup, 'a')
                        url = 'https://subscene.com' + aTag[0]['href']
                        print url
                        download_subtitle_file_from_episode(url, file_name)
                        break


        season_counter += 1
Example No. 55
 def files_info(self):
     html = Soup(urllib.urlopen(self.url))
     urls = select(html, 'li.file')
     files_info = {}
     for f in urls:
         fileinfo = {
             'url': self.get_url(f),
             'name': self.get_name(f),
             'size': self.get_size(f),
             'modified': self.get_modified(f),
         }
         files_info[fileinfo['name']] = fileinfo
     return files_info
Example No. 56
 def get_size(self, html):
     # TODO does not exist for folders
     size_desc = select(html, 'span.file_size')
     if not size_desc:
         return 0
     size_desc = size_desc[0].text
     size = int(re.search('([\d]+)', size_desc).groups()[0])
     if "gb" in size_desc.lower():
         size = int(float(size) * 1024 * 1024 * 1024)
     elif "mb" in size_desc.lower():
         size = int(float(size) * 1024 * 1024)
     elif "kb" in size_desc.lower():
         size = int(float(size) * 1024)
     return size
Example No. 57
def propagate_styles(soup):
    style_contents = str(soup.html.head.style.contents[0])
    styles = style_contents.split('\n')
    for s in styles:
        s = s.strip()
        if s == '' or s == '<!--' or s == '-->':
            continue
        if s.startswith('@page'):
            continue
        selector, rest = s.split('{', 1)
        style = rest.split('}', 1)[0]
        for tag in select(soup, selector.lower()):
            if tag.has_key('style'):
                tag['style'] += style
            else:
                tag['style'] = style
Example No. 58
def get_houses(character_id):
    print character_id
    file_name = character_id.split("/")[-1]
    wikia = BeautifulSoup(
        open("data/wikia/characters/{0}".format(file_name), "r"),
        "html.parser")
    allegiance_element = [
        tag for tag in select(wikia, 'h3') if tag.text == "Allegiance"
    ]

    if len(allegiance_element) > 0:
        houses_elements = allegiance_element[
            0].next_sibling.next_sibling.contents
        return extract_houses(houses_elements)
    else:
        return []