def main():
    """Scrape the sale page and e-mail any skate products in a wanted size.

    Fetches ``saleurl``, collects the text of every product-title ``<h2>``
    whose text contains at least one of the sizes in ``AIM_SIZE``, and, if
    anything matched, mails the HTML list (prefixed with ``URL_ADDR``) to
    ``tolist``.
    """
    data = getpage(saleurl)
    soup = BeautifulSoup(data)
    titles = soup.findAll('h2', {"class": "wdk_shopproduct-title"})
    lst = []
    for tag in titles:
        text = tag.string
        # .string is None when the <h2> contains nested markup —
        # skip those instead of crashing on None.find().
        if text is None:
            continue
        # any() also avoids appending the same title once per matching size.
        if any(aim in text for aim in AIM_SIZE):
            lst.append("<li>%s</li>" % text)
    infos = htmlinfo(lst)
    if infos:
        mailer.send(URL_ADDR + infos, tolist, sub='skate sale')
def get_active_mps():
    """Return the list of BIDs (parliament IDs) of currently active MPs.

    Fetches ``ACTIVE_MP_URL``, parses the results table and extracts the
    numeric BID from the first link of each even/odd result row.

    Raises:
        Re-raises any fetch/parse error after logging a warning.
    """
    ids = []
    try:
        logger.info("Fetching active MP list...")
        active_mp_list = getpage(ACTIVE_MP_URL)
        soup = BeautifulSoup(active_mp_list, "lxml")
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate
    # untouched; many HTTP or parse errors can occur here.
    except Exception:
        logger.warning('Active MP page could not be fetched.')
        raise
    table_mps = soup.find('table', 'ARTabResultados')
    mps = table_mps.findAll('tr', 'ARTabResultadosLinhaPar')
    mps += table_mps.findAll('tr', 'ARTabResultadosLinhaImpar')
    for mp in mps:
        mpurl = mp.td.a['href']
        # The BID is the numeric suffix after 'BID=' in the row's link URL.
        mp_bid = int(mpurl[mpurl.find('BID=') + 4:])
        ids.append(mp_bid)
    logger.info('Active MP list created.')
    return ids
def get_active_mps():
    """Return the list of BIDs (parliament IDs) of currently active MPs.

    Downloads ``ACTIVE_MP_URL``, walks the results table (even and odd
    rows) and pulls the numeric BID out of each row's first link.
    """
    try:
        logger.info("Fetching active MP list...")
        active_mp_list = getpage(ACTIVE_MP_URL)
        soup = BeautifulSoup(active_mp_list, "lxml")
    except Exception:
        # Many HTTP or parse errors can occur here; log and propagate.
        logger.warning('Active MP page could not be fetched.')
        raise
    table_mps = soup.find('table', 'ARTabResultados')
    rows = table_mps.findAll('tr', 'ARTabResultadosLinhaPar')
    rows += table_mps.findAll('tr', 'ARTabResultadosLinhaImpar')
    # The BID is the numeric suffix following 'BID=' in each row's link.
    ids = [
        int(href[href.find('BID=') + 4:])
        for href in (row.td.a['href'] for row in rows)
    ]
    logger.info('Active MP list created.')
    return ids
def main():
    """Check each target product's discounted price and mail an alert.

    For every ``(url, threshold, product_name)`` in ``Targets``:
      - read the previously stored price from ``<product_name>.old``;
      - scrape the current price, apply the 15%-off multiplier and subtract
        any coupon found on the page;
      - persist the new effective price when it changed;
      - send a mail when no price was stored yet, or when the effective
        price dropped below both the threshold and the stored price.
    """
    for (url, threshold, product_name) in Targets:
        fprice = product_name + '.old'
        fullname = cur_fname(fprice)
        old_price = get_price(fullname)
        res = getpage(url)
        soup = BeautifulSoup.BeautifulSoup(res)
        sp = soup.find('span', id='priceblock_ourprice')
        if sp is None:
            # Price element missing (changed page layout, deal page, or a
            # block page) — skip this target instead of crashing on None.text.
            continue
        # Drop the leading currency symbol and any thousands separators
        # ("$1,234.56" would otherwise make float() raise ValueError).
        price = float(sp.text[1:].replace(',', ''))
        realprice = price * .85 - get_coupon(soup)
        if old_price != realprice:
            replace_file(fullname, str(realprice))
        if old_price == 0 or (realprice < threshold and realprice < old_price):
            url_addr = '<a href="%s" >%s</a></br>' % ((url, ) * 2)
            subject = '%s < %s' % (product_name, threshold)
            mailer.send(url_addr + 'realprice: %s\n old_price: %s'
                        % (realprice, old_price), tolist, sub=subject)
def process_mp(i):
    """Scrape the parliament biography page for MP with BID *i*.

    Returns an OrderedDict with the MP's id, names, slug, URLs, biographic
    fields, commissions, awards and mandate history — or None (implicitly)
    when the page has no name span, i.e. the BID does not correspond to
    an MP biography page.
    """
    url = MP_BIO_URL_FORMATTER % i
    soup = BeautifulSoup(getpage(url), "lxml")
    name = soup.find('span', id=RE_NAME)
    if name:
        # logger.info("Got hit for ID %d" % i)
        # Pull every biography fragment up front; each may be None and is
        # guarded before use below.
        short = soup.find('span', id=RE_SHORT)
        birthdate = soup.find('span', id=RE_BIRTHDATE)
        party = soup.find('span', id=RE_PARTY)
        occupation = soup.find('div', id=RE_OCCUPATION)
        education = soup.find('div', id=RE_EDUCATION)
        current_jobs = soup.find('div', id=RE_CURRENT_JOBS)  # ;)
        jobs = soup.find('div', id=RE_JOBS)  # ;)
        awards = soup.find('div', id=RE_AWARDS)
        coms = soup.find('div', id=RE_COMS)
        mandates = soup.find('table', id=RE_MANDATES)
        # NOTE(review): assumes the photo cell and its <img> always exist —
        # this line raises AttributeError otherwise; confirm against pages.
        image_src = soup.find('td', {'class': 'tdFotoBio'}).img['src']
        mprow = OrderedDict()
        mprow['id'] = i
        mprow['name'] = name.text
        mprow['url_parlamento'] = url
        # Check whether this is one of the names we must rectify.
        if short.text in SHORTNAME_REPLACES:
            t = SHORTNAME_REPLACES[short.text]
        else:
            t = short.text
        # Specific disambiguation cases (same short name, different MPs).
        if mprow['id'] == 4194:
            # Jorge Costa -> Jorge Duarte Costa
            t = "Jorge Duarte Costa"
        elif t == "Carla Tavares" and mprow['id'] == 1634:
            t = "Carla Tavares Gaspar"
        elif t == u"António Rodrigues" and mprow['id'] == 1132:
            t = u"António Costa Rodrigues"
        elif t == "Paulo Neves" and mprow['id'] == 1360:
            t = "Paulo Santos Neves"
        elif t == "Carlos Pereira" and mprow['id'] == 29:
            t = "Carlos Lopes Pereira"
        # NOTE(review): as written this replace is a no-op (space -> space);
        # it likely was meant to collapse doubled/non-breaking spaces —
        # confirm against the original file.
        t = t.replace(" ", " ")
        mprow['shortname'] = t
        mprow['slug'] = slugify(t)
        mprow['url_democratica'] = 'http://demo.cratica.org/deputados/%s/' % slugify(t)
        if birthdate:
            mprow['birthdate'] = birthdate.text
        if party:
            mprow['party'] = party.text
        if education:
            mprow['education'] = extract_details(education)
        if occupation:
            mprow['occupation'] = extract_details(occupation)
        if jobs:
            mprow['jobs'] = extract_multiline_details(jobs)
        if current_jobs:
            mprow['current_jobs'] = extract_multiline_details(current_jobs)
        if coms:
            commissions = extract_details(coms)
            if commissions:
                mprow['commissions'] = commissions
        if awards:
            mprow['awards'] = extract_multiline_details(awards)
        if mandates:
            # TODO: this block may take advantage of the new functions
            mprow['mandates'] = []
            # First <tr> is the table header; each following row is a mandate.
            for each in mandates.findAll('tr')[1:]:
                leg = each.findAll('td')
                l = leg[0].text
                number, start, end = parse_legislature(l)
                end = end.rstrip(']\n')
                mandate = OrderedDict()
                mandate['legislature'] = number
                mandate['party'] = leg[5].text
                mandate['constituency'] = leg[4].text
                mandate['start_date'] = start
                mandate['end_date'] = end
                if leg[2].find("a"):
                    # parliamentary activity link (made absolute if relative)
                    url = leg[2].find("a")['href']
                    if not url.startswith("http://"):
                        url = "http://www.parlamento.pt" + url
                    mandate['activity_url'] = url
                if leg[3].find("a"):
                    # register-of-interests link (made absolute if relative)
                    url = leg[3].find("a")['href']
                    if not url.startswith("http://"):
                        url = "http://www.parlamento.pt" + url
                    mandate['interest_url'] = url
                mprow['mandates'].append(mandate)
        if image_src:
            mprow['image_url'] = image_src
        # mprow['url_hemiciclo'] = 'http://hemiciclo.pt/%s/%d/' % (mprow['party'].lower().replace("-pp", ""), i)
        logger.info("Scraped MP: %s" % short.text)
        return mprow
def process_mp(i):
    """Scrape the parliament biography page for MP with BID *i*.

    Returns an OrderedDict with the MP's id, names, slug, URLs, biographic
    fields, commissions, awards and mandate history — or None (implicitly)
    when the page has no name span, i.e. the BID does not correspond to
    an MP biography page.
    """
    logger.debug("Trying ID %d..." % i)
    url = MP_BIO_URL_FORMATTER % i
    soup = BeautifulSoup(getpage(url), "lxml")
    name = soup.find('span', id=RE_NAME)
    if name:
        name = name.text
        # Pull every biography fragment up front; each may be None and is
        # guarded before use below.
        short = soup.find('span', id=RE_SHORT)
        birthdate = soup.find('span', id=RE_BIRTHDATE)
        party = soup.find('span', id=RE_PARTY)
        occupation = soup.find('div', id=RE_OCCUPATION)
        education = soup.find('div', id=RE_EDUCATION)
        current_jobs = soup.find('div', id=RE_CURRENT_JOBS)  # ;)
        jobs = soup.find('div', id=RE_JOBS)  # ;)
        awards = soup.find('div', id=RE_AWARDS)
        coms = soup.find('div', id=RE_COMS)
        mandates = soup.find('table', id=RE_MANDATES)
        # NOTE(review): assumes the photo cell and its <img> always exist —
        # this line raises AttributeError otherwise; confirm against pages.
        image_src = soup.find('td', {'class': 'tdFotoBio'}).img['src']
        # Apply full-name corrections from the replacement table.
        if name in NAME_REPLACES:
            name = NAME_REPLACES[name]
        mprow = OrderedDict()
        mprow['id'] = i
        mprow['name'] = name
        mprow['url_parlamento'] = url
        # Check whether this is one of the names we must rectify.
        if short.text in SHORTNAME_REPLACES:
            t = SHORTNAME_REPLACES[short.text]
        else:
            t = short.text
        # Specific disambiguation cases (same short name, different MPs).
        if t == "Jorge Costa" and party.text == 'BE':
            t = "Jorge Duarte Costa"
        elif t == "Carla Tavares" and mprow['id'] == 1634:
            t = "Carla Tavares Gaspar"
        elif t == u"António Rodrigues" and mprow['id'] == 1132:
            t = u"António Costa Rodrigues"
        elif t == "Paulo Neves" and mprow['id'] == 1360:
            t = "Paulo Santos Neves"
        elif t == "Carlos Pereira" and mprow['id'] == 29:
            t = "Carlos Lopes Pereira"
        mprow['shortname'] = t
        mprow['slug'] = slugify(t)
        mprow['url_democratica'] = DEMOCRATICA_URL % slugify(t)
        if birthdate:
            mprow['birthdate'] = birthdate.text
        if party:
            mprow['party'] = party.text
        if education:
            mprow['education'] = extract_details(education)
        if occupation:
            mprow['occupation'] = extract_details(occupation)
        if jobs:
            mprow['jobs'] = extract_multiline_details(jobs)
        if current_jobs:
            mprow['current_jobs'] = extract_multiline_details(current_jobs)
        if coms:
            commissions = extract_details(coms)
            if commissions:
                mprow['commissions'] = commissions
        if awards:
            mprow['awards'] = extract_multiline_details(awards)
        if mandates:
            # TODO: this block may take advantage of the new functions
            mprow['mandates'] = []
            # First <tr> is the table header; each following row is a mandate.
            for each in mandates.findAll('tr')[1:]:
                leg = each.findAll('td')
                l = leg[0].text
                number, start, end = parse_legislature(l)
                end = end.rstrip(']\n')
                mandate = OrderedDict()
                mandate['legislature'] = number
                mandate['party'] = leg[5].text
                mandate['constituency'] = leg[4].text
                mandate['start_date'] = start
                mandate['end_date'] = end
                if leg[2].find("a"):
                    # parliamentary activity link (made absolute if relative)
                    url = leg[2].find("a")['href']
                    if not url.startswith("http://"):
                        url = "http://www.parlamento.pt" + url
                    mandate['activity_url'] = url
                if leg[3].find("a"):
                    # register-of-interests link (made absolute if relative)
                    url = leg[3].find("a")['href']
                    if not url.startswith("http://"):
                        url = "http://www.parlamento.pt" + url
                    mandate['interest_url'] = url
                mprow['mandates'].append(mandate)
        if image_src:
            mprow['image_url'] = image_src
        logger.info("Scraped MP: %s" % short.text)
        return mprow