def get_items(string_to_search):
    """Search Discogs for *string_to_search* and return a list of Item objects.

    Fetches the marketplace HTML via get_data_from_server, then scrapes the
    results table (class ``mpitems``), one Item per data row.

    @param string_to_search: query string passed to the server fetch
    @return: list of populated Item instances (may be empty)
    """
    data = get_data_from_server(string_to_search)
    soup = BeautifulSoup(''.join(data))
    items = []  # renamed: don't shadow the builtin `list`
    table = soup.find('table', {'class': 'mpitems'})
    # Skip the header row with a slice instead of deque/popleft.
    for row in table.findAll('tr')[1:]:
        cols = row.findAll('td', recursive=False)
        new_item = Item()
        image = cols[0].find('img')
        if image:
            new_item.image = image['src']
        title_span = cols[1].find('span', {'class': 'br_item_title'})
        if title_span:
            new_item.title = unicode(title_span.a.string)
            new_item.link = 'http://www.discogs.com' + title_span.a['href']
        new_item.fromPage = 'Discogs'
        price_span = cols[4].find('span', {'class': 'price'})
        if price_span:
            new_item.price = unicode(price_span.string)
        items.append(new_item)
    return items
def get(self):
    """Handle GET: index the (title, link) pair from the query string,
    then render the main template.

    Reads request params ``t`` (term/title) and ``l`` (link), wraps them
    in an Item, hands it to index_item_and_store_item, and responds with
    templates/main.html rendered with no template values.
    """
    item = Item()
    item.title = self.request.get('t')
    item.link = self.request.get('l')
    index_item_and_store_item(item)
    template_path = os.path.join(os.path.dirname(__file__), 'templates/main.html')
    self.response.out.write(template.render(template_path, {}))
def parse_data_from_server(html_data):
    """Parse TodoColeccion search-result HTML into a list of Item objects.

    Scrapes every ``div.item`` block; an Item is emitted only when the
    name anchor (``a.nombre sin_subrayar``) is present.

    @param html_data: raw HTML string (or BeautifulSoup-accepted input)
    @return: list of populated Item instances (may be empty)
    """
    soup = BeautifulSoup(html_data)
    items = []  # renamed: don't shadow the builtin `list`
    for div in soup.findAll('div', {'class': 'item'}):
        name = div.find('a', {'class': 'nombre sin_subrayar'})
        if not name:
            continue
        new_item = Item()
        new_item.title = unicode(name.string)
        # Price scraping is not implemented for this source yet.
        new_item.price = ''
        new_item.link = "http://www.todocoleccion.net" + name['href']
        # Guard the photo container like the Discogs scraper does, so a
        # missing thumbnail doesn't abort the whole parse.
        foto = div.find('div', {'class': 'foto'})
        if foto and foto.img:
            new_item.image = foto.img['src']
        new_item.fromPage = 'TodoColeccion'
        items.append(new_item)
    return items
def parse_item(self, response, url):
    """
    parse the response of a `python requests.get()`

    @param response - an object of `requests.get()`
    @param url - current url object that has been crawled
    @return item - an Item with name_jp/name_zh/link filled in when a
                   ``<meta name="description">`` tag was found
    """
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    it = Item()
    for meta in soup.find_all('meta'):
        if meta.get('name') == 'description':
            it.name_jp = meta.get('content')
            # trans.translate is an external call — do it once; stop at
            # the first description meta instead of re-translating for
            # every duplicate tag on the page.
            it.name_zh = trans.translate(it.name_jp)
            it.link = response.url
            break
    # Queue any newly-discovered links for crawling.
    self.feed_new_urls(soup, url)
    return it