def crawl_top_circulations(type, query):
    """Crawl yearly top-circulation tables and persist each row as a Circulation.

    Args:
        type: rank-type identifier stored on every Circulation row.
            NOTE(review): shadows the builtin ``type``; name kept so existing
            keyword-argument callers keep working.
        query: forwarded verbatim to get_pages() to fetch the result pages.

    Side effects:
        Adds one Circulation per table row to db.session and commits once
        at the end of the crawl.
    """
    for content in get_pages(query):
        table = build_soup(content).find('table', 'listview')
        year_tag = table.get('summary')
        # Raw string: '\d'/'\S' in a plain literal are invalid escape
        # sequences (DeprecationWarning today, SyntaxError in the future).
        year, tag = re.findall(r'(\d+)年(\S+)', year_tag)[0]
        for row in table.find_all('tr')[1:]:
            try:
                rk, title, ref, cnt = row.findChildren()
            except ValueError:
                # for year 2003, there's no <a> tag
                rk, title, cnt = row.findChildren()
                ref = None
            circulation = Circulation(  # fixed typo: was `circulaion`
                type=type,
                bookname=title.text.strip(' /'),
                url=ref.get('href') if ref else None,
                rank=rk.text,
                tag=tag,
                year=year,
                count=cnt.text
            )
            db.session.add(circulation)
    # Commit once per crawl instead of once per row: same final DB state,
    # far fewer round-trips, and the whole crawl is atomic per invocation.
    db.session.commit()
def get_circulation_links():
    """Collect every anchor on the top-circulation index pages.

    Returns:
        list of tuples: ({'text': anchor text, 'href': raw href},
        absolute URL resolved against the top-circulations base).
    """
    index_pages = [
        nthu_library_url.top_circulations,
        nthu_library_url.top_circulations_bc2007,
    ]
    links = []
    for resp in get_pages(index_pages):
        container = build_soup(resp).find(id='cwrp')
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            links.append((
                {'text': anchor.text, 'href': href},
                urljoin(nthu_library_url.top_circulations, href),
            ))
    return links
def crawl_lost_objects(data):
    """Scrape the lost-and-found table into a list of record dicts.

    Args:
        data: POST payload forwarded to the lost-and-found search page.

    Returns:
        list of dicts with keys 'id', 'time', 'place', 'description',
        one per table row (header row skipped).
    """
    response = post_page(nthu_library_url.lost_found_url, data=data)
    return [
        {
            'id': row.select('td:nth-of-type(1)')[0].text,
            'time': row.select('td:nth-of-type(2)')[0].text,
            'place': row.select('td:nth-of-type(3)')[0].text,
            'description': row.select('td:nth-of-type(4)')[0].text,
        }
        for row in build_soup(response).select('table > tr')[1:]
    ]
def crawl_lost_objects(data):
    """Scrape lost-and-found rows, splitting the description cell into the
    free-text description and the library system id.

    Args:
        data: POST payload forwarded to the lost-and-found search page.

    Returns:
        list of dicts with keys 'id', 'time', 'place', 'description',
        'system_id' (None when no numeric id is present).
    """
    response = post_page(nthu_library_url.lost_found_url, data=data)
    lost_items = list()
    for item in build_soup(response).select('table > tr')[1:]:
        # The 4th cell packs the description and a system-id line,
        # separated by a literal '\r\n'.
        parts = [s.strip()
                 for s in item.select('td:nth-of-type(4)')[0].text.split('\r\n')]
        # Guard against a cell with no second line (original indexed parts[1]
        # unconditionally and would raise IndexError). Raw string avoids the
        # invalid '\d' escape warning.
        sysid = re.search(r'\d+', parts[1]) if len(parts) > 1 else None
        lost_items.append({
            'id': item.select('td:nth-of-type(1)')[0].text,
            'time': item.select('td:nth-of-type(2)')[0].text,
            'place': item.select('td:nth-of-type(3)')[0].text,
            'description': parts[0],
            'system_id': sysid.group() if sysid else None
        })
    return lost_items
def crawl_top_circulations(query):
    """Crawl yearly top-circulation tables.

    Args:
        query: forwarded to get_pages() to fetch the result pages.

    Returns:
        dict mapping each table's ``summary`` attribute to a list of dicts
        with keys 'rank', 'book_name', 'link', 'circulations'.
    """
    results = dict()
    for content in get_pages(query):
        table = build_soup(content).find('table', 'listview')
        entries = []
        for row in table.find_all('tr')[1:]:
            cells = row.findChildren()
            if len(cells) == 4:
                rank_cell, title_cell, anchor, count_cell = cells
            else:
                # for year 2003, there's no <a> tag
                rank_cell, title_cell, count_cell = cells
                anchor = None
            entries.append({
                'rank': rank_cell.text,
                'book_name': title_cell.text.strip(' /'),
                'link': anchor.get('href') if anchor else None,
                'circulations': count_cell.text
            })
        results[table.get('summary')] = entries
    return results
def crawl_top_circulations(query):
    """Crawl yearly top-circulation tables.

    Args:
        query: forwarded to get_pages() to fetch the result pages.

    Returns:
        dict mapping each table's ``summary`` attribute to a list of dicts
        with keys 'rank', 'bookname', 'link', 'times'.
    """
    summaries = {}
    for page in get_pages(query):
        table = build_soup(page).find('table', 'listview')
        parsed = []
        for row in table.find_all('tr')[1:]:
            children = row.findChildren()
            try:
                rank_td, name_td, link_a, times_td = children
            except ValueError:
                # for year 2003, there's no <a> tag
                rank_td, name_td, times_td = children
                link_a = None
            parsed.append({
                'rank': rank_td.text,
                'bookname': name_td.text.strip(' /'),
                'link': link_a.get('href') if link_a else None,
                'times': times_td.text
            })
        summaries[table.get('summary')] = parsed
    return summaries
def crawl_top_circulations(rank_type, query):
    """Crawl yearly top-circulation tables, tagging rows with year/tag/type.

    Args:
        rank_type: identifier copied into every row's 'type' field.
        query: forwarded to get_pages() to fetch the result pages.

    Returns:
        dict mapping each table's ``summary`` attribute to a list of dicts
        with keys 'type', 'book_name', 'url', 'rank', 'tag', 'year',
        'circulations'.
    """
    results = dict()
    for content in get_pages(query):
        table = build_soup(content).find('table', 'listview')
        books = list()
        year_tag = table.get('summary')
        # Raw string: '\d'/'\S' in a plain literal are invalid escape
        # sequences (DeprecationWarning today, SyntaxError in the future).
        year, tag = re.findall(r'(\d+)年(\S+)', year_tag)[0]
        for row in table.find_all('tr')[1:]:
            try:
                rk, title, ref, cnt = row.findChildren()
            except ValueError:
                # for year 2003, there's no <a> tag
                rk, title, cnt = row.findChildren()
                # BUG FIX: the original never cleared `ref` here, so the
                # fallback row read an undefined (NameError on first row)
                # or stale link from the previous iteration. Siblings of
                # this function all set it to None.
                ref = None
            books.append({
                'type': rank_type,
                'book_name': title.text.strip(' /'),
                'url': ref.get('href') if ref else None,
                'rank': rk.text,
                'tag': tag,
                'year': year,
                'circulations': cnt.text
            })
        results[table.get('summary')] = books
    return results