# NOTE(review): fragment of an older GT scraping driver that still uses the
# keyword-argument crawler(...) API (the version at the next `def main()` below
# migrated to the PageData builder).  It references `processing`, `html`, `br`,
# `page` and `nxt_pge_cnt`, none of which are defined here — the enclosing
# definition starts before this visible region, so this line cannot be safely
# rewritten in isolation.
# NOTE(review): `if page is 1` relies on CPython small-int identity caching;
# should be `page == 1` (same bug exists in the rewritten drivers below).
# NOTE(review): whitespace has been mangled — multiple statements share this
# physical line; the code is not runnable as-is and needs re-indenting against
# the original layout.
c_content = 'div.postbody' c_author = 'b.postauthor' site = 'GT' regex = '\&t=(\d+)' while(processing): print "going to Auto parts Selling..." (selling_link, ) = html('a[href*="./viewforum.php?f=8"]').map(lambda i, e: pq(e).attr('href')) req = br.find_link(url=selling_link) res = br.follow_link(req) if page is 1: listings_html = pq(res.read()) sales_urls = listings_html('td.row1 > img[src*="topic"]').parents('td.row1').siblings('td.row1 > a.topictitle').\ map(lambda i, e: br.find_link(url=pq(e).attr('href'))) crawler(sales_urls, mecha_state=br, content=c_content, author=c_author, post_regex=regex, site_id=site, reform_url=True) page += 1 br.back() else: next_page_url = "http://grupotoyota.com.ph/board/viewforum.php?f=8&start=%s" % (nxt_pge_cnt) print "scraping page %s" % (next_page_url) print "Page Count at %s" % (nxt_pge_cnt) res_pg_2 = br.open(next_page_url) listings_2 = pq(res_pg_2.read()) sales_urls = listings_2('td.row1 > img[src*="topic"]').parents('td.row1').siblings('td.row1 > a.topictitle').\ map(lambda i, e: br.find_link(url=pq(e).attr('href'))) crawler(sales_urls, mecha_state=br, content=c_content, author=c_author, post_regex=regex, site_id=site, reform_url=True) nxt_pge_cnt += 25 if(nxt_pge_cnt == 150): processing = False
def main():
    """Scrape the 'Auto Parts - Selling' board of the Grupo Toyota forum.

    Logs in with mechanize, then walks the forum index 25 topics per page
    (start=25, 50, ... up to end_pge_cnt) and hands every topic link on
    each index page to crawler() via a PageData builder.

    Side effects only (network I/O + whatever crawler() persists); returns None.
    """
    url = "http://grupotoyota.com.ph/board/"
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.set_handle_robots(False)  # the board would otherwise refuse the bot
    print("entering GT site...")
    print("logging into site...")
    br.open(url)
    br.select_form(nr=0)  # first form on the landing page is the login form
    # NOTE(review): credentials belong in config/env, not in source.
    br['username'] = '******'
    br['password'] = '******'
    br.submit()
    print("login in successful!")
    html = pq(br.response().read())

    # Selectors / patterns describing where post data lives on this phpBB board.
    post_content = 'div.postbody'
    post_author = 'b.postauthor'
    site_id = 'GT'
    regex = r'\&t=(\d+)'  # topic id from the viewtopic URL query string
    orig_post_date = 'td.gensmall > div:nth-child(2)'
    edited_post_date = None  # this board exposes no "last edited" marker
    date_regex = r'([A-Za-z]{3}\s[0-9,]{1,2}[,]\s[0-9]{4})'

    def _scrape_listing(listing_doc):
        # One forum index page: turn every topic-row anchor into a mechanize
        # Link and run the crawler over the batch.
        anchors = listing_doc('td.row1 > img[src*="topic"]') \
            .parents('td.row1') \
            .siblings('td.row1 > a.topictitle')
        storage_list = anchors.map(lambda i, e: br.find_link(url=pq(e).attr('href')))
        pd = PageData(storage_list, br) \
            .add_content(post_content, post_author, regex) \
            .post_date(orig_post_date, edited_post_date, date_regex) \
            .with_site_id(site_id) \
            .if_reform_url(True)
        crawler(pd)

    page = 1
    nxt_pge_cnt = 25
    end_pge_cnt = 150
    processing = True
    while processing:
        print("going to Auto parts Selling...")
        # BUG FIX: was `if page is 1` — identity comparison with an int literal
        # only works by the CPython small-int caching accident; use equality.
        if page == 1:
            # Page 1 is reached by following the board link from the index.
            (selling_link,) = html('a[href*="./viewforum.php?f=8"]').map(lambda i, e: pq(e).attr('href'))
            req = br.find_link(url=selling_link)
            res = br.follow_link(req)
            print("Auto Parts - Selling (Car Stuff, Parts, Accessories)... \nUrl: %s" % (res.geturl()))
            print("scraping page 1")
            _scrape_listing(pq(res.read()))
            page += 1
            br.back()
        else:
            # Subsequent pages are addressed directly via the start= offset.
            next_page_url = "http://grupotoyota.com.ph/board/viewforum.php?f=8&start=%s" % (nxt_pge_cnt)
            print("scraping page %s" % (next_page_url))
            print("Page Count at %s" % (nxt_pge_cnt))
            res_pg_2 = br.open(next_page_url)
            _scrape_listing(pq(res_pg_2.read()))
            nxt_pge_cnt += 25
            if nxt_pge_cnt == end_pge_cnt:
                break
            br.back()
def main():
    """Scrape the 'Car Related' board of the MitsuLancerPH YaBB forum.

    Logs in with mechanize, walks the board 20 topics per page
    (board=Caritems/20, /40, /60), pairs each topic link with the post date
    scraped from the listing, and feeds the batch to crawler().

    Side effects only (network I/O + whatever crawler() persists); returns None.
    """
    url = 'http://www.mitsulancerph.net/yabb2/YaBB.pl'
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.set_handle_robots(False)  # bypass robots.txt for the scrape
    print("entering MLPH site...")
    print("logging into site...")
    br.open(url)
    br.select_form(nr=0)  # first form on the page is the login form
    # NOTE(review): plaintext credentials in source — move to config/env.
    br['username'] = '******'
    br['passwrd'] = 'p455w0rd'
    br.submit()
    print("login in successful!")

    # Selectors / patterns for this YaBB board's post markup.
    post_content = 'div.message:first'
    post_author = 'a[href*="username"]:first'
    site_id = 'MLPH'
    regex = r'num=(\d+)'  # topic id from the YaBB.pl?num=... URL
    edited_post_date = 'i:first'
    date_regex = r"(Yesterday|Today|[0-9]{1,2}/[0-9]{1,2}/[0-9]{1,2})"
    # Compile once instead of per title inside the map lambda.
    skip_re = re.compile(r'(DISCLAIMER|CHECK THE)')
    date_re = re.compile(date_regex)

    def determine_encode(tex):
        # unicode titles must go through mechanize's text_regex matching;
        # byte strings can use plain text matching.
        # IMPROVED: isinstance() instead of `type(tex) is unicode`.
        if isinstance(tex, unicode):
            return br.find_link(text_regex=tex)
        return br.find_link(text=tex)

    def _scrape_listing(listings, drop_sticky):
        # Topic titles, skipping the pinned DISCLAIMER / "CHECK THE ..." threads
        # (pyquery's map drops None results).
        titles = listings('td.windowbg > div > b > a').map(
            lambda i, e: pq(e).text() if not skip_re.findall(pq(e).text()) else None)
        storage_list = [determine_encode(title) for title in titles]
        text_dates = listings('span.small > a[href*=".pl?num="]').text()
        dates = date_re.findall(text_dates)
        if drop_sticky:
            # Page 1 carries sticky topics whose dates must not pair up with
            # the (filtered) title list.
            dates[1:3] = []
        store = []
        for idx, link in enumerate(storage_list):
            # NOTE(review): raises IndexError if fewer dates than links were
            # scraped — presumably the markup guarantees a 1:1 pairing; verify.
            link.post_date = dates[idx]
            store.append(link)
        pd = PageData(store, br) \
            .add_content(post_content, post_author, regex) \
            .post_date(None, edited_post_date, date_regex) \
            .with_site_id(site_id) \
            .if_reform_url(False)
        crawler(pd)

    processing = True
    page = 1
    nxt_pge_cnt = 20
    end_pge_cnt = 80
    while processing:
        print("Going Car Related posts...")
        # BUG FIX: was `if page is 1` — int identity comparison replaced
        # with equality.
        if page == 1:
            req = br.click_link(text="Car Related")
            res = br.open(req)
            print("Car Related Url : %s" % (res.geturl()))
            print("scraping page 1")
            _scrape_listing(pq(res.read()), drop_sticky=True)
            page += 1
            br.back()
        else:
            next_page_url = "http://www.mitsulancerph.net/yabb2/YaBB.pl?board=Caritems/%s" % (nxt_pge_cnt)
            print("Page Count at %s" % (nxt_pge_cnt))
            print("scraping page %s" % (next_page_url))
            res_pg_2 = br.open(next_page_url)
            _scrape_listing(pq(res_pg_2.read()), drop_sticky=False)
            nxt_pge_cnt += 20
            br.back()
            if nxt_pge_cnt == end_pge_cnt:
                processing = False
# NOTE(review): fragment of an Underground Parts (invisionfree) scraping loop.
# It references `processing`, `page`, `br`, `post_content`, `post_author`,
# `regex` and `site` that are defined before this visible region, and the
# trailing `if(nxt_pge_cnt == end_pge_cnt):` has its body cut off — the
# fragment is incomplete and cannot be safely rewritten here.
# NOTE(review): `if page is 1` should be `page == 1` (identity vs equality on
# an int literal), same bug as the other drivers in this file.
# NOTE(review): whitespace has been mangled onto one physical line, so the
# embedded `#test_crawler(...)` comment swallows everything after it; the
# original line layout must be restored before this can run.
nxt_pge_cnt = 40 end_pge_cnt = 80 while(processing): print "going to Underground Parts" if page is 1: req = br.click_link(text='Underground Parts') res = br.open(req) print "Underground Parts Url : %s" % (res.geturl()) print "scraping page 1" listings = pq(res.read()) lists = listings('td.darkrow1').eq(5).parents('tr').siblings('tr').children('td > a[href*="showtopic"]').not_('.linkthru') storage_list = lists.map(lambda i, e: br.find_link(text=pq(e).text().replace(" ", " "))) #test_crawler(storage_list, mecha_state=br, content=post_content, author=post_author, post_regex=regex, site_id=site, reform_url=False) crawler(storage_list, mecha_state=br, content=post_content, author=post_author, post_regex=regex, site_id=site, reform_url=False) page += 1 br.back() else: next_page_url = "http://z11.invisionfree.com/JDM_Underground/index.php?showforum=3&prune_day=100&sort_by=Z-A&sort_key=last_post&st=%s" % (nxt_pge_cnt) print "Page Count at %s" % (nxt_pge_cnt) print "scraping page %s" % (next_page_url) res_pg_2 = br.open(next_page_url) listings_2 = pq(res_pg_2.read()) storage_list_2 = listings_2('td.row4 > a[href*="showtopic"]').map(lambda i, e: br.find_link(url=pq(e).attr('href'))) #test_crawler(storage_list_2, mecha_state=br, content=post_content, author=post_author, post_regex=regex, site_id=site, reform_url=False) crawler(storage_list_2, mecha_state=br, content=post_content, author=post_author, post_regex=regex, site_id=site, reform_url=False) nxt_pge_cnt += 40 br.back() if(nxt_pge_cnt == end_pge_cnt):
def main():
    """Scrape the 'Parts & Accessories' board of the HCP zetaboards forum.

    Logs in with mechanize, walks index pages 2..4 directly by URL
    (forum/21305/<n>), and hands every topic link on each page to crawler()
    via a PageData builder, logging progress with log_message().

    Side effects only (network I/O + whatever crawler() persists); returns None.
    """
    url = 'http://s3.zetaboards.com/HCP/site/'
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.set_handle_robots(False)  # bypass robots.txt for the scrape
    print("entering hcp site...")
    print("logging into site...")
    br.open(url)
    br.select_form(nr=0)  # first form on the page is the login form
    # NOTE(review): real-looking credentials hard-coded in source — move to
    # config/env and rotate them.
    br['uname'] = 'jamongkad'
    br['pw'] = 'password'
    br.submit()

    # Selectors / patterns for this zetaboards skin.
    post_content = 'td.c_post'
    post_author = 'td.c_username'
    site_id = 'HCP'
    regex = r'HCP\/topic\/(\d+)/'  # topic id from the /HCP/topic/<id>/ URL
    orig_post_date = 'span.left'
    edited_post_date = 'div.editby'
    date_regex = r'([A-Za-z]{3}\s[0-9]{1,2}\s[0-9]{4})'

    def _scrape_listing(listing_doc):
        # One index page: collect every topic link and crawl the batch.
        storage_list = listing_doc('td.c_cat-title > a[href*="topic"]').map(
            lambda i, e: br.find_link(url=pq(e).attr('href')))
        pd = PageData(storage_list, br) \
            .add_content(post_content, post_author, regex) \
            .post_date(orig_post_date, edited_post_date, date_regex) \
            .with_site_id(site_id) \
            .if_reform_url(False)
        crawler(pd)

    processing = True
    nxt_pge_cnt = 2
    end_pge_cnt = 5
    page = 1
    while processing:
        print("going to Parts & Accessories")
        # BUG FIX: was `if page is 1` — int identity comparison replaced
        # with equality.
        if page == 1:
            req = br.click_link(text='Parts & Accessories')
            res = br.open(req)
            print("scraping page 1")
            _scrape_listing(pq(res.read()))
            page += 1
            log_message('hcp-driver_%d' % (page))
            br.back()
        else:
            next_page_url = "http://s3.zetaboards.com/HCP/forum/21305/%s" % (nxt_pge_cnt)
            print("scraping page %s" % (next_page_url))
            res_pg_2 = br.open(next_page_url)
            _scrape_listing(pq(res_pg_2.read()))
            nxt_pge_cnt += 1
            log_message('hcp-driver_%d' % (nxt_pge_cnt))
            br.back()
            if nxt_pge_cnt == end_pge_cnt:
                processing = False