def insert_document(self, meta_data):
    try:
        url = 'http://gk.canpoint.cn/FileDetail-%s.html' % meta_data[0]
        html = unicode(urllib2.urlopen(url).read(), 'utf-8')
        d = pq(html)
        res_title, res_point = meta_data[1], meta_data[2]
        (res_province, res_city,
         res_county) = self.pcc_extractor.extraction(res_title)
        (res_grade, res_class) = self.gc_extractor.extraction(res_title, '', '')
        description = d('.tb_down tr').eq(2)('td').eq(1).text().split()
        res_subject = description[0]
        res_date = dateformat.format_date(
            d('.tb_down tr').eq(1).find('td').eq(3).text().strip())
        res_intro = d('div.des_down').html().split('<br />')[-1].strip()
        crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        # TODO res_version, res_type
        res_version, res_type = '', ''
        document = {'res_title': res_title,
                    'res_url': url,
                    'res_date': res_date,
                    'res_downcount': meta_data[3],
                    'res_point': res_point,
                    'res_subject': res_subject,
                    'res_type': res_type,
                    'res_grade': u'高中',
                    'res_intro': res_intro,
                    'res_class': res_class,
                    'res_version': res_version,
                    'res_province': res_province,
                    'res_city': res_city,
                    'res_county': res_county,
                    'res_id': meta_data[0],
                    'res_file': '',
                    'site_id': 'canpoint',
                    'date': crawl_date}
        if (res_point <= 10) and (res_point >= 0):
            self.download_urls[res_point].append(meta_data[0])
        print res_title, url
        # self.canpoint.insert_one(document)
    except Exception, e:
        print e
        logging.info('error at {}, reason {}'.format(
            meta_data[0], traceback.format_exc()))

def gk_collect(self):
    '''
    collects from http://gk.canpoint.cn/Search.aspx
    METHOD - traversal through tabs
    '''
    print 'at gk.canpoint'
    driver = webdriver.PhantomJS()
    driver.get('http://gk.canpoint.cn/Search.aspx')
    start_time = datetime.today()
    scount = self.collection.count()
    res_area, res_grade, res_intro, res_class, res_version = '', '高中', '', '', ''
    for i in range(1, 10):
        subject_tabs = driver.find_element_by_id('k' + str(i))
        res_subject = subject_tabs.text
        subject_tabs.click()
        for j in range(2, 12):
            type_tabs = driver.find_element_by_xpath(
                "//li[@id='tongbu']/a[" + str(j) + "]")
            type_tabs.click()
            res_type = type_tabs.text
            EXIT = False
            MAX_PAGE = int(
                driver.find_element_by_xpath(
                    "//span[@class='page']/span[2]").text)
            if MAX_PAGE > 3:
                MAX_PAGE = 3
            else:
                MAX_PAGE -= 1
            while MAX_PAGE > 0:
                for k in range(1, 21):
                    try:
                        Title = driver.find_element_by_xpath(
                            "//table[@id='searchTable']/tbody/tr[@name='sTabletr'][" + str(k) + "]//a")
                        res_title = Title.text
                        res_url = Title.get_attribute("href")
                        res_point = int(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[@name='sTabletr'][" + str(k) + "]/td[3]").text)
                        res_date = dateformat.format_date(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[@name='sTabletr'][" + str(k) + "]/td[4]").text)
                        res_downcount = int(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[@name='sTabletr'][" + str(k) + "]/td[5]").text)
                        res_id = res_url[33:-5]
                        (res_province, res_city, res_county
                         ) = self.pcc_extractor.extraction(res_title)
                        (res_grade, res_class) = self.gc_extractor.extraction(
                            res_title, res_grade, res_class)
                        res_intro = self.get_intro(res_url, gk=True)
                        document = {
                            'res_title': res_title,
                            'res_url': res_url,
                            'res_date': res_date,
                            'res_downcount': res_downcount,
                            'res_point': res_point,
                            'res_subject': res_subject,
                            'res_type': res_type,
                            'res_area': res_area,
                            'res_grade': res_grade,
                            'res_intro': res_intro,
                            'res_class': res_class,
                            'res_version': res_version,
                            'res_province': res_province,
                            'res_city': res_city,
                            'res_county': res_county,
                            'res_id': res_id,
                            'res_file': '',
                            'site_id': 'canpoint'
                        }
                        print res_url, res_date
                        if self.check_date(res_date, datetime.today()):
                            print 'document too old'
                            EXIT = True
                            break
                        if res_url in self.res_urls:
                            EXIT = True
                            print 'document already exists'
                            break
                        self.res_urls.add(res_url)
                        self.document_list.append(document)
                    except Exception:
                        traceback.print_exc()
                if EXIT:
                    break
                MAX_PAGE -= 1
                driver.find_element_by_xpath(
                    "//div[@class='kuang_b']/span[@class='page']/a[3]").click()
                self.wait_for_load(driver)
    if len(self.document_list) > 0:
        self.collection.insert_many(self.document_list)
        self.collection_res.insert_many(self.document_list)
    end_time = datetime.today()
    s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
        end_time, self.collection.count() - scount, end_time - start_time)
    print s
    logging.info(s)
    self.document_list = []
    driver.quit()

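# A minimal sketch of the wait_for_load helper called above. It is not shown in
# this section, so the readiness check (document.readyState == 'complete') and
# the 10-second timeout are assumptions, not the project's actual values.
def wait_for_load(self, driver, timeout=10):
    '''Block until the PhantomJS page reports that it has finished loading.'''
    from selenium.webdriver.support.ui import WebDriverWait
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return document.readyState') == 'complete')
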
def collect(self):
    '''
    collects from http://www.xiangpi.com/zujuan/3/1-0-0-0-0/
    METHOD - traversal through tabs
    '''
    scount = self.collection.count()
    start_time = datetime.today()
    res_type, res_version, res_intro, res_point = '', '', '', ''
    driver = webdriver.PhantomJS()
    driver.get('http://www.xiangpi.com/zujuan/3/1-0-0-0-0/')
    for i in range(1, 4):
        grade_tabs = driver.find_element_by_xpath(
            "//div[@id='tabEduLevels']/a[" + str(i) + "]")
        res_grade = grade_tabs.text
        ActionChains(driver).move_to_element(grade_tabs).click().perform()
        time.sleep(1)
        if i == 1:  # Highschool
            subjects = 9
        if i == 2:  # junior high
            subjects = 8
        if i == 3:  # Elementary
            subjects = 2
        for j in range(1, subjects + 1):
            subject_tabs = driver.find_element_by_xpath(
                "//div[@id='tabChapterSubjects']/a[" + str(j) + "]")
            res_subject = subject_tabs.text
            ActionChains(driver).move_to_element(
                subject_tabs).click().perform()
            EXIT = False
            time.sleep(1)
            MAX_PAGE = 0
            article_count = int(
                driver.find_element_by_id('paperTotal').text)
            if article_count != 0:
                MAX_PAGE = article_count / 15 + 1
            if MAX_PAGE > 2:
                MAX_PAGE = 2
            for l in range(2, MAX_PAGE + 2):
                pages = driver.find_element_by_xpath(
                    "//div[@id='paperListPage']/a[" + str(l) + "]")
                ActionChains(driver).move_to_element(
                    pages).click().perform()
                time.sleep(1)
                for m in range(1, 16):
                    try:
                        Title = driver.find_element_by_xpath(
                            "//ul[@id='paperListCotent']/li[" + str(m) + "]/p[2]/a")
                        res_title = Title.text
                        res_class = driver.find_element_by_xpath(
                            "//ul[@id='paperListCotent']/li[" + str(m) + "]/p[3]/a[1]").text
                        res_downcount = int(
                            driver.find_element_by_xpath(
                                "//ul[@id='paperListCotent']/li[" + str(m) + "]/p[5]").text)
                        res_date = dateformat.format_date(
                            driver.find_element_by_xpath(
                                "//ul[@id='paperListCotent']/li[" + str(m) + "]/p[6]").text)
                        res_area = driver.find_element_by_xpath(
                            "//ul[@id='paperListCotent']/li[" + str(m) + "]/p[3]/a[3]").text
                        res_url = Title.get_attribute("href")
                        (res_province, res_city, res_county
                         ) = self.pcc_extractor.extraction(res_title)
                        (res_grade, res_class) = self.gc_extractor.extraction(
                            res_title, res_grade, res_class)
                        res_id = int(filter(str.isdigit, str(res_url)))
                        document = {
                            'res_title': res_title,
                            'res_url': res_url,
                            'res_date': res_date,
                            'res_downcount': res_downcount,
                            'res_point': res_point,
                            'res_subject': res_subject,
                            'res_type': res_type,
                            'res_area': res_area,
                            'res_grade': res_grade,
                            'res_intro': res_intro,
                            'res_class': res_class,
                            'res_version': res_version,
                            'res_province': res_province,
                            'res_city': res_city,
                            'res_county': res_county,
                            'res_id': res_id,
                            'res_file': '',
                            'site_id': 'xiangpi'
                        }
                        print res_url, res_date
                        if self.check_date(res_date, datetime.today()):
                            print 'document too old'
                            EXIT = True
                            break
                        if self.collection.find_one({'res_url': res_url}):
                            EXIT = True
                            print 'document already exists'
                            break
                        self.document_list.append(document)
                    except Exception:
                        traceback.print_exc()
                if EXIT:
                    break
    if len(self.document_list) > 0:
        self.collection.insert_many(self.document_list)
        self.collection_res.insert_many(self.document_list)
    end_time = datetime.today()
    s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
        end_time, self.collection.count() - scount, end_time - start_time)
    print s
    logging.info(s)
    driver.quit()

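# A minimal sketch of the check_date helper used above to stop paging once the
# listings get stale. The '%Y-%m-%d' date format and the 7-day cutoff are
# assumptions; the real dateformat output and threshold are not shown here.
def check_date(self, res_date, today, max_age_days=7):
    '''Return True when res_date lies more than max_age_days before today.'''
    parsed = datetime.strptime(res_date, '%Y-%m-%d')
    return (today - parsed).days > max_age_days
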
def zk_collect(self):
    '''
    collects from http://zk.canpoint.cn/
    METHOD - traversal through clicking tabs
    '''
    print 'at zk.canpoint'
    driver = webdriver.PhantomJS()
    driver.get('http://zk.canpoint.cn/')
    start_time = datetime.today()
    scount = self.collection.count()
    res_area, res_grade, res_intro, res_class, res_version = '', '初中', '', '', ''
    for i in range(2, 11):
        subject_tabs = driver.find_element_by_xpath(
            "//li[@id='xueke']/a[" + str(i) + "]")
        res_subject = subject_tabs.text
        ActionChains(driver).move_to_element(subject_tabs).click(
            subject_tabs).perform()
        self.wait_for_load(driver)
        for j in range(2, 8):
            type_tabs = driver.find_element_by_xpath(
                "//div[@class='bg']/div[@class='menu_right']/ul[@class='nav_a']/li[@id='tongbu']/a[" + str(j) + "]")
            ActionChains(driver).move_to_element(
                type_tabs).click().perform()
            self.wait_for_load(driver)
            res_type = type_tabs.text
            MAX_PAGE = int(
                driver.find_element_by_xpath(
                    "//div[@class='kuang_b']/span[@class='page']/span[2]").text)
            EXIT = False
            if MAX_PAGE > 3:
                MAX_PAGE = 3
            else:
                MAX_PAGE -= 1
            while MAX_PAGE > 0:
                for k in range(1, 61, 3):
                    try:
                        Title = driver.find_element_by_xpath(
                            "//table[@id='searchTable']/tbody/tr[" + str(k) + "]/td[1]/a")
                        res_title = re.sub(
                            r"<.*>", "",
                            Title.get_attribute('innerHTML')).strip()
                        res_url = Title.get_attribute("href")
                        res_point = int(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[" + str(k) + "]/td[3]").text)
                        res_date = dateformat.format_date(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[" + str(k) + "]/td[4]").text)
                        res_downcount = int(
                            driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[" + str(k) + "]/td[5]").text)
                        res_id = res_url[33:-5]
                        (res_province, res_city, res_county
                         ) = self.pcc_extractor.extraction(res_title)
                        (res_grade, res_class) = self.gc_extractor.extraction(
                            res_title, res_grade, res_class)
                        res_intro = self.get_intro(res_url, gk=False)
                        document = {
                            'res_title': res_title,
                            'res_url': res_url,
                            'res_date': res_date,
                            'res_downcount': res_downcount,
                            'res_point': res_point,
                            'res_subject': res_subject,
                            'res_type': res_type,
                            'res_area': res_area,
                            'res_grade': res_grade,
                            'res_intro': res_intro,
                            'res_class': res_class,
                            'res_version': res_version,
                            'res_province': res_province,
                            'res_city': res_city,
                            'res_county': res_county,
                            'res_id': res_id,
                            'res_file': '',
                            'site_id': 'canpoint'
                        }
                        print res_url, res_date
                        if self.check_date(res_date, datetime.today()):
                            print 'document too old'
                            EXIT = True
                            break
                        if res_url in self.res_urls:
                            EXIT = True
                            print 'document already exists'
                            break
                        self.res_urls.add(res_url)
                        self.document_list.append(document)
                    except Exception:
                        traceback.print_exc()
                if EXIT:
                    break
                MAX_PAGE -= 1
                nextpage = driver.find_element_by_xpath(
                    "//div[@class='kuang_b']/span[@class='page']/a[3]")
                ActionChains(driver).move_to_element(
                    nextpage).click().perform()
                time.sleep(1)
    if len(self.document_list) > 0:
        self.collection.insert_many(self.document_list)
        self.collection_res.insert_many(self.document_list)
    end_time = datetime.today()
    s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
        end_time, self.collection.count() - scount, end_time - start_time)
    print s
    logging.info(s)
    self.document_list = []
    driver.quit()

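# Hedged sketch of the get_intro helper used by gk_collect and zk_collect. The
# gk branch reuses the 'div.des_down' selector from insert_document above; the
# zk branch is an assumption, since the zk detail-page layout is not shown in
# this section.
def get_intro(self, res_url, gk=True):
    '''Fetch a detail page and pull out its free-text description.'''
    html = unicode(urllib2.urlopen(res_url).read(), 'utf-8')
    d = pq(html)
    if gk:
        return d('div.des_down').html().split('<br />')[-1].strip()
    return d('div.des_down').text().strip()  # assumed to match the zk layout
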
def get_info(self, res_id, DATA):
    '''
    Reads all relevant data from selected download page
    '''
    try:
        d = pq(DATA)
        res_intro, res_type, res_grade, res_class = '', '', '', ''
        res_title = d('h3.paper-title').text()
        table = d('div.c').find('p')
        res_point = float(table.eq(7).find('span a').text())
        res_version = table.eq(1).find('span').text()
        res_downcount = int(table.eq(4).find('span').text()[:-1])
        res_date = dateformat.format_date(table.eq(3).find('span').text())
        res_type_o = table.eq(5).find('span').text()
        for k in self.type_dict.keys():
            if res_type_o in self.type_dict[k]:
                res_type = k
                break
        rs_1 = table.eq(0).find('span a').eq(0).text()
        rs_2 = table.eq(0).find('span a').eq(1).text()
        if rs_1 in self.subject_set:
            res_subject = rs_1
        elif rs_2 in self.subject_set:
            res_subject = rs_2
        else:
            res_subject = ''
        (res_province, res_city,
         res_county) = self.pcc_extractor.extraction(res_title)
        (res_grade, res_class) = self.gc_extractor.extraction(
            res_title, res_grade, res_class)
        res_url = 'http://www.daliankao.org/down/' + str(res_id) + '.html'
        crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        document = {
            'res_title': res_title,
            'res_url': res_url,
            'res_date': res_date,
            'res_downcount': res_downcount,
            'res_point': res_point,
            'res_subject': res_subject,
            'res_type': res_type,
            'res_type_o': res_type_o,
            'res_grade': res_grade,
            'res_intro': res_intro,
            'res_class': res_class,
            'res_version': res_version,
            'res_province': res_province,
            'res_city': res_city,
            'res_county': res_county,
            'res_id': res_id,
            'res_file': '',
            'site_id': 'daliankao',
            'date': crawl_date
        }
        print res_url, res_date
        if res_url not in self.res_urls:
            self.collection.insert_one(document)
            self.collection_res.insert_one(document)
        else:
            print 'document exists'
    except Exception:
        tb = traceback.format_exc()
        logging.info('getting {} failed. Reason: {}'.format(res_id, tb))
        print tb

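# Hedged usage sketch for get_info: download one daliankao page and hand the
# raw HTML to the parser. Only the URL pattern comes from the code above; the
# helper name and the way DATA is fetched here are illustrative assumptions.
def fetch_and_parse(spider, res_id):
    '''Download a single resource page and run get_info on it.'''
    url = 'http://www.daliankao.org/down/' + str(res_id) + '.html'
    html = urllib2.urlopen(url).read()
    spider.get_info(res_id, html)
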
def get_info(self, html, res_url, res_id):
    '''
    Reads all relevant data from selected download page
    '''
    try:
        d = pq(html)
        res_downcount = ''
        class_subject = d('.xq_goo ul li').eq(3).find(
            'span a').text().split()
        res_class = class_subject[0]
        res_subject = class_subject[1]
        res_title = d('.xq_h.xq_sdhahdwq ul li a').text()
        res_date = dateformat.format_date(
            d('.xq_goo ul li').eq(0).text().split()[1])
        res_point = d('.xq_goo ul li').eq(4).find('span font').text()
        res_type = d('.xq_goo ul li').eq(2).find('span a').eq(0).text()
        res_version = d('.xq_goo ul li').eq(1).find('span a').eq(0).text()
        res_intro = d('#contenthuidai').text()
        crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        try:
            res_point = int(filter(unicode.isdigit, res_point))
        except:
            pass
        (res_province, res_city,
         res_county) = self.pcc_extractor.extraction(res_title)
        (res_grade, res_class) = self.gc_extractor.extraction(
            res_title, '', res_class)
        document = {
            'res_title': res_title,
            'res_url': res_url,
            'res_date': res_date,
            'res_downcount': res_downcount,
            'res_point': res_point,
            'res_subject': res_subject,
            'res_type': res_type,
            'res_grade': res_grade,
            'res_intro': res_intro,
            'res_class': res_class,
            'res_version': res_version,
            'res_province': res_province,
            'res_city': res_city,
            'res_county': res_county,
            'res_id': res_id,
            'res_file': '',
            'site_id': 'dearedu',
            'date': crawl_date
        }
        print res_url, res_date
        if res_url not in self.res_urls:
            self.collection.insert_one(document)
            self.collection_res.insert_one(document)
        else:
            print 'document exists'
    except Exception, e:
        tb = traceback.format_exc()
        logging.info('getting {} failed. Reason: {}'.format(res_url, tb))
        print tb

def get_info(self, res_id, DATA):
    '''
    Reads all relevant data from selected download page
    '''
    try:
        d = pq(DATA)
        res_grade = ''
        res_title = d('div.nexttopR_head h1.ell').text()
        res_url = 'http://taoti.tl100.com/detail-' + str(res_id) + '.html'
        res_point = int(d('#point strong').text())
        table1 = d('#sx tr')
        res_class = table1('td').eq(1).text()
        res_intro = d('div.contentbox table td p').text()
        res_subject = table1('td').eq(3).text()
        res_version = table1('td').eq(5).text()
        res_type = table1('td').eq(7).text()
        table2 = d('div.title2 tr td').eq(0).text().split('|')
        res_date = dateformat.format_date(table2[1][6:])
        rp = table2[2][5:]
        (res_province, res_city,
         res_county) = self.pcc_extractor.extraction(res_title)
        (res_grade, res_class) = self.gc_extractor.extraction(
            res_title, res_grade, res_class)
        res_province = rp
        res_downcount = int(d('#hits').text())
        crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        document = {
            'res_title': res_title,
            'res_url': res_url,
            'res_date': res_date,
            'res_downcount': res_downcount,
            'res_point': res_point,
            'res_subject': res_subject,
            'res_type': res_type,
            'res_grade': res_grade,
            'res_intro': res_intro,
            'res_class': res_class,
            'res_version': res_version,
            'res_province': res_province,
            'res_city': res_city,
            'res_county': res_county,
            'res_id': res_id,
            'res_file': '',
            'site_id': 'tl100',
            'date': crawl_date
        }
        print res_url, res_date
        if res_url not in self.res_urls:
            self.collection.insert_one(document)
            self.collection_res.insert_one(document)
        else:
            print 'document exists'
    except Exception:
        tb = traceback.format_exc()
        logging.info('getting {} failed. Reason: {}'.format(res_id, tb))
        print tb

def execute(self, grades):
    '''
    Collects all relevant data for one grade tab and builds a document list.
    Each worker process is tasked with a single grade.
    '''
    document_list = []
    client = pymongo.MongoClient(conf.mongo.ip, conf.mongo.port)
    collection = client.spider.dearedu
    collection_res = client.spider.res
    res_intro, res_downcount = '', ''
    res_grade = grades.text()
    grade_url = grades.find('a').attr('href')
    PAGE_COUNT = 1
    EXIT = False
    while PAGE_COUNT <= 100:
        page_extension = '&p=' + str(PAGE_COUNT)
        pq_page = pq(self.read_url(grade_url + page_extension))
        for i in range(0, 10):
            try:
                articles = pq_page(
                    '.z_right .lb_zs .lb_grey .lb_aload .lb_bleft').eq(i)
                tmp = articles('span').text()
                order_text = tmp[4:]
                res_title = articles('h1').find('a').text()
                res_url = articles('h1').find('a').attr('href')
                date = articles('p').eq(0).text()
                area = articles('p').eq(1).text()
                point = articles('p').eq(3).text()
                res_date = dateformat.format_date(date[3:])
                res_area = area[3:]
                res_point = point[3:]
                if res_point != u'免费':
                    res_point = int(res_point)
                (res_subject, res_type,
                 res_class) = self.extract_info(order_text)
                res_id = int(filter(str.isdigit, str(res_url)))
                (res_province, res_city,
                 res_county) = self.pcc_extractor.extraction(res_title)
                (res_grade, res_class) = self.gc_extractor.extraction(
                    res_title, res_grade, res_class)
                res_version = self.res_version(res_title, res_grade)
                document = {
                    'res_title': res_title,
                    'res_url': res_url,
                    'res_date': res_date,
                    'res_downcount': res_downcount,
                    'res_point': res_point,
                    'res_subject': res_subject,
                    'res_type': res_type,
                    'res_area': res_area,
                    'res_grade': res_grade,
                    'res_intro': res_intro,
                    'res_class': res_class,
                    'res_version': res_version,
                    'res_province': res_province,
                    'res_city': res_city,
                    'res_county': res_county,
                    'res_file': '',
                    'site_id': 'dearedu',
                    'res_id': res_id
                }
                print res_url, res_date
                if self.check_date(res_date, datetime.today()):
                    print 'document too old'
                    EXIT = True
                    break
                if collection.find_one({'res_url': res_url}):
                    EXIT = True
                    print 'document already exists'
                    break
                document_list.append(document)
            except Exception:
                traceback.print_exc()
        if EXIT:
            break
        PAGE_COUNT += 1
    if len(document_list) > 0:
        collection.insert_many(document_list)
        collection_res.insert_many(document_list)
    client.close()

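# Hedged sketch of the read_url helper called above; its real implementation is
# not part of this section, so the urllib2 call, the 30-second timeout and the
# utf-8 decoding are assumptions.
def read_url(self, url):
    '''Fetch url and return its body as a unicode string for pq().'''
    response = urllib2.urlopen(url, timeout=30)
    return unicode(response.read(), 'utf-8', 'ignore')
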