Example #1
    def insert_document(self, meta_data):

        try:
            url = 'http://gk.canpoint.cn/FileDetail-%s.html' % meta_data[0]
            html = unicode(urllib2.urlopen(url).read(), 'utf-8')
            d = pq(html)
            res_title, res_point = meta_data[1], meta_data[2]
            (res_province, res_city, res_county) = self.pcc_extractor.extraction(res_title)
            (res_grade, res_class) = self.gc_extractor.extraction(res_title, '', '')
            description = d('.tb_down tr').eq(2)('td').eq(1).text().split()
            res_subject = description[0]
            res_date = dateformat.format_date(d('.tb_down tr').eq(1).find('td').eq(3).text().strip())
            res_intro = d('div.des_down').html().split('<br />')[-1].strip()
            crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
            #TODO res_version, res_type
            res_version, res_type = '', ''

            document = {'res_title': res_title, 'res_url': url, 'res_date': res_date, 'res_downcount': meta_data[3],
                        'res_point': res_point, 'res_subject': res_subject, 'res_type': res_type, 'res_grade': u'高中',
                        'res_intro': res_intro, 'res_class': res_class, 'res_version': res_version,
                        'res_province': res_province, 'res_city': res_city, 'res_county': res_county, 'res_id':  meta_data[0],
                        'res_file': '', 'site_id': 'canpoint', 'date': crawl_date}
            if (res_point <= 10) and (res_point >= 0):
                self.download_urls[res_point].append(meta_data[0])
            print res_title, url
            #self.canpoint.insert_one(document)

        except Exception, e:
            print e
            logging.info('error at {}, reason {}'.format(meta_data[0], traceback.format_exc()))
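
This example appends each document id into self.download_urls[res_point]. The class initialiser is not shown, so here is a minimal sketch (an assumption, not the original code) of one way that structure could be set up, matching the 0..10 range check above:

download_urls = [[] for _ in range(11)]   # one bucket per point cost, index == res_point

def queue_for_download(buckets, res_id, res_point):
    # Only queue documents whose point cost falls inside the supported range.
    if 0 <= res_point <= 10:
        buckets[res_point].append(res_id)

queue_for_download(download_urls, '1234567', 2)
print download_urls[2]   # ['1234567']
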
Example #2
    def gk_collect(self):
        '''
        Collects from http://gk.canpoint.cn/Search.aspx
        Method: traversal through the subject and type tabs
        '''

        print 'at gk.canpoint'
        driver = webdriver.PhantomJS()
        driver.get('http://gk.canpoint.cn/Search.aspx')
        start_time = datetime.today()
        scount = self.collection.count()
        res_area, res_grade, res_intro, res_class, res_version = '', '高中', '', '', ''

        for i in range(1, 10):
            subject_tabs = driver.find_element_by_id('k' + str(i))
            res_subject = subject_tabs.text
            subject_tabs.click()
            for j in range(2, 12):
                type_tabs = driver.find_element_by_xpath(
                    "//li[@id='tongbu']/a[" + str(j) + "]")
                type_tabs.click()
                res_type = type_tabs.text
                EXIT = False

                MAX_PAGE = int(
                    driver.find_element_by_xpath(
                        "//span[@class='page']/span[2]").text)
                if MAX_PAGE > 3:
                    MAX_PAGE = 3
                else:
                    MAX_PAGE -= 1

                while MAX_PAGE > 0:
                    for k in range(1, 21):
                        try:
                            Title = driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[@name='sTabletr']["
                                + str(k) + "]//a")
                            res_title = Title.text
                            res_url = Title.get_attribute("href")
                            res_point = int(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[@name='sTabletr']["
                                    + str(k) + "]/td[3]").text)
                            res_date = dateformat.format_date(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[@name='sTabletr']["
                                    + str(k) + "]/td[4]").text)
                            res_downcount = int(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[@name='sTabletr']["
                                    + str(k) + "]/td[5]").text)
                            res_id = res_url[33:-5]
                            (res_province, res_city, res_county
                             ) = self.pcc_extractor.extraction(res_title)
                            (res_grade,
                             res_class) = self.gc_extractor.extraction(
                                 res_title, res_grade, res_class)
                            res_intro = self.get_intro(res_url, gk=True)

                            document = {
                                'res_title': res_title,
                                'res_url': res_url,
                                'res_date': res_date,
                                'res_downcount': res_downcount,
                                'res_point': res_point,
                                'res_subject': res_subject,
                                'res_type': res_type,
                                'res_area': res_area,
                                'res_grade': res_grade,
                                'res_intro': res_intro,
                                'res_class': res_class,
                                'res_version': res_version,
                                'res_province': res_province,
                                'res_city': res_city,
                                'res_county': res_county,
                                'res_id': res_id,
                                'res_file': '',
                                'site_id': 'canpoint'
                            }

                            print res_url, res_date
                            if self.check_date(res_date, datetime.today()):
                                print 'document too old'
                                EXIT = True
                                break
                            if res_url in self.res_urls:
                                EXIT = True
                                print 'document already exists'
                                break
                            self.res_urls.add(res_url)
                            self.document_list.append(document)
                        except Exception:
                            traceback.print_exc()
                    if EXIT:
                        break
                    MAX_PAGE -= 1
                    driver.find_element_by_xpath(
                        "//div[@class='kuang_b']/span[@class='page']/a[3]"
                    ).click()
                    self.wait_for_load(driver)

        if len(self.document_list) > 0:
            self.collection.insert_many(self.document_list)
            self.collection_res.insert_many(self.document_list)

        end_time = datetime.today()
        s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
            end_time,
            self.collection.count() - scount, end_time - start_time)
        print s
        logging.info(s)
        self.document_list = []
        driver.quit()
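
The traversal stops early once check_date(res_date, datetime.today()) reports a document as too old. That helper is not shown in these examples; the following is a minimal sketch of what such a freshness test could look like, assuming dateformat.format_date normalises dates to YYYY-MM-DD and a fixed 30-day cut-off:

from datetime import datetime, timedelta

def check_date(res_date, now, max_age_days=30):
    # True means the document is older than the cut-off and the traversal can stop.
    parsed = datetime.strptime(res_date, '%Y-%m-%d')   # assumed format_date() output
    return now - parsed > timedelta(days=max_age_days)

print check_date('2016-01-01', datetime.today())   # True: well past the cut-off
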
Example #3
    def collect(self):
        '''
        Collects from http://www.xiangpi.com/zujuan/3/1-0-0-0-0/
        Method: traversal through the grade and subject tabs
        '''

        scount = self.collection.count()
        start_time = datetime.today()
        res_type, res_version, res_intro, res_point = '', '', '', ''
        driver = webdriver.PhantomJS()
        driver.get('http://www.xiangpi.com/zujuan/3/1-0-0-0-0/')
        for i in range(1, 4):
            grade_tabs = driver.find_element_by_xpath(
                "//div[@id='tabEduLevels']/a[" + str(i) + "]")
            res_grade = grade_tabs.text
            ActionChains(driver).move_to_element(grade_tabs).click().perform()
            time.sleep(1)
            if i == 1:    # high school
                subjects = 9
            elif i == 2:  # junior high
                subjects = 8
            elif i == 3:  # elementary
                subjects = 2
            for j in range(1, subjects + 1):
                subject_tabs = driver.find_element_by_xpath(
                    "//div[@id='tabChapterSubjects']/a[" + str(j) + "]")
                res_subject = subject_tabs.text
                ActionChains(driver).move_to_element(
                    subject_tabs).click().perform()
                EXIT = False
                time.sleep(1)

                MAX_PAGE = 0
                article_count = int(
                    driver.find_element_by_id('paperTotal').text)
                if article_count != 0:
                    MAX_PAGE = article_count / 15 + 1
                    if MAX_PAGE > 2:
                        MAX_PAGE = 2

                for l in range(2, MAX_PAGE + 2):
                    pages = driver.find_element_by_xpath(
                        "//div[@id='paperListPage']/a[" + str(l) + "]")
                    ActionChains(driver).move_to_element(
                        pages).click().perform()
                    time.sleep(1)
                    for m in range(1, 16):
                        try:
                            Title = driver.find_element_by_xpath(
                                "//ul[@id='paperListCotent']/li[" + str(m) +
                                "]/p[2]/a")
                            res_title = Title.text
                            res_class = driver.find_element_by_xpath(
                                "//ul[@id='paperListCotent']/li[" + str(m) +
                                "]/p[3]/a[1]").text
                            res_downcount = int(
                                driver.find_element_by_xpath(
                                    "//ul[@id='paperListCotent']/li[" +
                                    str(m) + "]/p[5]").text)
                            res_date = dateformat.format_date(
                                driver.find_element_by_xpath(
                                    "//ul[@id='paperListCotent']/li[" +
                                    str(m) + "]/p[6]").text)
                            res_area = driver.find_element_by_xpath(
                                "//ul[@id='paperListCotent']/li[" + str(m) +
                                "]/p[3]/a[3]").text
                            res_url = Title.get_attribute("href")
                            (res_province, res_city, res_county
                             ) = self.pcc_extractor.extraction(res_title)
                            (res_grade,
                             res_class) = self.gc_extractor.extraction(
                                 res_title, res_grade, res_class)
                            res_id = int(filter(str.isdigit, str(res_url)))

                            document = {
                                'res_title': res_title,
                                'res_url': res_url,
                                'res_date': res_date,
                                'res_downcount': res_downcount,
                                'res_point': res_point,
                                'res_subject': res_subject,
                                'res_type': res_type,
                                'res_area': res_area,
                                'res_grade': res_grade,
                                'res_intro': res_intro,
                                'res_class': res_class,
                                'res_version': res_version,
                                'res_province': res_province,
                                'res_city': res_city,
                                'res_county': res_county,
                                'res_id': res_id,
                                'res_file': '',
                                'site_id': 'xiangpi'
                            }

                            print res_url, res_date
                            if self.check_date(res_date, datetime.today()):
                                print 'document too old'
                                EXIT = True
                                break
                            if self.collection.find_one({'res_url': res_url}):
                                EXIT = True
                                print 'document already exists'
                                break
                            self.document_list.append(document)
                        except Exception:
                            traceback.print_exc()
                    if EXIT:
                        break

        if len(self.document_list) > 0:
            self.collection.insert_many(self.document_list)
            self.collection_res.insert_many(self.document_list)

        end_time = datetime.today()
        s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
            end_time,
            self.collection.count() - scount, end_time - start_time)
        print s
        logging.info(s)
        driver.quit()
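
The xiangpi listing shows 15 papers per page, and the loop above visits at most two pages per subject. A small worked sketch of that paging arithmetic (plain Python 2 integer division, mirroring the MAX_PAGE computation above):

def pages_to_visit(article_count):
    # article_count / 15 + 1 pages exist; the crawler caps the visit at 2.
    if article_count == 0:
        return 0
    return min(article_count / 15 + 1, 2)

print pages_to_visit(0)    # 0 - nothing listed for this subject
print pages_to_visit(10)   # 1 - a single partial page
print pages_to_visit(40)   # 2 - three pages exist, but only two are visited
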
Example #4
    def zk_collect(self):
        '''
        Collects from http://zk.canpoint.cn/
        Method: traversal by clicking through the subject and type tabs
        '''

        print 'at zk.canpoint'
        driver = webdriver.PhantomJS()
        driver.get('http://zk.canpoint.cn/')
        start_time = datetime.today()
        scount = self.collection.count()
        res_area, res_grade, res_intro, res_class, res_version = '', '初中', '', '', ''

        for i in range(2, 11):
            subject_tabs = driver.find_element_by_xpath(
                "//li[@id='xueke']/a[" + str(i) + "]")
            res_subject = subject_tabs.text
            ActionChains(driver).move_to_element(subject_tabs).click(
                subject_tabs).perform()
            self.wait_for_load(driver)
            for j in range(2, 8):
                type_tabs = driver.find_element_by_xpath(
                    "//div[@class='bg']/div[@class='menu_right']/ul[@class='nav_a']/li[@id='tongbu']/a["
                    + str(j) + "]")
                ActionChains(driver).move_to_element(
                    type_tabs).click().perform()
                self.wait_for_load(driver)
                res_type = type_tabs.text
                MAX_PAGE = int(
                    driver.find_element_by_xpath(
                        "//div[@class='kuang_b']/span[@class='page']/span[2]").
                    text)
                EXIT = False

                if MAX_PAGE > 3:
                    MAX_PAGE = 3
                else:
                    MAX_PAGE -= 1

                while MAX_PAGE > 0:
                    for k in range(1, 61, 3):
                        try:
                            Title = driver.find_element_by_xpath(
                                "//table[@id='searchTable']/tbody/tr[" +
                                str(k) + "]/td[1]/a")
                            res_title = re.sub(
                                r"<.*>", "",
                                Title.get_attribute('innerHTML')).strip()
                            res_url = Title.get_attribute("href")
                            res_point = int(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[" +
                                    str(k) + "]/td[3]").text)
                            res_date = dateformat.format_date(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[" +
                                    str(k) + "]/td[4]").text)
                            res_downcount = int(
                                driver.find_element_by_xpath(
                                    "//table[@id='searchTable']/tbody/tr[" +
                                    str(k) + "]/td[5]").text)
                            res_id = res_url[33:-5]
                            (res_province, res_city, res_county
                             ) = self.pcc_extractor.extraction(res_title)
                            (res_grade,
                             res_class) = self.gc_extractor.extraction(
                                 res_title, res_grade, res_class)
                            res_intro = self.get_intro(res_url, gk=False)

                            document = {
                                'res_title': res_title,
                                'res_url': res_url,
                                'res_date': res_date,
                                'res_downcount': res_downcount,
                                'res_point': res_point,
                                'res_subject': res_subject,
                                'res_type': res_type,
                                'res_area': res_area,
                                'res_grade': res_grade,
                                'res_intro': res_intro,
                                'res_class': res_class,
                                'res_version': res_version,
                                'res_province': res_province,
                                'res_city': res_city,
                                'res_county': res_county,
                                'res_id': res_id,
                                'res_file': '',
                                'site_id': 'canpoint'
                            }

                            print res_url, res_date
                            if self.check_date(res_date, datetime.today()):
                                print 'document too old'
                                EXIT = True
                                break
                            if res_url in self.res_urls:
                                EXIT = True
                                print 'document already exists'
                                break
                            self.res_urls.add(res_url)
                            self.document_list.append(document)
                        except Exception:
                            traceback.print_exc()
                    if EXIT:
                        break

                    MAX_PAGE -= 1
                    nextpage = driver.find_element_by_xpath(
                        "//div[@class='kuang_b']/span[@class='page']/a[3]")
                    ActionChains(driver).move_to_element(
                        nextpage).click().perform()
                time.sleep(1)

        if len(self.document_list) > 0:
            self.collection.insert_many(self.document_list)
            self.collection_res.insert_many(self.document_list)

        end_time = datetime.today()
        s = '{}, FOUND {} NEW ITEMS. PROCESSING TIME: {}'.format(
            end_time,
            self.collection.count() - scount, end_time - start_time)
        print s
        logging.info(s)
        self.document_list = []
        driver.quit()
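
Both canpoint collectors call self.wait_for_load(driver) after clicking a tab. The helper itself is not shown; below is a minimal sketch (an assumption, not the original implementation) that polls document.readyState until the page has finished loading:

import time

def wait_for_load(driver, timeout=10, poll=0.5):
    # Poll until the browser reports the page as fully loaded, or give up.
    waited = 0
    while waited < timeout:
        if driver.execute_script('return document.readyState') == 'complete':
            return True
        time.sleep(poll)
        waited += poll
    return False
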
Example #5
    def get_info(self, res_id, DATA):
        '''
        Reads all relevant data from the selected download page
        '''

        try:
            d = pq(DATA)
            res_intro, res_type, res_grade, res_class = '', '', '', ''
            res_title = d('h3.paper-title').text()
            table = d('div.c').find('p')
            res_point = float(table.eq(7).find('span a').text())
            res_version = table.eq(1).find('span').text()
            res_downcount = int(table.eq(4).find('span').text()[:-1])
            res_date = dateformat.format_date(table.eq(3).find('span').text())
            res_type_o = table.eq(5).find('span').text()
            for k in self.type_dict.keys():
                if res_type_o in self.type_dict[k]:
                    res_type = k
                    break
            rs_1 = table.eq(0).find('span a').eq(0).text()
            rs_2 = table.eq(0).find('span a').eq(1).text()
            if rs_1 in self.subject_set:
                res_subject = rs_1
            elif rs_2 in self.subject_set:
                res_subject = rs_2
            else:
                res_subject = ''
            (res_province, res_city,
             res_county) = self.pcc_extractor.extraction(res_title)
            (res_grade,
             res_class) = self.gc_extractor.extraction(res_title, res_grade,
                                                       res_class)
            res_url = 'http://www.daliankao.org/down/' + str(res_id) + '.html'
            crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

            document = {
                'res_title': res_title,
                'res_url': res_url,
                'res_date': res_date,
                'res_downcount': res_downcount,
                'res_point': res_point,
                'res_subject': res_subject,
                'res_type': res_type,
                'res_type_o': res_type_o,
                'res_grade': res_grade,
                'res_intro': res_intro,
                'res_class': res_class,
                'res_version': res_version,
                'res_province': res_province,
                'res_city': res_city,
                'res_county': res_county,
                'res_id': res_id,
                'res_file': '',
                'site_id': 'daliankao',
                'date': crawl_date
            }

            print res_url, res_date
            if res_url not in self.res_urls:
                self.collection.insert_one(document)
                self.collection_res.insert_one(document)
            else:
                print 'document exists'
        except Exception:
            tb = traceback.format_exc()
            logging.info('getting {} failed. Reason: {}'.format(res_id, tb))
            print tb
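
The loop over self.type_dict maps the raw label on the page (res_type_o) back to a canonical res_type. A hypothetical type_dict illustrating that reverse lookup; the keys and labels below are placeholders, not the real daliankao categories:

type_dict = {
    'exam': ['monthly exam', 'midterm exam', 'final exam'],
    'courseware': ['teaching courseware'],
}

def canonical_type(res_type_o, type_dict):
    # Return the canonical key whose label list contains the raw page label.
    for k in type_dict:
        if res_type_o in type_dict[k]:
            return k
    return ''

print canonical_type('midterm exam', type_dict)   # 'exam'
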
Example #6
    def get_info(self, html, res_url, res_id):
        '''
        Reads all relevant data from the selected download page
        '''

        try:
            d = pq(html)
            res_downcount = ''
            class_subject = d('.xq_goo ul li').eq(3).find(
                'span a').text().split()
            res_class = class_subject[0]
            res_subject = class_subject[1]
            res_title = d('.xq_h.xq_sdhahdwq ul li a').text()
            res_date = dateformat.format_date(
                d('.xq_goo ul li').eq(0).text().split()[1])
            res_point = d('.xq_goo ul li').eq(4).find('span font').text()
            res_type = d('.xq_goo ul li').eq(2).find('span a').eq(0).text()
            res_version = d('.xq_goo ul li').eq(1).find('span a').eq(0).text()
            res_intro = d('#contenthuidai').text()
            crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
            try:
                # Keep only the digit characters of the point label.
                res_point = int(filter(unicode.isdigit, res_point))
            except (TypeError, ValueError):
                # Leave non-numeric point labels as they are.
                pass
            (res_province, res_city,
             res_county) = self.pcc_extractor.extraction(res_title)
            (res_grade,
             res_class) = self.gc_extractor.extraction(res_title, '',
                                                       res_class)

            document = {
                'res_title': res_title,
                'res_url': res_url,
                'res_date': res_date,
                'res_downcount': res_downcount,
                'res_point': res_point,
                'res_subject': res_subject,
                'res_type': res_type,
                'res_grade': res_grade,
                'res_intro': res_intro,
                'res_class': res_class,
                'res_version': res_version,
                'res_province': res_province,
                'res_city': res_city,
                'res_county': res_county,
                'res_id': res_id,
                'res_file': '',
                'site_id': 'dearedu',
                'date': crawl_date
            }

            print res_url, res_date

            if res_url not in self.res_urls:
                self.collection.insert_one(document)
                self.collection_res.insert_one(document)
            else:
                print 'document exists'
        except Exception:
            tb = traceback.format_exc()
            logging.info('getting {} failed. Reason: {}'.format(res_url, tb))
            print tb
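
This example skips documents whose res_url is already in self.res_urls. A minimal sketch (an assumption) of preloading that set from the collection, reusing the client.spider.dearedu wiring shown in Example #8; host and port are placeholders:

import pymongo

client = pymongo.MongoClient('localhost', 27017)   # placeholder host/port
collection = client.spider.dearedu
res_urls = set(doc['res_url'] for doc in collection.find({}, {'res_url': 1}))
client.close()
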
Example #7
    def get_info(self, res_id, DATA):
        '''
        Reads all relevant data from the selected download page
        '''

        try:
            d = pq(DATA)
            res_grade = ''
            res_title = d('div.nexttopR_head h1.ell').text()
            res_url = 'http://taoti.tl100.com/detail-' + str(res_id) + '.html'
            res_point = int(d('#point strong').text())
            table1 = d('#sx tr')
            res_class = table1('td').eq(1).text()
            res_intro = d('div.contentbox table td p').text()
            res_subject = table1('td').eq(3).text()
            res_version = table1('td').eq(5).text()
            res_type = table1('td').eq(7).text()
            table2 = d('div.title2 tr td').eq(0).text().split('|')
            res_date = dateformat.format_date(table2[1][6:])
            rp = table2[2][5:]
            (res_province, res_city,
             res_county) = self.pcc_extractor.extraction(res_title)
            (res_grade, res_class) = self.gc_extractor.extraction(
                res_title,
                res_grade,
                res_class,
            )
            res_province = rp
            res_downcount = int(d('#hits').text())
            crawl_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

            document = {
                'res_title': res_title,
                'res_url': res_url,
                'res_date': res_date,
                'res_downcount': res_downcount,
                'res_point': res_point,
                'res_subject': res_subject,
                'res_type': res_type,
                'res_grade': res_grade,
                'res_intro': res_intro,
                'res_class': res_class,
                'res_version': res_version,
                'res_province': res_province,
                'res_city': res_city,
                'res_county': res_county,
                'res_id': res_id,
                'res_file': '',
                'site_id': 'tl100',
                'date': crawl_date
            }

            print res_url, res_date
            if res_url not in self.res_urls:
                self.collection.insert_one(document)
                self.collection_res.insert_one(document)
            else:
                print 'document exists'
        except Exception:
            tb = traceback.format_exc()
            logging.info('getting {} failed. Reason: {}'.format(res_id, tb))
            print tb
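
The div.title2 cell is split on '|' and then sliced with fixed-width label prefixes ([6:] for the date, [5:] for the province). A worked sketch with a made-up row; the real labels come from the tl100 page and are not shown here:

table2 = u'some title part|date: 2016-09-01|area:Beijing'.split('|')
print table2[1][6:]   # u'2016-09-01' - the 6-character label prefix is dropped
print table2[2][5:]   # u'Beijing'    - the 5-character label prefix is dropped
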
Example #8
    def execute(self, grades):
        '''
        Reads all relevant data for the given grade into a document list.
        Each process is tasked with one grade.
        '''

        document_list = []
        client = pymongo.MongoClient(conf.mongo.ip, conf.mongo.port)
        collection = client.spider.dearedu
        collection_res = client.spider.res
        res_intro, res_downcount = '', ''
        res_grade = grades.text()
        grade_url = grades.find('a').attr('href')
        PAGE_COUNT = 1
        EXIT = False
        while PAGE_COUNT <= 100:
            page_extension = '&p=' + str(PAGE_COUNT)
            pq_page = pq(self.read_url(grade_url + page_extension))
            for i in range(0, 10):
                try:
                    articles = pq_page(
                        '.z_right .lb_zs .lb_grey .lb_aload .lb_bleft').eq(i)
                    tmp = articles('span').text()
                    order_text = tmp[4:]
                    res_title = articles('h1').find('a').text()
                    res_url = articles('h1').find('a').attr('href')
                    date = articles('p').eq(0).text()
                    area = articles('p').eq(1).text()
                    point = articles('p').eq(3).text()
                    res_date = dateformat.format_date(date[3:])
                    res_area = area[3:]
                    res_point = point[3:]
                    if res_point != u'免费':
                        res_point = int(res_point)
                    (res_subject, res_type,
                     res_class) = self.extract_info(order_text)
                    res_id = int(filter(str.isdigit, str(res_url)))
                    (res_province, res_city,
                     res_county) = self.pcc_extractor.extraction(res_title)
                    (res_grade, res_class) = self.gc_extractor.extraction(
                        res_title, res_grade, res_class)
                    res_version = self.res_version(res_title, res_grade)

                    document = {
                        'res_title': res_title,
                        'res_url': res_url,
                        'res_date': res_date,
                        'res_downcount': res_downcount,
                        'res_point': res_point,
                        'res_subject': res_subject,
                        'res_type': res_type,
                        'res_area': res_area,
                        'res_grade': res_grade,
                        'res_intro': res_intro,
                        'res_class': res_class,
                        'res_version': res_version,
                        'res_province': res_province,
                        'res_city': res_city,
                        'res_county': res_county,
                        'res_file': '',
                        'site_id': 'dearedu',
                        'res_id': res_id
                    }

                    print res_url, res_date
                    if self.check_date(res_date, datetime.today()):
                        print 'document too old'
                        EXIT = True
                        break
                    if collection.find_one({'res_url': res_url}):
                        EXIT = True
                        print 'document already exists'
                        break
                    document_list.append(document)
                except Exception:
                    traceback.print_exc()
            if EXIT:
                break
            PAGE_COUNT += 1
        if len(document_list) > 0:
            collection.insert_many(document_list)
            collection_res.insert_many(document_list)
        client.close()
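
The docstring says each process is tasked with one grade, but the dispatch code is not part of this example. A minimal sketch (an assumption) of fanning execute() out across worker processes; whether the spider object and the pq grade elements pickle cleanly depends on the real classes:

from multiprocessing import Process

def run_per_grade(spider, grade_elements):
    # One worker per grade tab, mirroring "each process is tasked with a grade".
    workers = [Process(target=spider.execute, args=(grades,)) for grades in grade_elements]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
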