Пример #1
0
    def spider_website(self, keyword):
        """Search for *keyword*, open the first result and scrape its posts.

        Uses ``self.client`` (presumably a Selenium WebDriver -- the
        ``find_element_by_*`` calls match that API) to type *keyword* into the
        ``query`` field, submit the form and follow the link in the first
        result row.  It then walks the paginated post list.  For every post on
        the current page it reads the post text, the creation time and the
        leading number of the "per_favourity" link text, and passes them to
        ``self.process_item`` as a dict.  After each page the "more" link is
        clicked; the crawl ends when that link cannot be found or clicked.

        Posts whose text element cannot be located are skipped.  A missing
        time or counter element on a post is not handled and propagates.

        :param keyword: text typed into the search field.
        :returns: None.
        """
        client = self.client

        # All locators are absolute XPaths tied to the page layout.
        submit_xpath = '/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input'
        person_xpath = '/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a'
        more_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[2]/a'
        # %d is the 1-based position of a post inside the list container.
        post_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[%d]'

        search = client.find_element_by_name("query")
        search.send_keys(keyword)
        client.find_element_by_xpath(submit_xpath).click()
        client.find_element_by_xpath(person_xpath).click()

        # NOTE(review): as in the original, nothing is scraped when the page
        # has no "more" link to begin with -- confirm this is intended for
        # profiles that fit on a single page.
        if not client.find_elements_by_xpath(more_xpath):
            return

        while True:
            posts = client.find_elements_by_css_selector("#recent>div>div>div[id^='u_0_']")
            cts = len(posts) + 1
            # Debug output; the parenthesised form prints the same on Python 2 and 3.
            print(cts)

            for count in range(1, cts):
                print(count)
                post = post_xpath % count

                try:
                    content = client.find_element_by_xpath(post + "/div[1]/div[2]/span").text
                except Exception:
                    # No text element at the expected place: skip this post.
                    continue

                c_time = client.find_element_by_xpath(post + "/div[2]/div[1]").text
                # time2stamp is defined elsewhere; its result is parsed below
                # as a 'YYYY-MM-DD HH:MM:SS' string in local time.
                created_time = time2stamp(c_time)
                created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

                per_favourity = client.find_element_by_xpath(post + "/div[2]/div[2]/span[1]/a").text
                # Drop thousands separators, then keep the part before the first space.
                per_favourity_count = per_favourity.replace(',', '').split(' ', 1)[0]

                item = {
                    'created_time_timestamp': created_time_timestamp,
                    'content': content,
                    'per_favourity_count': per_favourity_count,
                    'author': 'tsaiingwen',
                    'lang': ''
                }
                self.process_item(item)

            try:
                client.find_element_by_xpath(more_xpath).click()
            except Exception:
                # The "more" link is gone or could not be clicked: last page.
                break
Пример #2
0
    def spider_website(self, keyword):
        """Search for *keyword*, open the first result and scrape its posts.

        Uses ``self.client`` (presumably a Selenium WebDriver -- the
        ``find_element_by_*`` calls match that API) to type *keyword* into the
        ``query`` field, submit the form and follow the link in the first
        result row.  It then walks the paginated post list.  For every post on
        the current page it reads the post text, the creation time and the
        leading number of the "per_favourity" link text, and passes them to
        ``self.process_item`` as a dict.  After each page the "more" link is
        clicked; the crawl ends when that link cannot be found or clicked.

        When the primary text element of a post is missing, the text of an
        alternative link element is used instead.  If that lookup, the time
        lookup or the counter lookup fails, the exception propagates.

        :param keyword: text typed into the search field.
        :returns: None.
        """
        client = self.client

        # All locators are absolute XPaths tied to the page layout.
        submit_xpath = '/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input'
        person_xpath = '/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a'
        more_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[2]/a'
        # %d is the 1-based position of a post inside the list container.
        post_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[%d]'
        # Same post addressed through a slightly different container path;
        # only used for the fallback text lookup.
        alt_post_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div/div/div/div[%d]'

        search = client.find_element_by_name("query")
        search.send_keys(keyword)
        client.find_element_by_xpath(submit_xpath).click()
        client.find_element_by_xpath(person_xpath).click()

        # NOTE(review): as in the original, nothing is scraped when the page
        # has no "more" link to begin with -- confirm this is intended for
        # profiles that fit on a single page.
        if not client.find_elements_by_xpath(more_xpath):
            return

        while True:
            posts = client.find_elements_by_css_selector("#recent>div>div>div[id^='u_0_']")
            cts = len(posts) + 1
            # Debug output; the parenthesised form prints the same on Python 2 and 3.
            print(cts)

            for count in range(1, cts):
                print(count)
                post = post_xpath % count

                try:
                    content = client.find_element_by_xpath(post + "/div[1]/div[2]/span").text
                except Exception:
                    # Primary text element missing: fall back to the link text.
                    content = client.find_element_by_xpath((alt_post_xpath % count) + "/div/div[2]/div/a").text

                c_time = client.find_element_by_xpath(post + "/div[2]/div[1]").text
                # time2stamp is defined elsewhere; its result is parsed below
                # as a 'YYYY-MM-DD HH:MM:SS' string in local time.
                created_time = time2stamp(c_time)
                created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

                per_favourity = client.find_element_by_xpath(post + "/div[2]/div[2]/span[1]/a").text
                # Drop thousands separators, then keep the part before the first space.
                per_favourity_count = per_favourity.replace(',', '').split(' ', 1)[0]

                item = {
                    'created_time_timestamp': created_time_timestamp,
                    'content': content,
                    'per_favourity_count': per_favourity_count,
                    'author': 'llchu',
                    'lang': ''
                }
                self.process_item(item)

            try:
                client.find_element_by_xpath(more_xpath).click()
            except Exception:
                # The "more" link is gone or could not be clicked: last page.
                break
Пример #3
0
    def spider_website(self, keyword):
        """Search for *keyword*, open the first result and walk its 2015 posts.

        Uses ``self.client`` (presumably a Selenium WebDriver -- the
        ``find_element_by_*`` calls match that API) to type *keyword* into the
        ``query`` field, submit the form and follow the link in the first
        result row.  If a link labelled ``2015年`` is present it is clicked.
        The method then walks the paginated post list, reading the post text,
        the creation time and the leading number of the "per_favourity" link
        text for every post, and clicks the ``更多`` link after each page
        until that link is missing or carries a different label.

        Posts whose text element cannot be located are skipped.  A missing
        time or counter element on a post is not handled and propagates.

        :param keyword: text typed into the search field.
        :returns: None.
        """
        client = self.client

        # All locators are absolute XPaths tied to the page layout.
        submit_xpath = '/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input'
        person_xpath = '/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a'
        more_xpath = '/html/body/div/div/div[2]/div/div/div/div[2]/a'
        # %d is the 1-based position of a post inside the list container.
        post_xpath = '/html/body/div/div/div[2]/div/div/div/div/div[2]/div/div[%d]'

        search = client.find_element_by_name("query")
        search.send_keys(keyword)
        client.find_element_by_xpath(submit_xpath).click()
        client.find_element_by_xpath(person_xpath).click()

        # The list-returning lookup yields [] when the link is absent, so the
        # presence check below is meaningful (the single-element lookup raises
        # instead of returning a falsy value).
        history_links = client.find_elements_by_link_text(u'2015年')
        if history_links:
            history_links[0].click()

        # Only start crawling when a pagination link exists; an absent link
        # ends the method quietly instead of raising.
        if not client.find_elements_by_xpath(more_xpath):
            return

        while True:
            posts = client.find_elements_by_css_selector("div[id^='u_0_']")
            cts = len(posts) + 1
            # Debug output; the parenthesised form prints the same on Python 2 and 3.
            print(cts)

            for count in range(1, cts):
                print(count)
                post = post_xpath % count

                try:
                    content = client.find_element_by_xpath(post + "/div/div[2]/span").text
                except Exception:
                    # No text element at the expected place: skip this post.
                    continue

                c_time = client.find_element_by_xpath(post + "/div[2]/div").text
                # time2stamp is defined elsewhere; its result is parsed below
                # as a 'YYYY-MM-DD HH:MM:SS' string in local time.
                created_time = time2stamp(c_time)
                created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

                per_favourity = client.find_element_by_xpath(post + "/div[2]/div[2]/span/a").text
                # Drop thousands separators, then keep the part before the first space.
                per_favourity_count = per_favourity.replace(',', '').split(' ', 1)[0]

                item = {
                    'created_time_timestamp': created_time_timestamp,
                    'content': content,
                    'per_favourity_count': per_favourity_count,
                    'author': 'soong',
                    'lang': ''
                }
                # NOTE(review): the item is built but not forwarded; the
                # self.process_item(item) call was already disabled in the
                # original.  Confirm whether it should be re-enabled.

            # Look the link up once; stop cleanly when it is gone or is no
            # longer the "more" link.
            more_links = client.find_elements_by_xpath(more_xpath)
            if not more_links or more_links[0].text != u'更多':
                break
            more_links[0].click()