def spider_website(self, keyword):
    """Search for *keyword*, open the first result and scrape its posts.

    Drives ``self.client`` (an object exposing the Selenium-style
    ``find_element(s)_by_*`` API): types *keyword* into the element named
    ``query``, clicks the search form's submit input, follows the first
    result link and then walks the post list page by page.

    For every post whose text element is found, a dict with the keys
    ``created_time_timestamp``, ``content``, ``per_favourity_count``,
    ``author`` and ``lang`` is handed to ``self.process_item``.  Posts whose
    text element cannot be located are skipped.  Paging stops as soon as the
    "more" link can no longer be found or clicked.  Nothing is returned.

    NOTE(review): the loop is only entered when a "more" link exists on the
    first page, so a single-page result is not scraped at all -- confirm
    that this is intended.
    NOTE(review): the absolute XPaths are tied to one specific page layout.
    """
    more_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[2]/a'
    # XPath of the count-th post container (1-based, as XPath indexes are).
    post_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[%d]'

    client = self.client

    search = client.find_element_by_name("query")
    search.send_keys(keyword)
    search_submit = client.find_element_by_xpath('/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input')
    search_submit.click()
    person = client.find_element_by_xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a')
    person.click()

    more = client.find_elements_by_xpath(more_xpath)
    while more:
        posts = client.find_elements_by_css_selector("#recent>div>div>div[id^='u_0_']")
        cts = len(posts) + 1
        print(cts)
        for count in range(1, cts):
            print(count)
            row = post_xpath % count
            try:
                content = client.find_element_by_xpath(row + '/div[1]/div[2]/span').text
            except Exception:
                # No text element for this post: skip it.  ``Exception``
                # (not a bare ``except``) keeps Ctrl-C / SystemExit working.
                continue

            c_time = client.find_element_by_xpath(row + '/div[2]/div[1]').text
            # time2stamp() must yield 'YYYY-mm-dd HH:MM:SS', since that is
            # the format its result is parsed with below.
            created_time = time2stamp(c_time)
            created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

            per_favourity = client.find_element_by_xpath(row + '/div[2]/div[2]/span[1]/a').text
            # Drop thousands separators and keep the first space-separated
            # token of the link text.
            per_favourity = per_favourity.replace(',', '')
            per_favourity_count = per_favourity.split(' ')[0]

            item = {
                'created_time_timestamp': created_time_timestamp,
                'content': content,
                'per_favourity_count': per_favourity_count,
                'author': 'tsaiingwen',
                'lang': '',
            }
            self.process_item(item)

        try:
            more = client.find_element_by_xpath(more_xpath)
            more.click()
        except Exception:
            # No further page (or the link is not clickable): done.
            break
def spider_website(self, keyword):
    """Search for *keyword*, open the first result and scrape its posts.

    Drives ``self.client`` (an object exposing the Selenium-style
    ``find_element(s)_by_*`` API): types *keyword* into the element named
    ``query``, clicks the search form's submit input, follows the first
    result link and then walks the post list page by page.

    For every post a dict with the keys ``created_time_timestamp``,
    ``content``, ``per_favourity_count``, ``author`` and ``lang`` is handed
    to ``self.process_item``.  The post text is read from a ``span`` first;
    if that lookup fails, an alternative link-based location is tried, and a
    failure there propagates to the caller.  Paging stops as soon as the
    "more" link can no longer be found or clicked.  Nothing is returned.

    NOTE(review): the loop is only entered when a "more" link exists on the
    first page, so a single-page result is not scraped at all -- confirm
    that this is intended.
    NOTE(review): the absolute XPaths are tied to one specific page layout.
    """
    more_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[2]/a'
    # XPath of the count-th post container (1-based, as XPath indexes are).
    post_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[%d]'
    # Alternative location of the post text, used when the span is missing.
    fallback_content_xpath = '/html/body/div/div/div[2]/div/div/div[2]/div[2]/div/div/div/div/div[%d]/div/div[2]/div/a'

    client = self.client

    search = client.find_element_by_name("query")
    search.send_keys(keyword)
    search_submit = client.find_element_by_xpath('/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input')
    search_submit.click()
    person = client.find_element_by_xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a')
    person.click()

    more = client.find_elements_by_xpath(more_xpath)
    while more:
        posts = client.find_elements_by_css_selector("#recent>div>div>div[id^='u_0_']")
        cts = len(posts) + 1
        print(cts)
        for count in range(1, cts):
            print(count)
            row = post_xpath % count
            try:
                content = client.find_element_by_xpath(row + '/div[1]/div[2]/span').text
            except Exception:
                # ``Exception`` (not a bare ``except``) keeps Ctrl-C /
                # SystemExit working while still covering lookup failures.
                content = client.find_element_by_xpath(fallback_content_xpath % count).text

            c_time = client.find_element_by_xpath(row + '/div[2]/div[1]').text
            # time2stamp() must yield 'YYYY-mm-dd HH:MM:SS', since that is
            # the format its result is parsed with below.
            created_time = time2stamp(c_time)
            created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

            per_favourity = client.find_element_by_xpath(row + '/div[2]/div[2]/span[1]/a').text
            # Drop thousands separators and keep the first space-separated
            # token of the link text.
            per_favourity = per_favourity.replace(',', '')
            per_favourity_count = per_favourity.split(' ')[0]

            item = {
                'created_time_timestamp': created_time_timestamp,
                'content': content,
                'per_favourity_count': per_favourity_count,
                'author': 'llchu',
                'lang': '',
            }
            self.process_item(item)

        try:
            more = client.find_element_by_xpath(more_xpath)
            more.click()
        except Exception:
            # No further page (or the link is not clickable): done.
            break
def spider_website(self, keyword):
    """Search for *keyword*, open the first result and walk its 2015 posts.

    Drives ``self.client`` (an object exposing the Selenium-style
    ``find_element(s)_by_*`` API): types *keyword* into the element named
    ``query``, clicks the search form's submit input, follows the first
    result link, clicks the link labelled with the year 2015 and then walks
    the post list page by page.

    For every post whose text element is found, the creation timestamp, the
    text and the first token of the counter link are collected into a dict.
    Posts whose text element cannot be located are skipped.  Paging
    continues while the pagination link carries the "more" label and stops
    otherwise.  Nothing is returned.

    NOTE(review): the collected item is currently NOT passed to
    ``self.process_item`` -- that call is disabled below, exactly as in the
    original code.  Confirm whether it should be re-enabled.
    NOTE(review): the absolute XPaths are tied to one specific page layout.
    """
    more_xpath = '/html/body/div/div/div[2]/div/div/div/div[2]/a'
    # XPath of the count-th post container (1-based, as XPath indexes are).
    post_xpath = '/html/body/div/div/div[2]/div/div/div/div/div[2]/div/div[%d]'

    client = self.client

    search = client.find_element_by_name("query")
    search.send_keys(keyword)
    search_submit = client.find_element_by_xpath('/html/body/div/div/div[1]/div/form/table/tbody/tr/td[3]/input')
    search_submit.click()
    person = client.find_element_by_xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div/div/form/div/div/table/tbody/tr[1]/td[2]/a')
    person.click()

    history2015 = client.find_element_by_link_text(u'2015年')
    if history2015:
        history2015.click()

    more = client.find_element_by_xpath(more_xpath)
    while more:
        posts = client.find_elements_by_css_selector("div[id^='u_0_']")
        cts = len(posts) + 1
        print(cts)
        for count in range(1, cts):
            print(count)
            row = post_xpath % count
            try:
                content = client.find_element_by_xpath(row + '/div/div[2]/span').text
            except Exception:
                # No text element for this post: skip it.  ``Exception``
                # (not a bare ``except``) keeps Ctrl-C / SystemExit working.
                continue

            c_time = client.find_element_by_xpath(row + '/div[2]/div').text
            # time2stamp() must yield 'YYYY-mm-dd HH:MM:SS', since that is
            # the format its result is parsed with below.
            created_time = time2stamp(c_time)
            created_time_timestamp = int(time.mktime(time.strptime(created_time, '%Y-%m-%d %H:%M:%S')))

            per_favourity = client.find_element_by_xpath(row + '/div[2]/div[2]/span/a').text
            # Drop thousands separators and keep the first space-separated
            # token of the link text.
            per_favourity = per_favourity.replace(',', '')
            per_favourity_count = per_favourity.split(' ')[0]

            item = {
                'created_time_timestamp': created_time_timestamp,
                'content': content,
                'per_favourity_count': per_favourity_count,
                'author': 'soong',
                'lang': '',
            }
            # NOTE(review): forwarding is disabled; the item is built and
            # then discarded.
            #self.process_item(item)

        # Look the pagination link up once and reuse it for both the label
        # check and the click.
        more_link = client.find_element_by_xpath(more_xpath)
        more = more_link.text
        if more == u'更多':
            more_link.click()
        else:
            break