class FengReceptorContent():
    """Fetch the pages listed in a CSV file and store their time/source text.

    Each CSV row supplies a key (column 0) and a URL (column 1).  Every URL
    is opened through ``BrowserRequest.start_chrome`` and handed to
    :meth:`parse`, which writes ``{'id', 'url', 'date'}`` to the store named
    by ``self.mongo``.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        """Set the hard-coded paths and storage names used by this receptor."""
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor_content"
        self.finished_ids = "feng_receptor_content"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        """Extract the time text from one loaded page and persist it.

        ``response`` is a mapping with a ``'response'`` entry (an object
        exposing ``current_url`` and ``page_source``) and a
        ``'request_title'`` entry holding the key passed to
        ``start_requests``.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        containers = html.xpath(".//*[contains(@class, 'newLine-4rktaWav')]")
        if not containers:
            print('No data for: {0}'.format(key))
            return
        # Only the first matching container is inspected.
        texts = containers[0].xpath(
            ".//*[contains(@class, 'time-RyJJYUOX')]/text()")
        time_source = ''.join(texts).strip()
        # NOTE(review): the finished id recorded here is the URL, whereas
        # start_requests compares CSV column 0 against the finished set --
        # confirm which of the two getAllHasSet is expected to return.
        self.doraemon.hashSet(self.finished_ids, current_url, current_url)
        data = {'id': key, 'url': current_url, 'date': time_source}
        print('Start to store mongo {0}'.format(data['url']))
        self.doraemon.storeMongodb(self.mongo, data)
        print('Finished for {0}'.format(key))

    def start_requests(self):
        """Queue every unfinished CSV row for browsing.

        Returns ``None`` in all cases; when nothing is left to fetch the
        browser is never started.
        """
        print('Start requests')
        # A set gives O(1) membership tests inside the loop below.
        finished = set(self.doraemon.getAllHasSet(self.finished_ids))
        file_path = '/home/dev/Data/rsyncData/test/feng_receptor.csv'
        rows = self.file.readFromCSV(file_path)
        new_urls = []
        # rows[1:] drops the header row without raising on an empty file
        # (the previous items.pop(0) raised IndexError there).
        for row in rows[1:]:
            if len(row) < 2:
                # Blank or truncated row: there is no URL to request.
                continue
            key = row[0]
            if key in finished:
                continue
            new_urls.append([row[1], key.strip()])
        if not new_urls:
            print('No more urls.')
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None,
                             callback=self.parse)
class ChuansongmeReceptor():
    """Check chuansongme.com account pages for the keys listed in a text file.

    For every key a ``https://chuansongme.com/account/<key>`` page is opened
    through ``BrowserRequest.start_chrome``; :meth:`parse` stores the key and
    the final URL when the page contains list items.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        """Set the hard-coded paths and storage names used by this receptor."""
        self.work_path_prd2 = "/home/dev/Data/rsyncData/"
        self.mongo = "gongzhonghao_test"
        self.finished_ids = "gongzhonghao_test"
        self.log_path = "/home/dev/Data/rsyncData/"

    def parse(self, response):
        """Record the account when its page shows at least one list item.

        ``response`` is a mapping with a ``'response'`` entry (an object
        exposing ``current_url`` and ``page_source``) and a
        ``'request_title'`` entry holding the requested key.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # etree.HTML returns the <html> root, so the former "./*" path only
        # looked at its direct children (<head>/<body>) and could not reach
        # list items nested in the body; ".//*" searches all descendants,
        # matching the other receptors in this file.
        list_items = html.xpath(".//*[contains(@class, 'pagedlist_item')]")
        if not list_items:
            print('No data for: {0}'.format(key))
            return
        self.doraemon.hashSet(self.finished_ids, key, key)
        data = {
            'id': key,
            'url': current_url
        }
        print('Start to store mongo {0}'.format(data['url']))
        self.doraemon.storeMongodb(self.mongo, data)
        print('Finished for {0}'.format(key))

    def start_requests(self):
        """Queue an account page for every key that is not finished yet.

        Returns ``None`` in all cases; when nothing is left to fetch the
        browser is never started.
        """
        print('Start requests')
        # A set gives O(1) membership tests inside the loop below.
        finished = set(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        content = self.file.readFromTxt(txt_path)
        new_urls = []
        for line in content.split('\n'):
            # parse() records the stripped key, so strip before comparing;
            # this also removes a stray '\r' from CRLF files.
            key = line.strip()
            if not key:
                # Blank line (e.g. trailing newline): nothing to request.
                continue
            if key in finished:
                continue
            tmp_url = "https://chuansongme.com/account/{0}".format(key)
            new_urls.append([tmp_url, key])
        if not new_urls:
            print('No more urls.')
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None,
                             callback=self.parse)
class XueqiuReceptor():
    """Resolve xueqiu.com user names from a text file to profile URLs.

    Each name is looked up through ``https://xueqiu.com/k?q=<name>``;
    :meth:`parse` stores a record for every user card whose displayed name
    equals the requested name.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        """Set the hard-coded paths and storage names used by this receptor."""
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "xueqiu_test"
        self.finished_ids = "xueqiu_test"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        """Store the profile URL of each user card matching the requested name.

        ``response`` is a mapping with a ``'response'`` entry (an object
        exposing ``current_url`` and ``page_source``) and a
        ``'request_title'`` entry holding the requested name.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        cards = html.xpath(
            ".//*[contains(@class, 'search__user__card__content')]")
        if not cards:
            print('No data for: {0}'.format(key))
            return
        for card in cards:
            href = card.xpath(".//*[contains(@class, 'user-name')]/@href")
            title_content = card.xpath(
                ".//*[contains(@class, 'user-name')]//span/text()")
            title = "".join(title_content).strip()
            if not href or title != key:
                continue
            url = "https://xueqiu.com/u{0}".format(href[0])
            # NOTE(review): the finished id recorded here is the URL,
            # whereas start_requests compares names against the finished
            # set -- confirm what getAllHasSet is expected to return.
            self.doraemon.hashSet(self.finished_ids, url, url)
            data = {'id': key, 'url': url}
            print('Start to store mongo {0}'.format(data['url']))
            self.doraemon.storeMongodb(self.mongo, data)
            print('Finished for {0}'.format(key))

    def start_requests(self):
        """Queue a search page for every name that is not finished yet.

        Returns ``None`` in all cases; when nothing is left to fetch the
        browser is never started.
        """
        print('Start requests')
        # A set gives O(1) membership tests inside the loop below.
        finished = set(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        content = self.file.readFromTxt(txt_path)
        new_urls = []
        for line in content.split('\n'):
            # Strip before the finished check so padded lines compare equal
            # to the stripped name that is actually requested.
            name = line.strip()
            if not name:
                # Blank line (e.g. trailing newline): nothing to search for.
                continue
            if name in finished:
                continue
            tmp_url = "https://xueqiu.com/k?q={0}".format(name)
            new_urls.append([tmp_url, name])
        if not new_urls:
            print('No more urls.')
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None,
                             callback=self.parse)
class WoshipmReceptor():
    """Resolve woshipm.com author names from a text file to profile URLs.

    Each name is searched through
    ``http://www.woshipm.com/search-posts?k=<name>``; :meth:`parse` stores a
    record for every link whose text equals the name and whose href matches
    ``self.regx``.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        """Set the hard-coded paths, storage names and href pattern."""
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        # NOTE(review): spelled "whoispm" while finished_ids uses "woshipm"
        # -- possibly a typo; left unchanged because it names existing
        # storage.
        self.mongo = "whoispm_receptor"
        self.finished_ids = "woshipm_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"
        # Used with match(), which anchors only at the start, and "{0,}"
        # allows zero digits, so any href beginning with "/u/" passes.
        # NOTE(review): tighten if purely numeric ids are required.
        self.regx = re.compile("/u/[0-9]{0,}")

    def parse(self, response):
        """Store the profile URL of each link whose text equals the name.

        ``response`` is a mapping with a ``'response'`` entry (an object
        exposing ``current_url`` and ``page_source``) and a
        ``'request_title'`` entry holding the requested name.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # etree.HTML returns the <html> root, whose direct children are
        # <head>/<body>; the former "./a" path therefore never matched any
        # link.  ".//a" searches all descendants.
        links = html.xpath(".//a")
        if not links:
            print('No data for: {0}'.format(key))
            return
        for link in links:
            href = link.xpath("@href")
            title_content = link.xpath(".//text()")
            title = "".join(title_content).strip()
            if not href or title != key:
                continue
            if self.regx.match(href[0]) is None:
                print('Invalid url for not match: {0}'.format(href[0]))
                continue
            url = "http://www.woshipm.com{0}".format(href[0])
            # NOTE(review): the finished id recorded here is the URL,
            # whereas start_requests compares names against the finished
            # set -- confirm what getAllHasSet is expected to return.
            self.doraemon.hashSet(self.finished_ids, url, url)
            data = {
                'id': key,
                'url': url
            }
            print('Start to store mongo {0}'.format(data['url']))
            self.doraemon.storeMongodb(self.mongo, data)
            print('Finished for {0}'.format(key))

    def start_requests(self):
        """Queue a search page for every name that is not finished yet.

        Returns ``None`` in all cases; when nothing is left to fetch the
        browser is never started.
        """
        print('Start requests')
        # A set gives O(1) membership tests inside the loop below.
        finished = set(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        content = self.file.readFromTxt(txt_path)
        new_urls = []
        for line in content.split('\n'):
            key = line.strip()
            if not key:
                # Blank line (e.g. trailing newline): nothing to search for.
                continue
            if key in finished:
                print('Finished or no data for {0}'.format(key))
                # Re-marks a key that is already in the finished set; kept
                # as in the original flow.
                self.doraemon.hashSet(self.finished_ids, key, key)
                continue
            tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(key)
            new_urls.append([tmp_url, key])
        if not new_urls:
            print('No more urls.')
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None,
                             callback=self.parse)
class FengReceptor():
    """Resolve ifeng author names from a text file to author-page URLs.

    Each name is sent to the ifeng search endpoint (a JSONP response with
    ``callback=getData``); :meth:`parse` stores a record for every returned
    item whose cleaned name equals the requested name.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        """Set the hard-coded paths and storage names used by this receptor."""
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor"
        self.finished_ids = "feng_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        """Store the author URL of each search hit matching the requested name.

        ``response`` is a mapping with a ``'response'`` entry (an object
        exposing ``current_url`` and ``page_source``) and a
        ``'request_title'`` entry holding the requested name.  A name with
        no search hits is marked as finished so it is not requested again.
        """
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        key = response['request_title'].strip()
        page = response['response'].page_source.encode('utf-8')
        # Keep what lies between the first "(" and the last 21 characters --
        # presumably the ")</pre></body></html>" tail the browser wraps
        # around the JSONP body; TODO confirm.
        payload = page[page.find('(') + 1:-21]
        payload = payload.replace('null', 'None')
        # SECURITY NOTE(review): eval() executes text downloaded from a
        # remote server as Python code.  Left in place to preserve current
        # behaviour; a real JSON parser should replace it once the string
        # type of the decoded values has been checked against callers.
        dics = eval(payload)
        if not dics['items']:
            print('No data for: {0}'.format(key))
            self.doraemon.hashSet(self.finished_ids, key, key)
            return
        for item in dics['items']:
            # Removes the "<em>" / "<\/em>" highlight markup around the name.
            name = item['name'].replace('<', '').replace('em>', '').replace('\\/', '')
            author_id = item['id']
            if len(author_id) > 0 and name == key:
                url = "https://feng.ifeng.com/author/{0}".format(author_id)
                self.doraemon.hashSet(self.finished_ids, key, key)
                data = {
                    'id': key,
                    'url': url
                }
                print('Start to store mongo {0}'.format(data['url']))
                self.doraemon.storeMongodb(self.mongo, data)
                print('Finished for {0}'.format(key))

    def start_requests(self):
        """Queue a search request for every name that is not finished yet.

        Returns ``None`` in all cases; when nothing is left to fetch the
        browser is never started.
        """
        print('Start requests')
        # A set gives O(1) membership tests inside the loop below.
        finished = set(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        content = self.file.readFromTxt(txt_path)
        new_urls = []
        for line in content.split('\n'):
            key = line.strip()
            if not key:
                # Blank line (e.g. trailing newline): nothing to search for.
                continue
            if key in finished:
                print('Finished or no data for {0}'.format(key))
                # Re-marks a key that is already in the finished set; kept
                # as in the original flow.
                self.doraemon.hashSet(self.finished_ids, key, key)
                continue
            tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(key)
            new_urls.append([tmp_url, key])
        if not new_urls:
            print('No more urls.')
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None,
                             callback=self.parse)