def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    print(url)
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers, REGION_NAME))
    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'homepage': re.compile('http://www.sfda.gov.cn/WS01/(.*?)/$'),
            'detail': re.compile('http://www.sfda.gov.cn/WS01/(.*?)/(.*?).html')
        })
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)

    def enqueue_detail_links(dom):
        hrefs = dom.xpath('//td[@class="ListColumnClass15"]/a/@href')
        urls = []
        for href in hrefs:
            href = re.sub(u'\.\.', u'', href)  # hrefs are relative paths starting with ".."
            urls.append('http://www.sfda.gov.cn/WS01' + href)
        manager.put_urls_enqueue(batch_id, urls)

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        print(label)
        if label == 'homepage':
            content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, refresh=True)
            dom = lxml.html.fromstring(content)
            total_content = dom.xpath('//td[@class="pageTdSTR15"]//text()')[0]
            total_page = int(re.findall(u'共(\d+)页', total_content)[0])
            # Enqueue the links on the home page (page 1), then walk the remaining
            # list pages index_2.html .. index_<total_page>.html.
            enqueue_detail_links(dom)
            for page in range(2, total_page + 1):
                print(page)
                page_url = '{}index_{}.html'.format(url, page)
                content = process._downloader.downloader_wrapper(
                    page_url, batch_id, gap, timeout=timeout, refresh=True)
                dom = lxml.html.fromstring(content)
                enqueue_detail_links(dom)
            return True
        elif label == 'detail':
            return parse_page(url)
def process(url, batch_id, parameter, manager, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CacheS3(head + '-json-' + tail))
    if not hasattr(process, '_regs'):
        setattr(process, '_regs', re.compile(urlparse.urljoin(SITE, 'search\?word=(.+)')))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    word = urllib.unquote(process._regs.match(url).group(1))
    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    refresh = False
    for _ in range(5):
        try:
            content = process._downloader.downloader_wrapper(
                url, batch_id, gap, timeout=timeout, encoding='gb18030', refresh=refresh)
            if content == '':
                return False
            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url')
            result = parse_search_json_v0707(content, word)
            break
        except Exception:
            # Download or parsing failed: retry with a forced re-download.
            refresh = True
    else:
        # All five attempts failed; give up instead of referencing an unset result.
        return False
    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str, '/opt/service/log/').info('start post json')
    return process._cache.post(url, json.dumps(result))
def process(url, batch_id, parameter, manager, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper('s3', headers, REGION_NAME))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='gb18030')
    if content is False:
        return False
    if kwargs and kwargs.get("debug"):
        print(len(content), "\n", content[:1000])
    content_urls = []
    get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing')
    tree = lxml.html.fromstring(content)
    urls = tree.xpath('//td[@class="title"]/a/@href')
    if urls == []:
        # Empty result: retry once with a forced refresh in case the cached copy is stale.
        get_logger(batch_id, today_str, '/opt/service/log/').info('start download2')
        content = process._downloader.downloader_wrapper(
            url, batch_id, gap, timeout=timeout, encoding='gb18030', refresh=True)
        if content is False:
            return False
        tree = lxml.html.fromstring(content)
        urls = tree.xpath('//td[@class="title"]/a/@href')
    for url in urls:
        content_urls.append(urlparse.urljoin('http://data.eastmoney.com/', url))
    get_logger(batch_id, today_str, '/opt/service/log/').info('start put content')
    manager.put_urls_enqueue('dongcaigonggao-content-20160620', content_urls)
    return True
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))
    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'home_page': re.compile('http://www.zyctd.com/zixun/'),
            'list_page': re.compile('http://www.zyctd.com/zixun-(\d+).html')
        })
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        print(label)
        if label == 'home_page':
            url_pattern = 'http://www.zyctd.com/zixun-{}.html'
            content = process._downloader.downloader_wrapper(
                url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True)
            # The page count is not in the page elements; it only appears in an inline JS snippet.
            page_content = re.findall('var pageCount = parseInt\(\'(\d+)\'\)', content)
            if not page_content:
                return False
            page_num = int(page_content[0])
            # Enqueue every list page by page number (2 .. page_num).
            urls = [url_pattern.format(page) for page in range(2, page_num + 1)]
            manager.put_urls_enqueue(batch_id, urls)
            # The home page doubles as list page 1 and carries news entries, so parse it too.
            result_list = parse_list_page(content)
            return process._cache.post(url, json.dumps(result_list, ensure_ascii=False), refresh=True)
        elif label == 'list_page':
            content = process._downloader.downloader_wrapper(
                url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True)
            result_list = parse_list_page(content)
            return process._cache.post(url, json.dumps(result_list, ensure_ascii=False), refresh=True)
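# parse_list_page() is called in both branches above but is not defined in this section.
# A minimal, hypothetical sketch under the assumption that each zyctd.com list page exposes
# its articles as links whose title and href can be read with simple XPath selectors; the
# selector and the result keys below are illustrative, not the project's real ones.
import lxml.html
from datetime import datetime


def parse_list_page(content):
    """Parse one zyctd.com news list page into a list of result dicts (assumed shape)."""
    dom = lxml.html.fromstring(content)
    results = []
    for link in dom.xpath('//ul[@class="news-list"]/li/a'):  # assumed container selector
        results.append({
            'news_title': link.xpath('string(.)').strip(),
            'news_url': link.get('href'),
            'access_time': datetime.utcnow().isoformat(),
        })
    return results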
def process(url, batch_id, parameter, manager, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {"Host": domain_name}
        setattr(process, '_downloader', DownloadWrapper('s3', headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CacheS3(batch_id.split('-', 1)[0] + '-json'))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str, '/opt/service/log/').info('start download content')
    content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='gb18030')
    if content is False:
        return False
    if kwargs and kwargs.get("debug"):
        print(len(content), "\n", content[:1000])
    get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing content')
    # Only parse the fragment between the main container and the footer.
    begin = content.find('<div class="mainbox">')
    end = content.find('<div id="footer">', begin)
    tree = lxml.html.fromstring(content[begin:end])

    title = tree.xpath('//div[@class="content"]/h4/text()')
    title = title[0] if isinstance(title, list) and len(title) > 0 else None
    public_date = tree.xpath('//div[@class="content"]/h5/text()')
    public_date = public_date[0] if isinstance(public_date, list) and len(public_date) > 0 else None
    body = tree.xpath('//div[@class="content"]//pre/text()')
    body = body[0] if isinstance(body, list) and len(body) > 0 else None

    notice_content = json.dumps({'url': url, 'title': title, 'public_date': public_date, 'body': body})
    get_logger(batch_id, today_str, '/opt/service/log/').info('start post json')
    ret = process._cache.post(url, notice_content)
    return ret
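# The three take-the-first-match-or-None extractions above repeat the same pattern. If this
# parser is touched again, a small helper (the name is ours, not part of the original code)
# could carry it:
def first_or_none(nodes):
    """Return the first element of an XPath result list, or None if it is empty."""
    return nodes[0] if isinstance(nodes, list) and nodes else None

# e.g. title = first_or_none(tree.xpath('//div[@class="content"]/h4/text()'))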
def process(url, batch_id, parameter, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        setattr(process, '_downloader',
                DownloadWrapper(CACHE_SERVER, THE_CONFIG['crawl_http_headers']))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    content = process._downloader.downloader_wrapper(
        url, batch_id, gap,
        timeout=timeout,
        encoding=THE_CONFIG['crawl_result_content_encoding'],
        refresh=THE_CONFIG['crawl_refresh'])
    if content is False:
        return False
    return True
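# Every process() variant in this section unpacks the same colon-separated parameter string
# as method:gap:js:timeout:data. A hypothetical call site showing that format (the URL,
# batch_id and values are examples only; other variants additionally receive manager and
# other_batch_process_time):
if __name__ == '__main__':
    ok = process(
        'http://example.com/page.html',   # a URL this variant is configured to crawl
        'example-batch-20160620',         # batch_id (several variants split it on '-')
        'GET:5:0:30:')                    # method=GET, gap=5s, js=0, timeout=30s, empty data
    print(ok)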
class ZhidaoPrefetch(object): def __init__(self, config): print config, "-----" self.config = config self.counter = collections.Counter() self.cache = Cache(self.config["batch_ids"]["json"], self.config["cache_server"]) self.downloader = DownloadWrapper(self.config["cache_server"], self.config["http_headers"]) def is_debug(self): return self.config.get("debug", False) def zhidao_results(self, qids): q_jsons = [] for qid in qids: q_json = self.zhidao_question(qid) if q_json is False: continue q_json["list_answers"] = [] for rid in q_json["answer_ids"][:3]: a_json = self.zhidao_answer(qid, rid) if a_json is False: continue q_json["list_answers"].append(a_json) q_jsons.append(q_json) return q_jsons def zhidao_question(self, qid): question_url = "http://zhidao.baidu.com/question/{}.html".format(qid) if self.is_debug(): print question_url ret = self.downloader.downloader_wrapper( question_url, self.config["batch_ids"]["question"], self.config["crawler"]["gap"], timeout=self.config["crawler"]["timeout"], encoding=self.config["crawler"]["encoding"]) if ret is False: return False q_json = generate_question_json(qid, ret) if q_json is None or q_json == {}: return False success = self.cache.post(question_url, q_json) return q_json def zhidao_answer(self, qid, rid): answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}" "&rid={}&tag=timeliness".format(qid, rid)) #print self.config["crawler"] if self.is_debug(): print answer_url ret = self.downloader.downloader_wrapper( answer_url, self.config["batch_ids"]["answer"], self.config["crawler"]["gap"], timeout=self.config["crawler"]["timeout"], encoding=self.config["crawler"]["encoding"]) if ret is False: return False try: a_json = generate_answer_json(ret) except: return False success = self.cache.post(answer_url, a_json) return a_json def zhidao_search(self, query, page_number=None, start_result_index=0): if isinstance(query, unicode): query = query.encode("utf-8") if page_number is None or page_number == 0: query_url = "http://zhidao.baidu.com/search/?word={}".format( urllib.quote(query)) else: query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format( page_number * 10, urllib.quote(query)) if self.is_debug(): print query_url # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word) #print query #print query_url ret = self.downloader.downloader_wrapper( query_url, self.config["batch_ids"]["search"], self.config["crawler"]["gap"], timeout=self.config["crawler"]["timeout"], encoding=self.config["crawler"]["encoding"], refresh=False) # resp.headers: "content-type": "text/html;charset=UTF-8", # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/> if ret is False: return False else: return parse_search_json_v0615( ret, start_result_index=start_result_index) def run_query(self, query, max_page): self.counter["query"] += 1 qids_select = set() result_all = [] for page_number in range(max_page): print "==== page ", page_number, query self.counter["page"] += 1 result_local = self.zhidao_search(query, page_number, len(result_all)) #print json.dumps( result_local, ensure_ascii=False, indent=4, sort_keys=True) result_all.extend(result_local) self.counter["q_total"] += len(result_local) for item in result_local: item["query"] = query if type(query) != unicode: item["query"] = query.decode("utf-8") #print item if item["source"] == "recommend" or (item["cnt_like"] >= 3): self.counter["q_good"] += 1 qids_select.add(item["question_id"]) print item["source"], item["cnt_like"], item[ 
"cnt_answer"], item['question'], "<----", item[ 'answers'] else: print item["source"], item["cnt_like"], item[ "cnt_answer"], item['question'] print datetime.datetime.now().isoformat(), self.counter return result_all #qajson = self.zhidao_results(qids_select) #print json.dumps(qajson, ensure_ascii=False, indent=4) def run_query_entity(self): filename = getTheFile("seed_entity.human.txt") with codecs.open(filename) as f: for line in f: if line.startswith("#"): continue line = line.strip() if not line: continue self.run_query(line, 10) def run_query_batch(self, filename, limit): with codecs.open(filename) as f: for line in f: if line.startswith("#"): continue line = line.strip() if not line: continue self.run_query(line, limit) def run_gen_url_search_realtime(self, filename): lines = libfile.file2list(filename) visited = set() for line in sorted(lines): for query_parser in [0]: query_url, qword = zhidao_fetch.get_search_url_qword( line, query_parser=query_parser) if query_url in visited: continue visited.add(query_url) print qword, query_url print len(visited) filename_output = getLocalFile( os.path.basename(filename.replace("human.txt", "_urls.txt"))) libfile.lines2file(sorted(list(visited)), filename_output) def run_test_search_realtime(self, filename, limit): results = [] counter = collections.Counter() with codecs.open(filename) as f: for line in f: if line.startswith("#"): continue line = line.strip() if not line: continue ret = self.run_query(line, limit) counter["query"] += 1 for item in ret: #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True) results.append(item) for p in ["source", "result_index"]: counter["{}_{}".format(p, item[p])] += 1 for p in ["question", "answers"]: if p in item: if not isinstance(item[p], unicode): item[p] = item[p].decode("gb18030") filename_output = getLocalFile( os.path.basename(filename.replace("human.txt", "xls"))) libfile.writeExcel(results, [ "id", "source", "result_index", "cnt_like", "cnt_answer", "query", "question_id", "question", "answers" ], filename_output) #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output) print counter def run_get_best_search_realtime(self, filename): results = [] counter = collections.Counter() lines = libfile.file2list(filename) for query_parser in [0]: for line in sorted(lines): cnt_label = "query_{}".format(query_parser) if counter[cnt_label] % 10 == 0: print datetime.datetime.now().isoformat( ), counter[cnt_label], line counter[cnt_label] += 1 ret_one = search_zhidao_best(line, query_filter=0, query_parser=query_parser) if ret_one: item = ret_one["best_qapair"] print "=====>", line print "------", item["match_score"], item["question"] print item["answers"], "*******", item["answers_raw"][ len(item["answers"]):] for p in ["query"]: item[p] = ret_one[p] #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True) results.append(item) for p in ["source", "result_index"]: counter["{}_{}".format(p, item[p])] += 1 for p in ["question", "answers"]: if p in item: if not isinstance(item[p], unicode): item[p] = item[p].decode("gb18030") filename_output = getLocalFile( os.path.basename(filename.replace("human.txt", "xls"))) libfile.writeExcel(results, [ "id", "source", "result_index", "cnt_like", "cnt_answer", "query", "question_id", "question", "answers" ], filename_output) #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output) print counter
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr(process, '_regs', { 'main': re.compile(r'http://china.chemnet.com/hot-product/(\w|\d+).html'), 'prd': re.compile(r'http://china.chemnet.com/product/pclist--(.+?)--0.html'), 'comps': re.compile(r'http://china.chemnet.com/product/search.cgi') }) def safe_state(statement): return statement[0] if statement else '' def xpath_string(n): return "//*[@id=\"main\"]/div[1]/div[1]/table/tr[" + str(n) + "]/td[2]/text()" method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout= int(timeout) compspat = 'http://china.chemnet.com/product/search.cgi?skey={};use_cas=0;f=pclist;p={}' today_str = datetime.now().strftime('%Y%m%d') # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, # encoding='gb18030', refresh=True ) # print(content) if content == '': get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content') return False # if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue page = etree.HTML(content.replace('<sub>', '').replace('</sub>', '')) if label == 'main': # print("add chems") chems = page.xpath("//*[@id=\"main\"]/div[1]/div[2]/dl/dd/ul/li/p[2]/a/@href") # links for chems in main page chems = [ urlparse.urljoin(SITE, chem) for chem in chems] get_logger(batch_id, today_str, '/opt/service/log/').info('adding chems urls into queue') manager.put_urls_enqueue(batch_id, chems) return True elif label == 'prd': chem_uri = m.group(1) chem_name = page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[1]/td[2]/text()")[0] get_logger(batch_id, today_str, '/opt/service/log/').info(chem_name + " main page") comps = page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form/table/tr[1]/td[2]/a[1]") pagetext = page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/h6/div/text()[1]") # print(pagetext[0]) total = int(re.compile(r'共有(\d+)条记录').search(pagetext[0].encode('utf-8')).group(1)) total = total // 10 + 1 if total % 10 != 0 else total // 10 dic = { u'source': url, u'中文名称': page.xpath(xpath_string(1))[0] if page.xpath(xpath_string(1)) else '', u'英文名称': page.xpath(xpath_string(2))[0] if page.xpath(xpath_string(2)) else '', u'中文别名': page.xpath(xpath_string(3))[0] if page.xpath(xpath_string(3)) else '', u'CAS_RN': page.xpath(xpath_string(4))[0] if page.xpath(xpath_string(4)) else '', u'EINECS': page.xpath(xpath_string(5))[0] if page.xpath(xpath_string(5)) else '', u'分子式': page.xpath(xpath_string(6))[0] if page.xpath(xpath_string(6)) else '', u'分子量': page.xpath(xpath_string(7))[0] if page.xpath(xpath_string(7)) else '', u'危险品标志': page.xpath(xpath_string(8))[0].strip() if page.xpath(xpath_string(8)) else '', u'风险术语': page.xpath(xpath_string(9))[0].strip() if page.xpath(xpath_string(9)) else '', u'安全术语': page.xpath(xpath_string(10))[0].strip() if page.xpath(xpath_string(10)) else '', u'物化性质': 
page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[11]/td[2]/p/text()") if page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[11]/td[2]/p/text()") else [], u'用途': page.xpath(xpath_string(12))[0] if page.xpath(xpath_string(12)) else '', u'上游原料': page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[14]/td[2]/a/text()') if page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[14]/td[2]/a/text()') else [], u'下游产品': page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[15]/td[2]/a/text()') if page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[15]/td[2]/a/text()') else [], } data = json.dumps(dic, encoding='utf-8', ensure_ascii=False) new_urls = [] for t in range(total): new_url = compspat.format(chem_uri, str(t)) get_logger(batch_id, today_str, '/opt/service/log/').info("new url" + new_url) new_urls.append(new_url) manager.put_urls_enqueue(batch_id, new_urls) get_logger(batch_id, today_str, '/opt/service/log/').info('start posting prd page to cache') return process._cache.post(url, data) else: chem_name = page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[1]/td[2]/text()")[0] total = len(page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form")) # total num of suppliers dic = '' for i in range(1, total + 1): c = safe_state(page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form[{}]".format(str(i)))) if c is '': break comp = {} comp[u'source'] = url comp[u'chem_name'] = chem_name comp[u'name'] = safe_state(c.xpath(".//table/tr[1]/td[2]/a[1]/text()")) comp[u'tel'] = safe_state(c.xpath(".//table/tr[2]/td[2]/text()")) comp[u'fax'] = safe_state(c.xpath(".//table/tr[3]/td[2]/text()")) comp[u'website'] = safe_state(c.xpath(".//table/tr[4]/td[2]/a/text()")) dic += json.dumps(comp, encoding='utf-8', ensure_ascii=False) + '\n' dic = dic.strip() get_logger(batch_id, today_str, '/opt/service/log/').info('start posting companies to cache') return process._cache.post(url, dic)
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): # 药材的详情页涉及2个部分:价格历史history和边栏sidebar,以下的ytw/second/是价格历史的url,返回一个大的json; # 所以在最后处理的时候还要额外向另一个url发送一次请求,以获得边栏信息,由于要储存到同一个result.json中,因此不再放入队列,而是直接在process里完成 today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url)) if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'list_view': re.compile( 'http://www.yt1998.com/price/nowDayPriceQ\!getPriceList.do\?pageIndex=(\d+)&pageSize=(\d+)' ), 'detail_view': re.compile( 'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp\?ycnam=(.*)&guige=(.*)&chandi=(.*)&market=(.*)' ) }) if not hasattr(process, '_sellerMarket_list'): setattr(process, '_sellerMarket_list', ['', u'亳州市场', u'安国市场', u'玉林市场', u'成都市场']) # http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex=0&pageSize=500 if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue get_logger(batch_id, today_str, '/opt/service/log/').info('label : {}'.format(label)) if label == 'list_view': get_logger(batch_id, today_str, '/opt/service/log/').info(label) content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) get_logger(batch_id, today_str, '/opt/service/log/').info('download ok') get_logger(batch_id, today_str, '/opt/service/log/').info(len(content)) list_item = json.loads(content) urls = [] for detail_item in list_item[u'data']: detail_url_pattern = 'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp?ycnam={}&guige={}&chandi={}&market={}' ycnam = str(detail_item[u'ycnam']) chandi = str(detail_item[u'chandi']) market = str(detail_item[u'market']) guige = str(detail_item[u'guige']) detail_url = detail_url_pattern.format(urllib.quote(ycnam), urllib.quote(guige), urllib.quote(chandi), urllib.quote(market)) urls.append(detail_url) get_logger(batch_id, today_str, '/opt/service/log/').info('len urls') get_logger(batch_id, today_str, '/opt/service/log/').info(len(urls)) manager.put_urls_enqueue(batch_id, urls) total_num = int(list_item[u'total']) pageIndex = int(m.group(1)) pageSize = int(m.group(2)) if pageIndex == 0: print(total_num // pageSize) for index in range(1, total_num // pageSize + 1): get_logger(batch_id, today_str, '/opt/service/log/').info('iiiiiindex') get_logger(batch_id, today_str, '/opt/service/log/').info(index) list_pattern = 'http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex={}&pageSize={}' list_url = list_pattern.format(index, pageSize) manager.put_urls_enqueue(batch_id, [list_url]) return True elif label == 'detail_view': get_logger(batch_id, today_str, '/opt/service/log/').info(label) ycnam = urllib.unquote(m.group(1)) guige = urllib.unquote(m.group(2)) chandi = urllib.unquote(m.group(3)) market = urllib.unquote(m.group(4)) content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) get_logger(batch_id, today_str, '/opt/service/log/').info(len(content)) history_item = 
json.loads(content) get_logger(batch_id, today_str, '/opt/service/log/').info('downloaded') price_history = {} for raw_daily_data in history_item[u'DayPriceData']: date = raw_daily_data[u'Date_time'] price = raw_daily_data[u'DayCapilization'] price_history[date] = price source_url = 'http://www.yt1998.com/priceHistory.html?keywords={}&guige={}&chandi={}&market={}' get_logger(batch_id, today_str, '/opt/service/log/').info('source') get_logger(batch_id, today_str, '/opt/service/log/').info( len(process._sellerMarket_list)) result_item = { 'name': ycnam, 'productGrade': guige, 'productPlaceOfOrigin': chandi, 'sellerMarket': process._sellerMarket_list[int(market)], 'price_history': price_history, 'source': source_url.format(ycnam, guige, chandi, market), } print(result_item) result_item['access_time'] = datetime.utcnow().isoformat( ) # 从上面source的赋值可看出每个item都对应不同的参数 return process._cache.post(url, json.dumps(result_item, ensure_ascii=False), refresh=True)
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'main': re.compile( r'http://www.zysj.com.cn/zhongyaocai/index__\d+.html'), 'prd': re.compile( r'http://www.zysj.com.cn/zhongyaocai/yaocai_\w/(.+?).html') }) method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout) # print(content) if content == '': # print("no content") get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content') return False # content.encoding='gb18030' # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue page = etree.HTML(content) if label == 'main': get_logger(batch_id, today_str, '/opt/service/log/').info("adding Chinese Meds") meds = page.xpath("//*[@id=\"list\"]/ul/li/a/@href" ) # links for meds in main page meds = [urlparse.urljoin(SITE, med) for med in meds] # print(meds[:5]) get_logger(batch_id, today_str, '/opt/service/log/').info('adding Meds urls into queue') manager.put_urls_enqueue(batch_id, meds) return True elif label == 'prd': med_name = page.xpath("//*[@id=\"article\"]/h1/text()")[0] get_logger(batch_id, today_str, '/opt/service/log/').info(med_name + " main page") # print(med_name,"main page") book_list = [] dictionary = {} books = content.split('<hr />') # 用来分开不同的药典 if len(books) == 2: # 只有一个药典的情况 books = [books[0]] else: # 有多个药典的情况 books = books[1:-1] for book in books: page = etree.HTML( book.replace('<strong>', '').replace('</strong>', '').replace( '<sub>', '').replace('</sub>', '')) med_info = page.xpath("//p/text()") data = {} # data['source'] = url dictionary['source'] = url # data['access_time'] = datetime.utcnow().isoformat() dictionary['access_time'] = datetime.utcnow().isoformat() data_list = [] for info in med_info: m = re.compile(r'【.+?】').match(info.encode('utf-8')) if m: prop = m.group(0)[3:-3] cleaned = re.sub(r'【.+?】', '', info.encode('utf-8')) data[prop] = cleaned data_list.append({prop: cleaned}) else: data[prop] += '\n' + info.encode('utf-8') data_list[-1][prop] += '\n' + info.encode('utf-8') book_name = data['摘录'] # dics[book_name] = data book_list.append({book_name: data_list}) # 为了保持原书籍的顺序,使用列表结构 dictionary[data['药材名称']] = book_list dictionary = json.dumps(dictionary, encoding='utf-8', ensure_ascii=False) get_logger( batch_id, today_str, '/opt/service/log/').info('start posting prd page to cache') return process._cache.post(url, dictionary)
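# -*- coding: utf-8 -*-
# The pharmacopoeia parser above keys each property on the full-width 【...】 brackets and
# strips them by slicing three UTF-8 bytes off each end. A self-contained illustration of
# that step (the sample string is made up):
import re

info = u'【性味】苦,寒。'.encode('utf-8')
m = re.compile(r'【.+?】').match(info)
prop = m.group(0)[3:-3]              # '性味' -- each full-width bracket is 3 bytes in UTF-8
value = re.sub(r'【.+?】', '', info)  # '苦,寒。'
print(prop)
print(value)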
def process(url, batch_id, parameter, manager, *args, **kwargs): # 药材的详情页涉及2个部分:价格历史history和边栏sidebar,以下的ytw/second/是价格历史的url,返回一个大的json; # 所以在最后处理的时候还要额外向另一个url发送一次请求,以获得边栏信息,由于要储存到同一个result.json中,因此不再放入队列,而是直接在process里完成 today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url)) if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers, REGION_NAME)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'home': re.compile('http://www.yt1998.com/variteyIndexInfo.html'), 'kind': re.compile('http://www.yt1998.com/issueIndexInfo.html\?code='), 'history': re.compile( 'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp\?code=(\d+)&type=1&varitey_name=(.*)' ) #这是价格历史的url }) # if not hasattr(process, '_cache'): # head, tail = batch_id.split('-') # setattr(process, '_cache', CacheS3(head + '-json-' + tail)) if not hasattr(process, '_next_patterns'): setattr( process, '_next_patterns', { 'home': 'http://www.yt1998.com/issueIndexInfo.html?code={}', #the format of kind 'kind': 'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp?code={}&type=1&varitey_name={}', #the format of history 'history': 'http://www.yt1998.com/variteyIndexInfo.html?varitey_code={}' #the format of sidebar }) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue get_logger(batch_id, today_str, '/opt/service/log/').info('label : {}'.format(label)) if label in [ 'home', 'kind' ]: #I handle home-page and kind-page in one code block cuz they are in same web format content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) dom = lxml.html.fromstring(content) dd_labels = dom.xpath('//dd') urls = [] for single_dd in dd_labels: rels = single_dd.xpath('.//@rel') if not rels: get_logger(batch_id, today_str, '/opt/service/log/').info( 'wrong rels content : {}'.format(rels)) continue for rel in rels: code = rel.split( ',' )[-2] #在home页面,rel的格式为 'Z,家种类' code为Z;在kind页面,rel格式为'Z,家种类,000001,枸杞',code为000001 if label == 'home': urls.append(process._next_patterns[label].format(code)) else: # label == 'kind' name = str(rel.split(',')[-1]) urls.append(process._next_patterns[label].format( code, urllib.quote(name))) manager.put_urls_enqueue(batch_id, urls) elif label == 'history': #开始提取单种药品数据 #由于之前的设计,传进来的是历史价格的url,在更新的时候已经用不到,但是为了尽量一致,减少变动,采用传入 #历史记录url,再提取其中的参数组成边栏url,发送请求得到当日价格的逻辑 code = m.group(1) name = urllib.unquote(m.group(2)) sidebar_url = process._next_patterns[label].format(code) sidebar_content = process._downloader.downloader_wrapper( sidebar_url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) sidebar_dom = lxml.html.fromstring(sidebar_content) sidebar_label = sidebar_dom.xpath( '//div[@class="box-con-r fr"]/table//tr') if not isinstance(sidebar_label, list) or len(sidebar_label) != 19: get_logger(batch_id, today_str, '/opt/service/log/').info('not legal list!') return False for index in range(1, 16): line_content = sidebar_label[index].xpath( './td/text()') #line content格式为 权重比:0.0278、市净率:2.00... 
parts = line_content[0].split( ':') # chinese colon :left part as key,right part as value if parts[0] == u'当前价格': # print ('相等') today_price = parts[1] break result_item = {} result_item['today_price'] = today_price result_item['name'] = name result_item['url'] = sidebar_url return True # 之后更改为新的cache
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): # 药材的详情页涉及2个部分:价格历史history和边栏sidebar,以下的ytw/second/是价格历史的url,返回一个大的json; # 所以在最后处理的时候还要额外向另一个url发送一次请求,以获得边栏信息,由于要储存到同一个result.json中,因此不再放入队列,而是直接在process里完成 today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url)) if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'home': re.compile('http://www.yt1998.com/variteyIndexInfo.html'), 'kind': re.compile('http://www.yt1998.com/issueIndexInfo.html\?code='), 'history': re.compile( 'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp\?code=(\d+)&type=1&varitey_name=(.*)' ) #这是价格历史的url }) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_next_patterns'): setattr( process, '_next_patterns', { 'home': 'http://www.yt1998.com/issueIndexInfo.html?code={}', #the format of kind 'kind': 'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp?code={}&type=1&varitey_name={}', #the format of history 'history': 'http://www.yt1998.com/variteyIndexInfo.html?varitey_code={}' #the format of sidebar }) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue get_logger(batch_id, today_str, '/opt/service/log/').info('label : {}'.format(label)) if label in [ 'home', 'kind' ]: #I handle home-page and kind-page in one code block cuz they are in same web format content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) dom = lxml.html.fromstring(content) dd_labels = dom.xpath('//dd') urls = [] for single_dd in dd_labels: rels = single_dd.xpath('.//@rel') if not rels: get_logger(batch_id, today_str, '/opt/service/log/').info( 'wrong rels content : {}'.format(rels)) continue for rel in rels: code = rel.split( ',' )[-2] #在home页面,rel的格式为 'Z,家种类' code为Z;在kind页面,rel格式为'Z,家种类,000001,枸杞',code为000001 if label == 'home': urls.append(process._next_patterns[label].format(code)) else: # label == 'kind' name = str(rel.split(',')[-1]) urls.append(process._next_patterns[label].format( code, urllib.quote(name))) manager.put_urls_enqueue(batch_id, urls) elif label == 'history': #开始提取单种药品数据 code = m.group(1) name = urllib.unquote(m.group(2)) sidebar_url = process._next_patterns[label].format(code) sidebar_content = process._downloader.downloader_wrapper( sidebar_url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) sidebar_dom = lxml.html.fromstring(sidebar_content) sidebar_label = sidebar_dom.xpath( '//div[@class="box-con-r fr"]/table//tr') if not isinstance(sidebar_label, list) or len(sidebar_label) != 19: get_logger(batch_id, today_str, '/opt/service/log/').info('not legal list!') return False sidebar_item = {} # 边栏信息 for index in range(1, 16): line_content = sidebar_label[index].xpath( './td/text()') #line content格式为 权重比:0.0278、市净率:2.00... 
parts = line_content[0].split( ':') # chinese colon :left part as key,right part as value sidebar_item[parts[0]] = parts[1] line_content = sidebar_label[16].xpath( './th/text()') #最后更新时间的样式与其他不同,为th parts = line_content[0].split(':') sidebar_item[parts[0]] = parts[1] history_content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) if history_content == '': return False get_logger( batch_id, today_str, '/opt/service/log/').info('history downloading finished') history_item = json.loads(history_content)[ u'DayMonthData'] #从结果中提取每天数据 price_history = {} #价格历史 for raw_daily_data in history_item: date = raw_daily_data[u'Date_time'] price = raw_daily_data[u'DayCapilization'] price_history[date] = price result_item = {} result_item['name'] = name result_item['info'] = sidebar_item result_item['price_history'] = price_history result_item['source'] = sidebar_url return process._cache.post(url, json.dumps(result_item, ensure_ascii=False), refresh=True)
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): print(url) if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers, REGION_NAME)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'column_id': re.compile('(\d+)'), 'pages_view': re.compile( 'http://www.yt1998.com/ytw/second/marketMgr/query.jsp\?lmid=(\d+?)&(.*)' ) }) if not hasattr(process, '_cache'): setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue if label == 'column_id': query_url = 'http://www.yt1998.com/ytw/second/marketMgr/query.jsp' column_id = url page_size = 10 data = { 'lmid': column_id, # 栏目id,lm是栏目的首字母! 9代表产地信息,1代表品种分析,3代表天天行情 # 'scid':'1', # 对于天天行情,存在scid=市场id,但是尝试不传递这个参数,就会返回所有市场的新闻。且在返回值内依然可以找到市场字段,不会丢失信息。 'pageIndex': '0', 'pageSize': page_size, 'times': '1', # 非必要参数 } content = process._downloader.downloader_wrapper(query_url, batch_id, gap, method='post', data=data, timeout=timeout, encoding='utf-8', refresh=True) news_info = json.loads(content) total = int(news_info[u'total']) # 得出新闻总数,以此生成子任务 url_pattern = 'http://www.yt1998.com/ytw/second/marketMgr/query.jsp?lmid={}×=1&pageIndex={}&pageSize={}' urls = [] for index in range(0, total / page_size + 1): url = url_pattern.format(column_id, index, page_size) if not check_date_ok(url): break urls.append(url) manager.put_urls_enqueue(batch_id, urls) elif label == 'pages_view': content = process._downloader.downloader_wrapper(url, batch_id, gap, method='get', timeout=timeout, encoding='utf-8', refresh=True) item = json.loads(content) news_data = item[u'data'] menu_dic = { '1': u'品种分析', '3': u'天天行情', '9': u'产地信息', } result_list = [] detail_pattern = 'http://www.yt1998.com/hqMinute--{}.html' for news in news_data: result = { 'news_title': news[u'title'], 'news_url': detail_pattern.format(news[u'acid']), 'news_desc': news[u'cont'].strip(), 'news_date': news[u'dtm'], 'news_keyword_list': [news[u'ycnam']], # ycname = 药材nam = 药材name 取名逻辑很复杂 'access_time': datetime.datetime.utcnow().isoformat(), 'market': news[u'market'], } result['news_type'] = menu_dic[news[u'lmid']] if news[u'lmid'] == '3': # 天天行情为快讯,是短新闻,不用再去取正文 result['news_content'] = result['news_desc'] else: # 其它栏目进行正文爬取 result['news_content'] = get_news_content( result['news_url'], batch_id, gap, timeout) result_list.append(result) return process._cache.post(url, json.dumps(result_list, ensure_ascii=False), refresh=True)
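# get_news_content() is called above for the longer columns (品种分析, 产地信息) but is not
# defined in this section. A hedged sketch of one possible implementation that reuses the
# module's downloader; the body selector is an assumption, not the real one:
import lxml.html


def get_news_content(news_url, batch_id, gap, timeout):
    """Fetch a hqMinute detail page and return its article text (assumed selector)."""
    content = process._downloader.downloader_wrapper(
        news_url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True)
    if not content:
        return ''
    dom = lxml.html.fromstring(content)
    # Assumption: the article body sits in a single container on the detail page.
    return ''.join(dom.xpath('//div[@class="news-content"]//text()')).strip()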
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): home_page = 'http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=25&tableName=TABLE25&title=%B9%FA%B2%FA%D2%A9%C6%B7&bcId=124356560303886909015737447882' if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_reg'): setattr( process, '_reg', { 'detail': re.compile( 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id=(\d+)' ), }) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) today_str = datetime.now().strftime('%Y%m%d') if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start download') data = { 'tableId': '25', 'State': '1', 'bcId': '124356560303886909015737447882', 'State': '1', 'curstart': 1, #here! 'State': '1', 'tableName': 'TABLE25', 'State': '1', 'viewtitleName': 'COLUMN167', 'State': '1', 'viewsubTitleName': 'COLUMN166,COLUMN170,COLUMN821', 'State': '1', 'tableView': '%E5%9B%BD%E4%BA%A7%E8%8D%AF%E5%93%81', 'State': '1', } if url == home_page: if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') page = 1 while 1: data['curstart'] = page content = process._downloader.downloader_wrapper( 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp', batch_id, gap, method='post', timeout=timeout, refresh=True, data=data) # if page == 3: # return ids = re.findall(u'国产药品&Id=(\d+)', content) if not ids: break url_pattern = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id={}' urls = [] for drug_id in ids: url = url_pattern.format(drug_id) urls.append(url) manager.put_urls_enqueue(batch_id, urls) page += 1 if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info( 'going to page{}'.format(page)) return elif process._reg['detail'].match(url): content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout=timeout, ) if content == '': return False if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') dom = lxml.html.fromstring(content) table = dom.xpath('//tr') item = { 'license_number': table[1].xpath('./td')[1].xpath('./text()'), #[u'批准文号'], 'product_name_chs': table[2].xpath('./td')[1].xpath('./text()'), #[u'产品名称'], 'product_name_eng': table[3].xpath('./td')[1].xpath('./text()'), #[u'英文名称'], 'commodity_name_chs': table[4].xpath('./td')[1].xpath('./text()'), #[u'商品名'], 'drug_form': table[5].xpath('./td')[1].xpath('./text()'), #[u'剂型'], 'specification': table[6].xpath('./td')[1].xpath('./text()'), #[u'规格'], 'manufacturer_chs': table[7].xpath('./td')[1].xpath('./text()'), #[u'生产单位'], 'manuf_address_chs': table[8].xpath('./td')[1].xpath('./text()'), #[u'生产地址'], 'category': table[9].xpath('./td')[1].xpath('./text()'), #[u'产品类别'], 'license_data': table[11].xpath('./td')[1].xpath('./text()'), #[u'批准日期'], 'standard_code': table[12].xpath('./td')[1].xpath('./text()'), #[u'药品本位码'], 'standard_code_remark': table[13].xpath('./td')[1].xpath('./text()'), #[u'药品本位码备注'], 'source': [url], } for k, v in 
item.iteritems(): if len(v) > 0: item[k] = v[0] else: item[k] = None return process._cache.post(url, json.dumps(item, ensure_ascii=False))
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr(process, '_regs', { 'main': re.compile(r'http://www.kmzyw.com.cn/bzjsp/biz_price_search/price_index_search.jsp'), 'prd': re.compile(r'http://www.kmzyw.com.cn/bzjsp/Biz_price_history/price_history_search.jsp\?name=(.*?)') }) def timestamp2datetime(timestamp): if isinstance(timestamp, (int, long, float)): dt = datetime.utcfromtimestamp(timestamp) else: return "Not a valid timestamp" mid = '-0' if dt.month < 10 else '-' return str(dt.year) + mid + str(dt.month) post_form = { 'pagecode': None, # 'search_site': '%25E4%25BA%25B3%25E5%25B7%259E', } method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue if label == 'main': total_page = 10 # 初始化为一个较小的数,之后在获取页面内容后会更新此总页数 page_num = 1 while page_num < total_page + 1: post_form['pagecode'] = page_num # print(page_num) content = process._downloader.downloader_wrapper(url, batch_id, gap, method='post', data=post_form, timeout=timeout, refresh=True ) # print(content) data = json.loads(content) total_page = data['page'] # 从json中读出总页数 drugs = [] drug_url = 'http://www.kmzyw.com.cn/bzjsp/Biz_price_history/price_history_search.jsp?name={}' for row in data['rows']: # print(row['drug_name']) drugs.append(drug_url.format(urllib.quote(str(row['drug_name'])).replace('%', '%25'))) manager.put_urls_enqueue(batch_id, drugs) page_num += 1 return True elif label == 'prd': content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout=timeout, refresh=True ) page = etree.HTML(content) prd = page.xpath("/html/body/section[2]/h1/text()")[0] idx = prd.index(u'品种') prd = prd[:idx] get_logger(batch_id, today_str, '/opt/service/log/').info(prd + " main page") price_hist = page.xpath("/html/head/script[12]/text()")[0] # print(price_hist) data_pat = re.compile(r'series : \[(.*),marker') m = data_pat.findall(price_hist) dics = '' if m: # print(m[0]) data = m[0].split(',marker : { enabled : false ,radius : 3 } ,tooltip : { valueDecimals : 2 }},') for d in data: name = 'name' data = 'data' dic = eval(d + '}') # print(dic) cleaned = {} cleaned['source'] = url cleaned['specs'] = dic['name'] cleaned['name'] = prd cleaned['data'] = [ (timestamp2datetime(int(price[0]) // 1000), price[1]) for price in dic['data'] ] cleaned['access_time'] = datetime.utcnow().isoformat() dics += json.dumps(cleaned, encoding='utf-8') + '\n' else: get_logger(batch_id, today_str, '/opt/service/log/').info('not match') get_logger(batch_id, today_str, '/opt/service/log/').info('start posting prd page to cache') return process._cache.post(url, dics)
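# The prd branch above parses the Highcharts series with eval(), which only works because the
# local variables name and data happen to hold the strings the bare JS keys resolve to. A
# hedged alternative is to quote the bare keys and parse with json.loads; the fragment below
# is made up but mirrors the "series" items split out above:
import json
import re

js_fragment = "{ name : 'spec-A', data : [[1470000000000, 12.5], [1470086400000, 12.8]]"
as_json = re.sub(r'(\w+)\s*:', r'"\1":', js_fragment + '}').replace("'", '"')
dic = json.loads(as_json)
print('%s: %d points' % (dic['name'], len(dic['data'])))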
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url))
    if not hasattr(process, '_downloader'):
        headers = {}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'first_letter': re.compile('^[A-Z]$'),
            'drug': re.compile('(\d+)')
        })
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))
    method, gap, js, timeout, data = parameter.split(':')
    gap = float(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        if label == 'first_letter':
            first_letter = url
            data = {
                # Example of the expected form data: Data:{"letter":"J","url":""}
                'Data': '{{\"url\": \"\", \"letter\": \"{}\"}}'.format(first_letter)
            }
            query_url = 'http://yaocai.zyctd.com/Ajax/AjaxHandle.ashx?CommandName=common/MCodexService/GetCodexNameByLetter'
            content = process._downloader.downloader_wrapper(
                query_url, batch_id, gap, method='post', data=data, timeout=timeout, refresh=True)
            if not content:
                return False
            drug_list = json.loads(content)
            MBID_list = []
            for drug in drug_list[u'Data']:
                MBID_list.append(str(drug[u'MBID']))
            # Each MBID is a bare number; it is enqueued as the next-stage "url".
            manager.put_urls_enqueue(batch_id, MBID_list)
        elif label == 'drug':
            mbid = url
            query_url = 'http://www.zyctd.com/Breeds/GetMCodexPoolListByMBID'
            data = {'mbid': '{}'.format(mbid), 'IsMarket': 'true'}
            content = process._downloader.downloader_wrapper(
                query_url, batch_id, gap, method='post', data=data, timeout=timeout, refresh=True)
            if not content:
                return False
            item = json.loads(content)
            sub_drug_list = item[u'Data']
            if not sub_drug_list:
                # The request succeeded but the list is empty: this herb simply has no
                # price data, which is a normal case.
                return True
            for sub_drug in sub_drug_list:
                # e.g. one herb splits into several sub_drugs, one per grade/origin
                # combination, each with its own MBSID.
                price_history_url = 'http://www.zyctd.com/Breeds/GetPriceTrend'
                data = {'MBSID': sub_drug['MBSID'], 'IsMarket': 'true'}
                price_content = process._downloader.downloader_wrapper(
                    price_history_url, batch_id, gap, method='post', data=data, timeout=timeout, refresh=True)
                if not price_content:
                    return False
                spec_info = sub_drug['MSpec'].split(' ')
                productGrade = spec_info[0]
                if len(spec_info) == 2:
                    # Typical MSpec values look like "大统 东北" or "统 较广".
                    productPlaceOfOrigin = spec_info[1]
                else:
                    # Special case: MSpec is just a grade, e.g. "统".
                    productPlaceOfOrigin = ''
                price_item = json.loads(price_content)[u'Data']
                # PriceChartData is itself a JSON string: loading it yields a list with one
                # entry per herbal market, each nesting a price list whose times are timestamps.
                price_data = price_item[u'PriceChartData']
                if price_data == '[]':
                    # Even when this grade exists there may be no price history.
                    return True
                formatted_price_data = deal_with_price(price_data)
                result_item = {
                    'name': sub_drug['MName'],
                    'productGrade': productGrade,
                    'productPlaceOfOrigin': productPlaceOfOrigin,
                    'source': 'http://www.zyctd.com/jiage/xq{}.html'.format(mbid),
                    'access_time': datetime.utcnow().isoformat(),
                    'price_data': formatted_price_data
                }
                if not process._cache.post(str(sub_drug['MBSID']),
                                           json.dumps(result_item, ensure_ascii=False),
                                           refresh=True):
                    return False
            return True
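# deal_with_price() is referenced above but not defined here. Based on the comment that
# PriceChartData is a JSON string holding one entry per market, each nesting a price list
# keyed by epoch timestamps, a hedged sketch (the inner field names are assumptions):
import json
from datetime import datetime


def deal_with_price(price_data):
    """Turn the PriceChartData JSON string into {market_name: {iso_date: price}} (assumed shape)."""
    formatted = {}
    for market in json.loads(price_data):
        history = {}
        # Assumed shape: each market carries a list of [timestamp_ms, price] pairs.
        for ts_ms, price in market.get('data', []):
            day = datetime.utcfromtimestamp(ts_ms / 1000.0).strftime('%Y-%m-%d')
            history[day] = price
        formatted[market.get('name', '')] = history
    return formatted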
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url)) if not hasattr(process, '_downloader'): headers = { 'Cookie': 'AJSTAT_ok_times=1; ant_stream_5762b612883d9=1470748235/1519574204; ASP.NET_SessionId=rpdjsrnmq3ybp0f4cnbdewm1; __utmt=1; bow_stream_5762b612883d9=13; __utma=240343830.1666180114.1470705813.1470719553.1470752966.3; __utmb=240343830.6.10.1470752966; __utmc=240343830; __utmz=240343830.1470705813.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 'Host': 'datacenter.cngrain.com', } setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'home': re.compile('http://datacenter.cngrain.com/NewPrice.aspx'), 'market': re.compile( 'http://datacenter.cngrain.com/PriceMainMark.aspx\?MarketId=(.*)' ) }) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_pattern'): setattr( process, '_pattern', { 'market': 'http://datacenter.cngrain.com/PriceMainMark.aspx', 'history': 'http://datacenter.cngrain.com/amline/PriceMarkXml.aspx{}' }) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue if label == 'home': data = { #'__EVENTTARGET':, #最后两位表示页数,其值不能超过11 '__EVENTARGUMENT': '', '__ZIPSTATE': 'H4sIAAAAAAAEAOy9B2AcSZYlJi9tynt/SvVK1+B0oQiAYBMk2JBAEOzBiM3mkuwdaUcjKasqgcplVmVdZhZAzO2dvPfee++999577733ujudTif33/8/XGZkAWz2zkrayZ4hgKrIHz9+fB8/Iv7vX/M3/e1+7V/34bQtd3Z+t5Nq2ebL9mWZTfNvV+Usr3d/t+9+dfL7H9dtMS3z50XTHi+z8ropmt3f7fOffJq12W96Mft1dze/fbL8vM6K5dnyvLJvPf7k1/iNfs3f4Nf6NW982eu6//Jv+uv+2IPd+3s7Bwc7u7/p7Lf5tc7xz6/1a9O/vx79S81+7Nf8NX7T3+bX+QW/AV77vfLr5rf5NX7Bb/j7/65nbb44qdbL9tf6DWe/zW8jL/2avyka/jpv8nftr/vr/Rd/y9/6n//xf+Yv+A1fZJfFRdbmX9Xlr/s73D1bzvJ3L+vqslhO83HWrN79Hi/PZp/t7sxm9LoF8rv8Wr/ur/ef/81/83/2d/15v8uvPfzW3i699WuFb/0Xf/Nf+F/9sX/iprd279Nbv3anr7/hj/7P/4G/n9767Qffopd+nU5Xf/ef8Z//cX/mxq726K1ft/MW0YXfGuzqIb3064Uv/Vf/wN//n/8Nf/DGrh7QW79+Z1R/0h/9X/wFmxHcp7d+g85bf+6f+5//nX/+RrqDgj/WJ8Yfv7mve/RWErz1Y//13/sn/9f/wJ9Nc7bxxV2m4m/62/xav+DX/8miKSZlPgeUXx8f0Yu/8d3n1QXJxhm/osT7rbiTX4s6+R3/8z/pD/vP/9Q/+L/8K//4/+LP+4P+iz/jb/ov/pi//D/7e//O/+zv+oP+q7/yj/I4wXL67/Jr/Da/xu/ya/5av8vstzny2NpM3Q1j/BEf/4iPN/Lxb9jp7m+/6a17YKnfKHzrv/zT/qz/6g/5QzfRHvT4jTv0+OP+zP/s7/nrNr2EWf5Nwpf+6z/yj//P/54/YiN+GNVv2sHvr/gT//O/9+/Z1BVo+Jt18Pu7/0GRk8GXMF2/ee+l/+ov/wc3vfQpvfRbdLjwb/vb//O/8y/Y9BKY8LfsCyT3NDy/6Oq36gn/DW/tgeq/dYfqf/afftNbIOBvE771n/1df8x/8bf/nRvfAlv8th0M/7K/+r/42/7yjW9hXL9dh4R/+T/4X/2ZG9XTHmj423cw/Hv+5BuEaw9K43foUOPP/1NuGhdk5HfsqsI/+D//yzYrUPT1O4WC/J//EX/4f/Wn/Nn/+Z/4l2188YBe/J27gvx3ytAGeQov/YLOS3/G3/Rf/hkbpWsPb7Ht+11+nTlrRjFqv3XUqLE6u6HBbwhL9xvC0v0Gaul+q1/wYy+zi1y8uf1f8Fv//r8rXL3X1bqe5s7N++N+g5k4h1tsqn6r0Db+rkS4//rv/cP+8z/hH/zP/4g/7z/7e/5K6vq/+tv++v/6T/uD/vM/6Y/6z/7+P++//Mv+ARrlL7z7si6m+RfkyH6R1W9llPgtb2mo/8Wf/1f9F3/2n0pqdAYTKqP4bQnmf/kn/NH/5d/8t6bBM4O9lCbE9n/0f/n3/JUz2EL55NfffbCzM94BTzhL1/ni19WPfuy/+NP+DkJUSPPr6Ye/zt4OG1dD7B+zg5vByihq9sMeamYOEoLz6fbOwfbOQ0bvh0W13/y//LP//v/8b/oT/+u/56/Y+i/+7L/hv/6D/pw7Ab3+nj+uQ6+9nf04vfZ2HtyWXtAR3yy9fp0+vX478cD+8z/8jya9SpD+y7/lb/iv/9K/8D//m/8gIlO6kUzl/Kc9Av3Yf/G3/N3/1Z/45/2Xf/Pfv5mR9vcGGMl98V6E+Y18/D3aBOO6FXF+3Z894vz2wjr/5d/yR4cTFUjdp//lH/13/+d/1N9BXjSRrNtu+Am47uGnA1xnv3DEpd7+xD/5h0XcX+9nj7i/w3/xZ/85/
+Xf/Pf9l3/V3/tf/U1/Z482G1hx79MBVtz7dO+2rOjrtG+OWr/+zx61flul1gb1v0+M+F/+9T1W3fz4hL330NCvQ1j3xc8RG/4GP4uEFe1HarBHGEtYip/eS7bx+IT99OEAx3560FeeP1TC/tjPHmFvML19sd45GDK95osbxfpnh0hJn0i/haQy/vN/4O+k8Om//Cv+7v/8L/wTiDy/+0byXGXLi+tieeHR6Nf7L/7OP4isi0eY30hoZT935NkdEE73hSXPf/5H/j3/+Z/wh2wmz68nuHuE+W3kk5CNN5DlN/xZI8utfN2e77Z7MGBF3Rc3MpBvFz6cQr9Rn0K/QLjvv/4j/+T/8m/5U0g2/rO/68/5L/68P/q//rP/pP/8D//r/vM/8e8iav34Rmpd/zTly6dxMfvP/r4/67/62/4qX8z2/+s/9E+mb+59+mkX5w1PwHX7Q7bWfnEjTWNC+ff+HRRpR4SSP+8gE6ftb/xzTdvkP/+7/qj/4i/6B3b2tpH28Sg2FEHYL36OKPab/FxT7Deg4Ju+4STK/wfo9Zv+cOkVNZJ/UNdI3huKFOwXP0fU+s361Ppd//O/7y/6z/+Gv+M//xP/0v/8j/iDKOdKngS5WeLg/Od/1x/yn/95yIt+tJFgP/jpSbny6PV1LcMAj0UsA2zn3/On3pQFMQPzSPbb2g97qMUp9pv/cCgGe0yQNgUO/8Vf9EcSq4Xf3/T49L2/M2Al3Bc3+reB5RWcP4S4v8UPhbi/+X/+l/2V/9Xf/EdswUH+U/9g0gY3pJf2H94fyKLYL37YhPot+4T6LWSV6b/88/7g//pP/wf/iz/rLyfiEGV+t82UKd/Nd/fuebT5rf+rP+Zv/i//tL/wP/t7/+AOHsPq7TfY3R/gJP8bS6H/+s/9O/7zP+1vuMmJ47F4FPptZFAheQYIdDCTZYyfFQLdRjIR0n9Q5Dkkme6LHzI5f+ufNXL+l3/PX0KmlMSxh8n/xzgOru2vQ4jITD08eDD69OEBcOYPfu39+ztmmZHXvM/tP78Wr93/5r82VjR+wW94XJbVFS1nUGw1/wW/AdY1Xhc/yH+tX/MX/IayrPGEFjNmF7Tq8RvMfs3f7Lf5NfDfb/jb/Dq/4Dc4aZqTMmuaX/c3yuq2mJb5718WTfsLfp3f/3d9/eTX+rWkK5pBYMMLMVjk//XcjDIN9ncf3jvY0S/dTH8Cn+nv/Tv/iz/9b/vP/v4/9b/44/7g//Kv+ltTuBl/3J/5X/xdf6PYdfr6v/o7/tj//O/7m4kF7sta0LGg8ZNFfiVc8AYrT7+Q/pFufuExL3ztPKT/7u3AG/+1gl5B3Xvbu3vbe/cE419rCOPdPsb/5T/wJxBL/dd/3h/5X/3xf/F/9Tf9oYR6+p//uX8/ZfCROP/D/pD/6m//m/7zP+Yf/C//jL/lZozvWYx33xvjX3sA4wcHm2n8X/69fxtorJqGCUwy8p//3X/4f/HH/HX/+R//R96Wxg8Obo3x7qeC8a8zhPHDHsZjIigt+RCZdcnnL//7/6u//69LhcBimckR/S//3r9eZuG2ZH7w8L2R/nWHkP60T+aApn/uX/Bf/Dl/Rkq5bPwhGAuZ/9a/5D//Y/6qW5P501tjvKOM8esNYfygjzGR+b/+s/9wIjOxB1bI/46/KmBlYpT/7O/66ymeuZkxHI0fvDfGv/4Qxvt9jIWV/+u/9s/6z/7+P54Y+r/8k/6IgJXB5n/R3/ef/X3QI7em8f4tMd7d3j0QjH+DIYzv9zAek7qg9eX//E/+Y/7zP+qv/s//sj9H/MOAzL4quTWZ77830mJa+Fe4eKS1jWn4dX6XH+tp/d8lCbX97/IbDmn6Xye03R/9F3/eX/ef/91/23/+9/wV5AP/l3/aX0VjJFH+r//IP3Hv/t7Bf/Z3/ZH/+Z8E7n88b9vVo7t3r6v1D+bFeLq8qMmsj6fV4u68XZR3dfTNXdhBSvc+vLu7v3//3qdjfBsQObSWD5UOHaQ+Pvgv/rw/anfvv/gz/3IQ/m+gtbm/7b/4M/+O//yv/7P+8z/kr5IZ+S//2r9gPB5/bcz2Ht4Gs1+7j9nWA8Lsv/ij/mTgxYj8p3/QH2wJ91/9g38ugpM/8Y/7z/+Sv+Dro3b/Nqj9On3U7vznf9zf85//hX/hf/UX/3H/xR/6h/+Pf9+fAxo+BAn/7n/wv/rL/8H//M/7m+h7cuk/hGwPdm+D26/7c4Pbzm1w+/XeCzfoqm8Ct09vxW6//s8Nbge3we03eC/cvqk5/fTBbXDzNOVvLb/9Nps861/vd/n157/Lb/Br/Zp44wYNOvt/AgAA//+cf8MQDy8AAA==', '__VIEWSTATE': '', #'__EVENTVALIDATION':, 'ctl00$txtTitle': '', } content = process._downloader.downloader_wrapper( url, batch_id, gap, method='get', timeout=timeout, refresh=True, encoding='utf-8' ) #第一页使用get,得到ZIPSTATE参数,之后在循环内持续利用__EVENTTARGET参数翻页同时更新ZIPSTATE参数 if content == '': return False while 1: dom = lxml.html.fromstring(content) #开始解析当前页 market_suffixes = dom.xpath( '//a[contains(@href,"MarketId")]/@href') if not market_suffixes: get_logger(batch_id, today_str, '/opt/service/log/').info('No market_suffixes') get_logger(batch_id, today_str, '/opt/service/log/').info(content) return False market_suffixes_set = set( market_suffixes) #去重,但是对于市场名重复延续到下一页的情况无效,会重复多爬一次 market_url_list = [ urlparse.urljoin(process._pattern['market'], suffix) for suffix in market_suffixes_set ] manager.put_urls_enqueue( batch_id, market_url_list) #完成本页的处理,将市场名入队,接下去的操作全是为了翻页 page_label = dom.xpath('//td[@colspan="10"]//span')[ 0] #在所有页数里,只有当前页的标签是span,定位到当前页 page = page_label.xpath('.//text()') if page == ['40']: return True next_sibling_list = ( page_label.xpath('./following-sibling::a') ) #定位下一页,下一页不存在时则结束,(即使是网页上的...按钮,在此种判断里也会算存在下一页) if not next_sibling_list: return True next_sibling = 
next_sibling_list[0] next_raw_js = next_sibling.xpath( './@href' )[0] # 其形式为 : "javascript:__doPostBack('ctl00$ContentPlaceHolder1$DG_FullLatestPrice$ctl24$ctl01','')" eventtarget = re.findall('\(\'(.*)\',', next_raw_js)[0] data['__EVENTTARGET'] = eventtarget last = data[ '__ZIPSTATE'] #用来储存上一次的ZIPSTATE参数,如果新参数失败就换旧的使用——实践过程中发现某页的ZIPSTATE有小概率对下一页失效 data['__ZIPSTATE'] = ( dom.xpath('//input[@name="__ZIPSTATE"]/@value'))[0] data['__EVENTVALIDATION'] = (dom.xpath( '//input[@name="__EVENTVALIDATION"]/@value'))[0] #更新参数 for _ in range(0, 3): #开始对下一页发请求,绝大多数失败都发生在这一步,慎重 try: content = process._downloader.downloader_wrapper( url, batch_id, gap, method='post', timeout=timeout, refresh=True, data=data, encoding='utf-8') if content == '' or 'sent a request that this server could not understand' in content or 'bad request' in content: get_logger( batch_id, today_str, '/opt/service/log/').info('change ZIPSTATE') get_logger( batch_id, today_str, '/opt/service/log/').info('change ZIPSTATE') data[ '__ZIPSTATE'] = last #使用上一次的参数,不考虑连续两次失败,实际调试中也没遇到过 continue except Exception, e: get_logger(batch_id, today_str, '/opt/service/log/').info(e) continue break else: get_logger(batch_id, today_str, '/opt/service/log/').info('failed 3 times') return False elif label == 'market': #开始处理市场页,同时在此处理价格历史,加入到产品信息生成结果 get_logger(batch_id, today_str, '/opt/service/log/').info('in market page') market_id = url[url.find('=') + 1:] url = url.replace(market_id, urllib.quote(market_id)) content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout=timeout, refresh=True, encoding='utf-8', redirect_check=False) dom = lxml.html.fromstring(content) title = dom.xpath('//a[@class="this_tab"]//text()') if title: title = title[0] result = {} result['market'] = title.strip() result['product_list'] = [] table_node = dom.xpath('//table[@class="data_table"]')[0] #得到产品表格 products_nodes = table_node.xpath('.//tr')[1:-1] #去掉表头和尾巴 newest_time = None for product_node in products_nodes: #市场页会有相同产品在不同时间的批次,以此为根据去重 report_time = product_node.xpath('./td[9]/text()') if not newest_time: newest_time = report_time if newest_time != report_time: break relative_path = product_node.xpath('./td[10]/a/@href')[0] history_url = process._pattern['history'].format(relative_path) get_logger(batch_id, today_str, '/opt/service/log/').info( 'The history_url is :{}'.format(history_url)) content = process._downloader.downloader_wrapper( history_url, batch_id, gap, timeout=timeout, refresh=True, encoding='utf-8') if content: #有的价格历史会显示‘数据还在完善中‘ dom_history = lxml.html.fromstring(content) date_list = dom_history.xpath('//series//value/text()') price_list = dom_history.xpath('//graph//value/text()') history_dic = dict(zip(date_list, price_list)) else: history_dic = {} product_item = { 'variety': product_node.xpath('./td[1]/text()')[0].strip(), 'level': product_node.xpath('./td[2]/text()')[0].strip(), 'price_type': product_node.xpath('./td[5]/text()')[0].strip(), 'produce_year': product_node.xpath('./td[6]/text()')[0].strip(), 'produce_area': product_node.xpath('./td[7]/text()')[0].strip(), 'deliver_area': product_node.xpath('./td[8]/text()')[0].strip(), 'source': 'http://datacenter.cngrain.com{}'.format(relative_path), 'access_time': datetime.utcnow().isoformat(), 'price_history': history_dic, } result['product_list'].append(product_item) result['market_source'] = url # print (json.dumps(result, ensure_ascii = False)) return process._cache.post(url, json.dumps(result, ensure_ascii=False))
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): home_page = 'http://qy1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=23&tableName=TABLE23&title=GMP%C8%CF%D6%A4&bcId=118715589530474392063703010776' if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_reg'): setattr( process, '_reg', { 'detail': re.compile( 'http://qy1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=23&tableName=TABLE23&tableView=GMP%C8%CF%D6%A4&Id=(\d+)' ), }) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) gap = max(gap - other_batch_process_time, 0) today_str = datetime.now().strftime('%Y%m%d') data = { 'tableId': '23', 'State': '1', 'bcId': '118715589530474392063703010776', 'State': '1', 'curstart': '4', 'State': '1', 'tableName': 'TABLE23', 'State': '1', 'viewtitleName': 'COLUMN152', 'State': '1', 'viewsubTitleName': 'COLUMN151', 'State': '1', 'tableView': 'GMP%E8%AE%A4%E8%AF%81', 'State': '1', } if url == home_page: page = 1 while 1: data['curstart'] = page content = process._downloader.downloader_wrapper( 'http://qy1.sfda.gov.cn/datasearch/face3/search.jsp', batch_id, gap, method='post', timeout=timeout, refresh=True, data=data) ids = re.findall(u'GMP认证&Id=(\d+)', content) if not ids: break url_pattern = 'http://qy1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=23&tableName=TABLE23&tableView=GMP%C8%CF%D6%A4&Id={}' urls = [] for drug_id in ids: url = url_pattern.format(drug_id) urls.append(url) manager.put_urls_enqueue(batch_id, urls) page += 1 return True elif process._reg['detail'].match(url): content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout=timeout, ) if content == '': return False dom = lxml.html.fromstring(content) table = dom.xpath('//tr') item = {'source': url, 'access_time': datetime.utcnow().isoformat()} tr_labels = dom.xpath('//tr') for tr_label in tr_labels[1:]: key = ''.join(tr_label.xpath('.//td[1]//text()')).strip() value = ''.join(tr_label.xpath('.//td[2]//text()')).strip() if value and key != u'注': item[key] = value return process._cache.post(url, json.dumps(item, ensure_ascii=False))
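# --- Example: the curstart-driven paging against search.jsp, in isolation ---
# Hedged sketch with plain `requests`; `iter_detail_ids` and its parameters are
# illustrative names. It mirrors the while-loop above: POST the same form with an
# increasing 'curstart' until the id regex stops matching, yielding detail ids.
import re
import requests

def iter_detail_ids(search_url, form_data, id_reg):
    page = 1
    while True:
        form_data['curstart'] = page
        html = requests.post(search_url, data=form_data, timeout=15).text
        ids = re.findall(id_reg, html)
        if not ids:
            break
        for one_id in ids:
            yield one_id
        page += 1

# e.g. iter_detail_ids('http://qy1.sfda.gov.cn/datasearch/face3/search.jsp',
#                      data, u'GMP认证&Id=(\d+)')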
def process(url, batch_id, parameter, manager, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers, REGION_NAME)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CacheS3(head + '-json-' + tail)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'entity': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entity\?mention=(.+)')), 'avp': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entityAVP\?entity=(.+)')), 'info': re.compile( urlparse.urljoin( SITE, 'cndbpedia/api/entityInformation\?entity=(.+)')), 'tags': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entityTag\?entity=(.+)')), }) content = process._downloader.downloader_wrapper(url, batch_id, 0, timeout=10, encoding='utf-8') if content == '': return False for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue entity = urllib.unquote(m.group(1)) if label == 'entity': urls = [] avpair_api = urlparse.urljoin(SITE, 'cndbpedia/api/entityAVP?entity={}') info_api = urlparse.urljoin( SITE, 'cndbpedia/api/entityInformation?entity={}') tags_api = urlparse.urljoin(SITE, 'cndbpedia/api/entityTag?entity={}') js = json.loads(content) for ent in js[u'entity']: if isinstance(ent, unicode): ent = ent.encode('utf-8') ent = urllib.quote(ent) urls.append(avpair_api.format(ent)) urls.append(info_api.format(ent)) urls.append(tags_api.format(ent)) return urls else: data = json.dumps({entity: json.loads(content)}) return process._cache.post(url, data)
class Scheduler(object): def __init__(self, cacheserver): self.cache = Cache(BATCH_ID['json'], cacheserver) self.downloader = DownloadWrapper(cacheserver, {'Host': 'zhidao.baidu.com'}) @classmethod def instance(cls, *args): if not hasattr(cls, '_instance'): setattr(cls, '_instance', cls(*args)) return cls._instance def zhidao_results(self, qids, gap, timeout=10): q_jsons = [] for qid in qids: q_json = self.zhidao_question(qid, gap, timeout) if q_json is False: continue q_json['list_answers'] = [] for rid in q_json['answer_ids'][:3]: a_json = self.zhidao_answer(qid, rid, gap, timeout) if a_json is False: continue q_json['list_answers'].append(a_json) q_jsons.append(q_json) return q_jsons def zhidao_question(self, qid, gap, timeout): question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid) ret = self.downloader.downloader_wrapper(question_url, BATCH_ID['question'], gap, timeout=timeout, encoding='gb18030', error_check=True) if ret is False: return False q_json = generate_question_json(qid, ret) if q_json is None: return False success = self.cache.post(question_url, q_json) return q_json def zhidao_answer(self, qid, rid, gap, timeout): answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}' '&rid={}&tag=timeliness'.format(qid, rid)) ret = self.downloader.downloader_wrapper(answer_url, BATCH_ID['answer'], gap, timeout=timeout, encoding='gb18030') if ret is False: return False try: a_json = generate_answer_json(ret) except: return False success = self.cache.post(answer_url, a_json) return a_json def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True): quote_word = urllib.quote(qword.encode('utf-8')) if isinstance( qword, unicode) else urllib.quote(qword) # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8 query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word) ret = self.downloader.downloader_wrapper(query_url, batch_id, gap, timeout=timeout, encoding='gb18030', refresh=refresh) # resp.headers: 'content-type': 'text/html;charset=UTF-8', # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/> if ret is False: return False return zhidao_search_questions(ret) def zhidao_search_list_json(self, qword, batch_id, gap=3, timeout=10, refresh=False): quote_word = urllib.quote(qword.encode('utf-8')) if isinstance( qword, unicode) else urllib.quote(qword) # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8 query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word) ret = self.downloader.downloader_wrapper(query_url, batch_id, gap, timeout=timeout, encoding='gb18030', refresh=refresh) # resp.headers: 'content-type': 'text/html;charset=UTF-8', # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/> if ret is False: return False search_result_json = parse_search_json_v0615(ret) for item in search_result_json: item["query"] = qword if type(qword) != unicode: item["query"] = qword.decode("utf-8") return search_result_json def zhidao_search_select_best(self, qword, gap=3, timeout=2): search_result_json = self.zhidao_search_list_json( qword, BATCH_ID['search'], gap, timeout) # get the best answer for item in search_result_json: if item["is_recommend"] == 1: return item return False def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2): ret = self.zhidao_search_select_best(qword, gap, timeout) if ret: return [ret["question_id"]] return [] def run(self, qword, gap=3, timeout=10): # qids = 
self.zhidao_search(qword, BATCH_ID['search'], gap, timeout) qids = self.zhidao_search_select_best_qids(qword, gap, timeout) return self.zhidao_results(qids, gap, timeout)
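# --- Example: driving the Scheduler above outside the crawl framework ---
# Hedged usage sketch: the cache-server address is a placeholder and the query is
# arbitrary. instance() keeps a single Scheduler per process; run() searches
# zhidao, keeps the best question id, then fetches that question plus up to three
# of its answers. Only keys that the code above actually sets are printed.
if __name__ == '__main__':
    scheduler = Scheduler.instance('http://127.0.0.1:8000/')  # placeholder cacheserver
    for q_json in scheduler.run(u'什么是区块链', gap=3, timeout=10):
        print len(q_json['answer_ids']), len(q_json['list_answers'])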
class ZhidaoFetch(): def __init__(self, config={}): self.debug = config.get("debug") self.api_nlp = ZhidaoNlp(self.debug) self.config = config if config: from downloader.downloader_wrapper import DownloadWrapper print self.config self.downloader = DownloadWrapper( self.config.get("cache_server"), self.config["crawl_http_headers"]) def parse_query(self, query_unicode, query_parser=0): if query_parser == 1: qword = u" ".join(self.api_nlp.cut_text(query_unicode)) else: qword = query_unicode return qword def get_search_url_qword(self, query_unicode, query_parser=0, page_number=0): qword = self.parse_query(query_unicode, query_parser=query_parser) if page_number == 0: query_url = "http://zhidao.baidu.com/search/?word={0}".format( urllib.quote(qword.encode("utf-8"))) else: query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format( page_number * 10, urllib.quote(query)) return query_url, qword def select_best_qapair_0616(self, search_result_json): for item in search_result_json: if item["is_recommend"] == 1: #Thread(target = post_zhidao_fetch_job, args = (item, ) ).start() ret["best_qapair"] = item return ret def select_top_n_chat_0621(self, query, search_result_json, num_answers_needed): good_answers = [] bad_answers = [] result_answers = [] match_score_threshold = 0.6 for item in search_result_json: #print type(query), type(item["question"]) discount_skip_word = 0 if self.api_nlp.detect_skip_words(item["question"]): print "did not skip min-gan-ci question" # continue if self.api_nlp.detect_skip_words(item["answers"]): print "did not skip min-gan-ci answers" # continue match_score = difflib.SequenceMatcher(None, query, item["question"]).ratio() item["match_score"] = match_score if self.api_nlp.get_answer_filter_word(item["answers"]): bad_answers.append(item) else: good_answers.append(item) for item in sorted(good_answers, key=lambda elem: 0 - elem["match_score"]): match_score = item["match_score"] if match_score >= match_score_threshold and len( result_answers) < num_answers_needed: result_answers.append(item) else: break if len(result_answers) < num_answers_needed: for item in sorted(bad_answers, key=lambda elem: 0 - elem["match_score"]): match_score = item["match_score"] if match_score >= match_score_threshold and len( result_answers) < num_answers_needed: result_answers.append(item) else: break return result_answers def select_top_n_chat_0622(self, query, search_result_json, result_limit=3, answer_len_limit=30, question_len_limit=20, question_match_limit=0.4): result_answers = [] for item in search_result_json: if "answers" not in item: continue #skip long answers if len(item["answers"]) > answer_len_limit: #print "skip answer_len_limit", type(item["answers"]), len(item["answers"]), item["answers"] continue #too long question if len(item["question"]) > question_len_limit: #print "skip question_len_limit", len(item["question"]) continue if self.api_nlp.filter_chat(item["question"], item["answers"]): continue question_match_score = difflib.SequenceMatcher( None, query, item["question"]).ratio() # question_match_score_b = difflib.SequenceMatcher(None, item["question"], query).ratio() item["match_score"] = question_match_score item["label"] = self.api_nlp.get_chat_label( item["question"], item["answers"]) #skip not matching questions if (question_match_score < question_match_limit): #print "skip question_match_limit", question_match_score continue result_answers.append(item) ret = sorted(result_answers, key=lambda x: 0 - x["match_score"]) if len(ret) > result_limit: ret = 
ret[:result_limit] return ret def search_chat_top_n(self, query, num_answers_needed=3, query_filter=2, query_parser=0, select_best=True): result = self.prepare_query(query, query_filter, query_parser, use_skip_words=False) if not result: return False ret = result["ret"] query_url = result["query_url"] query_unicode = ret["query"] #if self.api_nlp.is_question_baike( query_unicode , query_filter= query_filter): # print "not skip query, baike", query_filter, query_unicode # return False #print query ts_start = time.time() content = self.download(query_url) ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000) if content: ret["content_len"] = len(content) #print type(content) #print content if select_best and content: ts_start = time.time() search_result = parse_search_json_v0707(content) search_result_json = search_result["results"] ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000) ret["item_len"] = len(search_result_json) answer_items = self.select_top_n_chat_0622(query_unicode, search_result_json, num_answers_needed) #print "select_best", len(answer_items) ret["items"] = answer_items ret["results"] = search_result_json ret["total"] = search_result["total"] # if answer_items: # index = 0 # for item in answer_items: # ret ["qapair{}".format(index)] = item # index += 1 # return ret #print json.dumps(search_result_json,ensure_ascii=False) return ret # def text2bigram(self, text): # ret = set() # if not text: # return ret # text = text.lower() # symbols = list(self.api_nlp.cut_text(text)) # # for i in range(len(symbols)): # if i==0: # word = u'___{}'.format(symbols[i]) # ret.add(word) # word = text[i:i+2] # ret.add(word) # elif i == len(text)-1: # word = u'{}___'.format(symbols[i]) # ret.add(word) # else: # word = u"".join(symbols[i:i+2]) # ret.add(word) # return ret # # def bigram_sim(self, q1, q2): # b1 = self.text2bigram(q1) # b2 = self.text2bigram(q2) # b1 = set(self.api_nlp.cut_text(q1.lower())) # b2 = set(self.api_nlp.cut_text(q2.lower())) # b1d = set(b1) # b1d.difference_update(b2) # # sim = 1.0 * len(b1.intersection(b2))/ len(b1.union(b2)) # return sim def sim(self, q1, q2): q1 = self.api_nlp.clean_question(q1) q2 = self.api_nlp.clean_question(q2) match_score = difflib.SequenceMatcher(None, q1, q2).ratio() return match_score def select_best_qapair_0630(self, query, search_result_json, question_len_max=30, answer_len_max=90, answer_len_min=2): best_item = None best_score = 0.6 best_cnt_like = -1 used_skip_sources = list() for item in search_result_json: print json.dumps(item, ensure_ascii=False) print "\n\n--------select_best_qapair_0630 " if item["source"] in ["muzhi"]: used_skip_sources.append(item["source"]) item["debug_note"] = u"[-]问答对-来自拇指" continue #match_score = self.bigram_sim(query, item["question"]) match_score = self.sim(query, item["question"]) item["match_score"] = match_score #print type(query), type(item["question"]) temp = self.api_nlp.detect_skip_words(item["question"]) if temp: print "skip min-gan-ci question", json.dumps( list(temp), ensure_ascii=False) item["debug_note"] = u"[-]问答对-问题敏感词:{}".format(u"/".join(temp)) continue temp = self.api_nlp.detect_skip_words( item["answers"], check_list=["skip_words_zhidao", "skip_words_all"]) if temp: print "skip min-gan-ci answers", json.dumps(list(temp), ensure_ascii=False) item["debug_note"] = u"[-]问答对-答案敏感词:{}".format(u"/".join(temp)) continue #too long question #if len(item["question"]) > question_len_max: # item["debug_note"]= u"[-]问题长度过长:{}".format(len(item["question"]) ) # continue if 
len(item["answers"]) < answer_len_min: item["debug_note"] = u"[-]答案长度过短:{}".format( len(item["answers"])) continue filter_word = self.api_nlp.get_answer_filter_word(item["answers"]) if filter_word: print "skip bad answers" item["debug_note"] = u"[-]问答对-答案有符号:{}".format(filter_word) continue if self.api_nlp.debug: print match_score, item["answers"] #print query, item["question"] ,match_score, item["cnt_like"] this_answer_is_better = False if item["source"] == "baike": item["debug_note"] = u"[+]问答对-使用百科" this_answer_is_better = True elif not best_item or best_item["source"] != "baike": #skip long answers #if len(item["answers"]) > answer_len_max and item["cnt_like"] < 50: # item["debug_note"]= u"[-]答案长度过长:{}".format(len(item["answers"]) ) # continue if match_score > best_score and item[ "cnt_like"] >= best_cnt_like * 0.2: this_answer_is_better = True elif match_score > best_score * 0.95 and item[ "cnt_like"] > best_cnt_like * 1.5 + 2: this_answer_is_better = True if this_answer_is_better: best_item = item best_score = max(match_score, best_score) best_cnt_like = item["cnt_like"] if not item.get("debug_note"): item["debug_note"] = u"[?]问答对-maybe best={}".format( best_score) else: if not item.get("debug_note"): item["debug_note"] = u"[-]问答对-低于best={}".format(best_score) if best_item and best_item["source"] not in [ "baike" ] and len(used_skip_sources) >= 4: if best_item: best_item["debug_note"] += u"--规避医疗类问题{}".format( "/".join(used_skip_sources)) #母婴类,医疗类问题不能给出答案,要专业人士做这件事 return None return best_item def search_baike_best(self, query, query_filter=2, query_parser=0, debug_item=None, keep_result=False): query_unicode = query if not isinstance(query, unicode): query_unicode = query.decode("utf-8") query_unicode = self.api_nlp.rewrite_zhidao_query(query_unicode) result = self.prepare_query(query_unicode, query_filter, query_parser, debug_item=debug_item) if not result: return False ret = result["ret"] result["query"] = query query_url = result["query_url"] if not self.api_nlp.is_question_baike(query_unicode, query_filter=query_filter, debug_item=debug_item): print "skip query, not baike", query_filter, query_unicode return False ts_start = time.time() content = self.download(query_url) ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000) if content: ts_start = time.time() search_result = parse_search_json_v0707(content) search_result_json = search_result["results"] ret["total"] = search_result["total"] ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000) if keep_result or self.debug: ret["results"] = search_result_json best_item = self.select_best_qapair_0630(query_unicode, search_result_json) if best_item: ret["best_qapair"] = best_item return ret #print json.dumps(search_result_json,ensure_ascii=False) #print ">>>>>>", content return ret def search_all(self, query, query_filter=0, query_parser=0, limit=10): max_page_number = (limit - 1) / 10 + 1 output = { "items": [], "metadata": [], "query": query, "limit": limit, "query_filter": query_filter, "query_parser": query_parser } for page_number in range(max_page_number): result = self.prepare_query(query, query_filter, query_parser, use_skip_words=False) if not result: print query break ret = result["ret"] query_url = result["query_url"] query_unicode = ret["query"] ts_start = time.time() content = self.download(query_url) ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000) if content: ts_start = time.time() search_result = parse_search_json_v0707(content) ret["milliseconds_parse"] = int( (time.time() - 
ts_start) * 1000) output["items"].extend(search_result["results"]) output["metadata"].extend(ret) output["total"] = search_result["total"] return output def prepare_query(self, query, query_filter, query_parser, use_skip_words=True, page_number=0, debug_item=None): if not query: print "skip query, empty" if debug_item is not None: debug_item["debug_note"] = u"[-]问题空:prepare_query" return False query_unicode = query if not isinstance(query_unicode, unicode): query_unicode = query_unicode.decode("utf-8") if use_skip_words: detected_words = self.api_nlp.detect_skip_words(query_unicode) if detected_words: print "skip bad query, empty", u"/".join(detected_words) if debug_item is not None: debug_item["debug_note"] = u"[-]问题敏感词:{}".format( u"/".join(detected_words)) return False query_unicode = re.sub(u"?$", "", query_unicode) query_url, qword = self.get_search_url_qword(query_unicode, query_parser, page_number=page_number) ret = { "query": query_unicode, } if query_parser == 1: ret["qword"] = qword return {"ret": ret, "query_url": query_url} def search_chat_best(self, query, query_filter=2, query_parser=0): result = self.prepare_query(query, query_filter, query_parser) if not result: return False ret = result["ret"] query_url = result["query_url"] query_unicode = ret["query"] if not self.api_nlp.is_question_baike(query_unicode, query_filter=query_filter): print "skip query, not baike", query_filter, query_unicode return False ts_start = time.time() content = self.download(query_url) ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000) if content: ts_start = time.time() search_result = parse_search_json_v0707(content) search_result_json = search_result["results"] ret["total"] = search_result["total"] ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000) #deprecated best_item = self.select_best_chat_0621(query_unicode, search_result_json) if best_item: ret["best_qapair"] = best_item return ret #print json.dumps(search_result_json,ensure_ascii=False) return False def download(self, query_url): if self.config: return self.downloader.download_with_cache( query_url, self.config["batch_id"], self.config["crawl_gap"], self.config["crawl_http_method"], self.config["crawl_timeout"], encoding='gb18030', redirect_check=True, error_check=False, refresh=False) else: return self.download_direct(query_url) def download_direct(self, query_url): import requests #print query_url encoding = 'gb18030' headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,en-US;q=0.8,en;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': 1, 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36', } headers["Host"] = "zhidao.baidu.com" print query_url r = requests.get(query_url, timeout=10, headers=headers) if r: r.encoding = encoding return r.text
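# --- Example: calling ZhidaoFetch directly, without the crawl framework ---
# Hedged sketch: with the default empty config the class has no DownloadWrapper,
# so download() falls through to download_direct() and plain requests, which makes
# this runnable on its own (given the module's ZhidaoNlp / parse_search_json_v0707
# imports). The query is arbitrary; the keys printed are set by search_baike_best().
if __name__ == '__main__':
    fetcher = ZhidaoFetch()
    result = fetcher.search_baike_best(u'姚明的身高是多少')
    if result and result.get('best_qapair'):
        best = result['best_qapair']
        print best['question'], u'=>', best['answers'], best['match_score']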
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} cookie = kwargs.get('cookie', None) # cookie = "gr_user_id=0fceb70d-e0ab-4c16-8f21-d49b5d242b0e; PHPSESSID=ltro2cjbvonlg6mu4hupe7dcv1; CNZZDATA1254842228=371101890-1469690209-null%7C1472547698" if cookie: headers.update({'Cookie': cookie}) setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'search': re.compile( urlparse.urljoin(SITE, 'search\?key=(.+?)&index=(\d+)&p=(\d+)')), 'detail': re.compile( urlparse.urljoin( SITE, 'company_getinfos\?unique=(.+?)&companyname=(.+?)&tab=base' )), 'invest': re.compile( urlparse.urljoin( SITE, 'company_getinfos\?unique=(.+?)&companyname=(.+?)(?:&p=(\d+))?&tab=touzi(?:&box=touzi)?' )), }) method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') def reformat(info): # 将info按企查查页面顺序插入队列 temp = info['info'] del info['info'] info['info'] = [] info['info'].append(("统一社会信用码", temp['unified_social_credit_code'])) info['info'].append(("注册号", temp['registration_id'])) info['info'].append(("组织机构代码", temp['organization_code'])) info['info'].append(("经营状态", temp['status'])) info['info'].append(("公司类型", temp['business_type'])) info['info'].append(("成立日期", temp['begin'])) info['info'].append(("法定代表", temp['legal_person'])) info['info'].append(("注册资本", temp['registered_capital'])) info['info'].append(("营业期限", temp['end'])) info['info'].append(("登记机关", temp['registration_authority'])) info['info'].append(("发照日期", temp['approval_date'])) info['info'].append(("企业地址", temp['address'])) info['info'].append(("经营范围", temp['business_scope'])) return info def parse_company_investment(tree): # 解析对外投资页面,将子公司存入sub_companies字段下 invest_dict = {'sub_companies': []} for sub_company in tree.cssselect('.list-group a.list-group-item'): sub_name = sub_company.cssselect( 'span.clear .text-lg')[0].text_content().strip() href = sub_company.get('href') province, key_num = href.rsplit('_', 2)[-2:] invest_dict['sub_companies'].append({ 'name': sub_name, 'key_num': key_num, 'province': province, 'href': href, }) return invest_dict content = process._downloader.downloader_wrapper(url, batch_id, gap, method, timeout=timeout, encoding='utf-8') # print(url, file=log_file) cookie = kwargs.get('cookie', None) if not cookie: get_logger(batch_id, today_str, '/opt/service/log/').info("No cookie in worker") return False if content == '': get_logger(batch_id, today_str, '/opt/service/log/').info("no content") content = requests.get(url, cookies={1: cookie}).text if content: # print("got content", file=log_file) get_logger(batch_id, today_str, '/opt/service/log/').info('got content') if not content and url.endswith("tab=touzi&box=touzi"): get_logger(batch_id, today_str, '/opt/service/log/').info("void invest page") return True invest_pat = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&p={p}&tab=touzi&box=touzi" main_pat = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab=base" search_pat = 
"http://www.qichacha.com/search?key={name}&index=0&p={p}" parser = QiParser() tree = lxml.html.fromstring( content.replace('<em>', '').replace('</em>', '')) # if kwargs and kwargs.get("debug"): # print('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue if label == 'search': # 搜索页面解析 comp_name = urllib.unquote(m.group(1)) dic = {'search_name': comp_name, 'names': []} urls = [] if tree.cssselect('.table-search-list') and tree.cssselect( '.tp2_tit a'): items = tree.cssselect('.table-search-list') for idx, i in enumerate(items): if not i.xpath('.//*[@class=\"tp2_tit clear\"]/a/text()'): continue item = {} item['name'] = i.xpath( './/*[@class=\"tp2_tit clear\"]/a/text()')[0] # print(item['name'], file=log_file) item['href'] = i.xpath( './/*[@class=\"tp2_tit clear\"]/a/@href')[0] item['status'] = i.xpath( './/*[@class=\"tp5 text-center\"]/a/span/text()')[0] item['key_num'] = item['href'].split('firm_')[1].split( '.shtml')[0] # print(item['key_num'], file=log_file) if idx == 0 and comp_name == item[ 'name']: # 若第一个搜索结果完全匹配则只添加第一个结果入待爬取队列 # get_logger(batch_id, today_str, '/opt/service/log/').info('appending', item['name']) urls.append( main_pat.format(key_num=item['key_num'], name=item['name'])) urls.append( invest_pat.format(key_num=item['key_num'], name=item['name'], p='1')) break elif idx < 3: # 如果第一个不完全匹配, 将前三个搜索结果加入待爬取队列 urls.append( main_pat.format(key_num=item['key_num'], name=item['name'])) urls.append( invest_pat.format(key_num=item['key_num'], name=item['name'], p='1')) dic['names'].append(item['name']) if not urls: return True manager.put_urls_enqueue(batch_id, urls) if not dic['names']: return True else: # 不完全匹配时将search_name与前三个搜索结果存入json用作别名映射 data = json.dumps(dic, encoding='utf-8', ensure_ascii=False) return process._cache.post(url, data) elif label == 'detail': # 解析详情页面 comp_name = urllib.unquote(m.group(2)) # print(comp_name, 'detail', file=log_file) all_info = parser.parse_detail(tree) all_info['name'] = comp_name all_info['source'] = url all_info['access_time'] = datetime.utcnow().isoformat() all_info = parser.parser_patch(tree, all_info) all_info = reformat(all_info) data = json.dumps(all_info, encoding='utf-8', ensure_ascii=False) get_logger(batch_id, today_str, '/opt/service/log/').info(data) if not any([i[1] for i in all_info['info']]): return False return process._cache.post(url, data) else: # 解析投资页面 comp_name = urllib.unquote(m.group(2)) key_num = m.group(1) page = int(m.group(3)) pages = tree.xpath(".//a[@id=\"ajaxpage\"]/text()") if '>' in pages: urls = [ invest_pat.format(key_num=key_num, name=comp_name, p=str(page + 1)) ] manager.put_urls_enqueue(batch_id, urls) invest_dict = parse_company_investment(tree) # print(invest_dict, file=log_file) if not invest_dict['sub_companies']: return True invest_dict['name'] = comp_name invest_dict['source'] = url invest_dict['access_time'] = datetime.utcnow().isoformat() data = json.dumps(invest_dict, encoding='utf-8', ensure_ascii=False) get_logger(batch_id, today_str, '/opt/service/log/').info(data) return process._cache.post(url, data)
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'main': re.compile( r'http://jiage.cngold.org/jinshubi/list_3640_(\d+).html'), 'info': re.compile( r'http://jiage.cngold.org/c/(\d+-\d+-\d+)/c(\d+).html'), 'index': re.compile(r'http://jiage.cngold.org/jinshubi/index.html') }) method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') if url == 'http://jiage.cngold.org/jinshubi/list_3640_1.html': url = 'http://jiage.cngold.org/jinshubi/index.html' # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout) # print(content) if content == '': get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content') return False for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: # print("not match") continue page = etree.HTML(content) if label == 'index': prices = page.xpath(".//ul[@class='list_baojia']/li/a/@href") # get_logger(batch_id, today_str, '/opt/service/log/').info(str(prices)) manager.put_urls_enqueue(batch_id, prices[:3]) return True elif label == 'info': dic = {} datestr = m.group(1) table = page.xpath(".//table//td/text()") table = [t.strip() for t in table] dic[u'产品名称'] = table[0] dic[u'产品价格'] = table[1] dic[u'价格单位'] = table[2] dic[u'涨跌'] = table[3] dic[u'日期'] = datestr dic[u'source'] = url dic[u'access_time'] = datetime.utcnow().isoformat() data = json.dumps(dic, ensure_ascii=False) # get_logger(batch_id, today_str, '/opt/service/log/').info(data) return process._cache.post(url, data)
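# --- Note: a defensive variant of the positional parse in the 'info' branch ---
# The branch above assumes the first four table cells are name/price/unit/change.
# Hedged sketch of the same mapping with a length check, so a layout change fails
# cleanly instead of raising IndexError; `parse_price_row` is an illustrative name.
def parse_price_row(cells, datestr, url):
    cells = [c.strip() for c in cells]
    if len(cells) < 4:
        return None
    return {u'产品名称': cells[0], u'产品价格': cells[1], u'价格单位': cells[2],
            u'涨跌': cells[3], u'日期': datestr, u'source': url}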
# -*- coding: utf-8 -*-
import scrapy
import sys

from downloader.downloader_wrapper import DownloadWrapper

reload(sys)
sys.setdefaultencoding('utf-8')

BATCH_ID = 'dongfang-201606test'
url = 'http://data.eastmoney.com/Notice'
SERVER = 'http://192.168.1.179:8000/'
m = DownloadWrapper(SERVER)
# content = m.downloader_wrapper('http://data.eastmoney.com/Notice/Noticelist.aspx', BATCH_ID, 0, encoding='gb2312', refresh=True)
# print content


class MyMiddleWare(object):
    def process_request(self, request, spider):
        url = request.url
        m = DownloadWrapper(SERVER)
        content = m.downloader_wrapper(url, BATCH_ID, 3, encoding='gb2312')
        if content:
            response = scrapy.http.response.html.HtmlResponse(url, encoding='utf-8', body=content)
            return response
        return

'''
Manual cache test, kept for reference:
m = Cache(BATCH_ID)
print m.post('test', 'content3')
'''
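# --- Example: wiring MyMiddleWare into a Scrapy project ---
# Hedged sketch of the settings.py entry needed for the middleware above; the
# module path 'myproject.middlewares' and the priority 543 are placeholders for
# this project's real values. When process_request returns an HtmlResponse,
# Scrapy skips its own download for that request and hands this response on to
# the spider instead.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyMiddleWare': 543,
}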
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'main': re.compile( r'http://chem.100ppi.com/price/plist-(\d+)(-{1,3})(\d+).html' ), 'prd': re.compile(r'http://www.100ppi.com/price/detail-(\d+).html') }) def safe_state(statement): return statement[0] if statement else '' method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') # print(url) # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout) # print(content) if content == '': get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content') return False # content.encoding='gb18030' # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue page = etree.HTML(content) if label == 'main': # print("adding chem prds") prd_links = page.xpath('//table/tr/td[1]/div/a/@href') if not prd_links: # print('end of pages') get_logger(batch_id, today_str, '/opt/service/log/').info('end of pages') return True next_pat = re.compile(r'plist-(\d+)(-{1,3})(\d+).html') current = next_pat.search(url) current = str(int(current.group(3)) + 1) next_page = url[:url.rfind('-') + 1] + current + '.html' prd_links.append(urlparse.urljoin(SITE, next_page)) get_logger(batch_id, today_str, '/opt/service/log/').info('||'.join(prd_links) + ' added to queue') manager.put_urls_enqueue(batch_id, prd_links) return True else: data = {} data['name'] = page.xpath( "/html/body/div[8]/div[1]/span[2]/text()")[0] # print(data['name'], 'prd page') data['source'] = url # data['prd_header'] = page.xpath("//div[@class=\"mb20\"]/table/tr/th/text()") # data['prd_infos'] = page.xpath("//div[@class=\"mb20\"]/table/tr/td/text()") prd_header = page.xpath( "/html/body/div[8]/div[2]/div[1]/div[1]/h3/text()")[0] idx_left, idx_right = prd_header.find(u'('), prd_header.find(u')') data[u'报价类型'] = prd_header[idx_left + 1:idx_right] data[u'报价机构'] = page.xpath( "/html/body/div[8]/div[2]/div[2]/div[2]/table/tr[1]/td/h3/text()" )[0].strip() data[u'商品报价'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[1]/td[1]/text()")) data[u'发布时间'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[1]/td[2]/text()")) data[u'出产地'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[2]/td[1]/text()")) data[u'有效期'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[2]/td[2]/text()")) data[u'仓储地'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[3]/td[1]/text()")) data[u'包装说明'] = safe_state( page.xpath("//div[@class=\"mb20\"]/table/tr[3]/td[2]/text()")) data[u'生产厂家'] = safe_state( page.xpath( "/html/body/div[8]/div[2]/div[1]/div[2]/div/div[2]/text()") ) info = {} table_header = page.xpath( "//table[@class=\"mb20 st2-table tac\"]/tr/th/text()") table_content = page.xpath( "//table[@class=\"mb20 st2-table tac\"]/tr/td/text()") for header, cont in 
zip(table_header, table_content): info[header] = cont data[u'详细信息'] = info contact = {} contact[u'联系人'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[2]/td[2]/text()")) contact[u'电话'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[3]/td[2]/text()")) contact[u'传真'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[4]/td[2]/text()")) contact[u'邮件'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[5]/td[2]/text()")) contact[u'手机'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[6]/td[2]/text()")) contact[u'地址'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[7]/td[2]/text()")) contact[u'网址'] = safe_state( page.xpath( "//div[@class=\"connect\"]/table/tr[8]/td[2]/text()")) data[u'联系方式'] = contact # print(json.dumps(data, encoding='utf-8', ensure_ascii=False)) dics = json.dumps(data, encoding='utf-8', ensure_ascii=False) get_logger(batch_id, today_str, '/opt/service/log/').info(dics + ' saved to S3') return process._cache.post(url, dics)
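# --- Note: the safe_state() guard used throughout the product branch above ---
# xpath() returns an empty list when a field is missing from the page; safe_state()
# collapses that to '' so every key still appears in the posted json. Restated here
# as a self-contained illustration (the sample value is made up):
def safe_state(statement):
    return statement[0] if statement else ''

assert safe_state([u'25000元/吨']) == u'25000元/吨'
assert safe_state([]) == ''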
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info('process {}'.format(url)) if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'list_view': re.compile( 'http://www.yt1998.com/price/nowDayPriceQ\!getPriceList.do\?pageIndex=(\d+)&pageSize=(\d+)' ), 'detail_view': re.compile( 'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp\?ycnam=(.*)&guige=(.*)&chandi=(.*)&market=(.*)' ) }) # http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex=0&pageSize=20 if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue get_logger(batch_id, today_str, '/opt/service/log/').info('label : {}'.format(label)) if label == 'list_view': get_logger(batch_id, today_str, '/opt/service/log/').info(label) content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8', refresh=True) get_logger(batch_id, today_str, '/opt/service/log/').info('download ok') list_item = json.loads(content) for detail_item in list_item[u'data']: detail_item[u'access_time'] = datetime.utcnow().isoformat() total_num = int(list_item[u'total']) pageIndex = int(m.group(1)) pageSize = int(m.group(2)) if pageIndex == 0: for index in range(1, total_num // pageSize + 1): get_logger(batch_id, today_str, '/opt/service/log/').info('index:') get_logger(batch_id, today_str, '/opt/service/log/').info(index) list_pattern = 'http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex={}&pageSize={}' list_url = list_pattern.format(index, pageSize) manager.put_urls_enqueue(batch_id, [list_url]) return process._cache.post(url, json.dumps(list_item, ensure_ascii=False), refresh=True)
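# --- Note: page arithmetic in the list_view branch above ---
# With pageIndex starting at 0, the number of pages is ceil(total_num / pageSize).
# The loop above uses total_num // pageSize + 1, which never misses a page but
# requests one empty trailing page whenever pageSize divides total_num exactly.
# Hedged helper showing the exact count (`page_count` is an illustrative name):
def page_count(total_num, page_size):
    return (total_num + page_size - 1) // page_size

# pages to enqueue besides index 0 (already being processed):
# for index in range(1, page_count(total_num, pageSize)):
#     manager.put_urls_enqueue(batch_id, [list_pattern.format(index, pageSize)])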
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'main': re.compile( r'http://www.sge.com.cn/xqzx/mrxq/index_(\d+).shtml'), 'info': re.compile(r'http://www.sge.com.cn/xqzx/mrxq/(\d+).shtml') }) method, gap, js, timeout, data = parameter.split(':') gap = float(max(0, float(gap) - other_batch_process_time)) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') # print(url) # if kwargs and kwargs.get("debug"): # get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout) # print(content) if content == '': get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content') return False for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: # print("not match") continue page = etree.HTML(content) if label == 'main': get_logger(batch_id, today_str, '/opt/service/log/').info('in list page') urls = page.xpath(".//ul[@id='zl_list']/li/a/@href") urls = [urlparse.urljoin(SITE, list_url) for list_url in urls] get_logger(batch_id, today_str, '/opt/service/log/').info(str(urls)) # get_logger(batch_id, today_str, '/opt/service/log/').info('||'.join(prd_links) + ' added to queue') manager.put_urls_enqueue(batch_id, urls) return True elif label == 'info': dic = {} date = page.xpath(".//h5[@class='con_h5']/text()")[0].split( u'\xa0')[0] header = page.xpath( ".//div[@id='page_con']/table/tbody/tr[1]/td//text()") infos = page.xpath( ".//div[@id='page_con']/table/tbody/tr/td[1]//text()") infos = [info.strip() for info in infos if info.strip()] idx = -1 for index, prod in enumerate(list(infos)): if u"Pt9995" in prod: idx = str(index + 1) break if idx == -1: return True pt_infos = page.xpath( ".//div[@id='page_con']/table/tbody/tr[{}]/td//text()".format( idx)) if not pt_infos: get_logger( batch_id, today_str, '/opt/service/log/').info("No pt info on this page " + url) return True for col, value in zip(header, pt_infos): dic[col] = value.strip() dic[u'日期'] = date dic[u'source'] = url dic[u'access_time'] = datetime.utcnow().isoformat() data = json.dumps(dic, ensure_ascii=False) get_logger(batch_id, today_str, '/opt/service/log/').info(data) return process._cache.post(url, data)
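# --- Example: the header/row zip used in the 'info' branch, in isolation ---
# Hedged sketch with illustrative names and column labels (the real SGE column
# headers come from the page itself): given the header cells and the cells of the
# row whose first column mentions Pt9995, zip them into the dict that gets posted,
# exactly like the loop above does.
def row_to_dict(header_cells, row_cells):
    return {col.strip(): val.strip() for col, val in zip(header_cells, row_cells)}

# row_to_dict([u'品种', u'开盘价', u'最高价'], [u'Pt9995', u'215.00', u'216.50'])
# -> {u'品种': u'Pt9995', u'开盘价': u'215.00', u'最高价': u'216.50'}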
def process(url, batch_id, parameter, manager, *args, **kwargs): if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper('s3', headers, REGION_NAME)) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CacheS3(head + '-json-' + tail)) if not hasattr(process, '_regs'): setattr( process, '_regs', { 'entity': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entity\?mention=(.+)')), 'avp': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entityAVP\?entity=(.+)')), 'info': re.compile( urlparse.urljoin( SITE, 'cndbpedia/api/entityInformation\?entity=(.+)')), 'tags': re.compile( urlparse.urljoin(SITE, 'cndbpedia/api/entityTag\?entity=(.+)')), }) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout = int(timeout) today_str = datetime.now().strftime('%Y%m%d') if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start download') content = process._downloader.downloader_wrapper(url, batch_id, gap, timeout=timeout, encoding='utf-8') if content == '': return False if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url') for label, reg in process._regs.iteritems(): m = reg.match(url) if not m: continue entity = urllib.unquote(m.group(1)) if label == 'entity': urls = [] avpair_api = urlparse.urljoin(SITE, 'cndbpedia/api/entityAVP?entity={}') info_api = urlparse.urljoin( SITE, 'cndbpedia/api/entityInformation?entity={}') tags_api = urlparse.urljoin(SITE, 'cndbpedia/api/entityTag?entity={}') js = json.loads(content) for ent in js[u'entity']: if isinstance(ent, unicode): ent = ent.encode('utf-8') ent = urllib.quote(ent) urls.append(avpair_api.format(ent)) urls.append(info_api.format(ent)) urls.append(tags_api.format(ent)) manager.put_urls_enqueue(batch_id, urls) return True else: data = json.dumps({entity: json.loads(content)}) if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info( 'start post {} json'.format(label)) return process._cache.post(url, data)
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs): today_str = datetime.now().strftime('%Y%m%d') get_logger(batch_id, today_str, '/opt/service/log/').info(url) home_page = 'http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=36&tableName=TABLE36&title=%BD%F8%BF%DA%D2%A9%C6%B7&bcId=124356651564146415214424405468' if not hasattr(process, '_downloader'): domain_name = Downloader.url2domain(url) headers = {'Host': domain_name} setattr(process, '_downloader', DownloadWrapper(None, headers)) if not hasattr(process,'_reg'): setattr(process, '_reg', { 'detail': re.compile('http://app1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=36&tableName=TABLE36&Id=(\d+)'), }) if not hasattr(process, '_cache'): head, tail = batch_id.split('-') setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER)) method, gap, js, timeout, data = parameter.split(':') gap = int(gap) timeout= int(timeout) gap = max(gap - other_batch_process_time, 0) #if kwargs and kwargs.get("debug"): get_logger(batch_id, today_str, '/opt/service/log/').info('start download') data = { 'tableId' : '36', 'State' : '1', 'bcId' : '124356651564146415214424405468', 'State' : '1', 'State' : '1', 'tableName' : 'TABLE36', 'State' : '1', 'viewtitleName' : 'COLUMN361', 'State' : '1', 'viewsubTitleName' : 'COLUMN354,COLUMN355,COLUMN356,COLUMN823', 'curstart':'2', 'State' : '1', 'State' : '1', } if url == home_page: #if kwargs and kwargs.get("debug"): page = 1 while 1 : time.sleep(gap) get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url at page {}'.format(page)) data['curstart'] = page content = process._downloader.downloader_wrapper('http://app1.sfda.gov.cn/datasearch/face3/search.jsp', batch_id, gap, method = 'post', timeout = timeout, refresh = True, data = data, encoding = 'utf-8' ) #get_logger(batch_id, today_str, '/opt/service/log/').info(content) ids = re.findall(u'进口药品&Id=(\d+)', content) get_logger(batch_id, today_str, '/opt/service/log/').info(ids) if not ids: get_logger(batch_id, today_str, '/opt/service/log/').info('End at {} pages'.format(page)) break # if page == 3: # break get_logger(batch_id, today_str, '/opt/service/log/').info('ids : {}'.format(ids)) url_pattern = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=36&tableName=TABLE36&Id={}' urls = [] for drug_id in ids: url = url_pattern.format(drug_id) urls.append(url) manager.put_urls_enqueue(batch_id, urls) page += 1 get_logger(batch_id, today_str, '/opt/service/log/').info('going to page{}'.format(page)) return True elif process._reg['detail'].match(url): content = process._downloader.downloader_wrapper( url, batch_id, gap, timeout = timeout, refresh = True ) if content == '': return False dom = lxml.html.fromstring(content) table = dom.xpath('//tr') item = { 'license_number': table[1].xpath('./td')[1].xpath('./text()'), # [u'注册证号'] 'old_license_number': table[2].xpath('./td')[1].xpath('./text()'), # [u'原注册证号'] 'packaging_license_number': table[4].xpath('./td')[1].xpath('./text()'), # [u'分包装批准文号'] 'company_chs': table[5].xpath('./td')[1].xpath('./text()'), # [u'公司名称(中文)'] 'company_eng': table[6].xpath('./td')[1].xpath('./text()'), # [u'公司名称(英文)'] 'product_name_chs': table[11].xpath('./td')[1].xpath('./text()'), # [u'产品名称(中文)'] 'product_name_eng': table[12].xpath('./td')[1].xpath('./text()'), # [u'产品名称(英文)'] 'commodity_name_chs': table[13].xpath('./td')[1].xpath('./text()'), # [u'商品名(中文)'] 'commodity_name_eng': table[14].xpath('./td')[1].xpath('./text()'), # [u'商品名(英文)'] 'drug_form': 
table[15].xpath('./td')[1].xpath('./text()'), # [u'剂型(中文)'] 'specification': table[16].xpath('./td')[1].xpath('./text()'), # [u'规格(中文)'] 'dosage': table[17].xpath('./td')[1].xpath('./text()'), # [u'包装规格(中文)'] 'manufacturer_chs': table[18].xpath('./td')[1].xpath('./text()'), # [u'生产厂商(中文)'] 'manufacturer_eng': table[19].xpath('./td')[1].xpath('./text()'), # [u'生产厂商(英文)'] 'manuf_address_chs': table[20].xpath('./td')[1].xpath('./text()'), # [u'厂商地址(中文)'] 'manuf_address_eng': table[21].xpath('./td')[1].xpath('./text()'), # [u'厂商地址(英文)'] 'manuf_country_chs': table[22].xpath('./td')[1].xpath('./text()'), # [u'厂商国家/地区(中文)'] 'manuf_country_eng': table[23].xpath('./td')[1].xpath('./text()'), # [u'厂商国家/地区(英文)'] 'packaging_company_name': table[26].xpath('./td')[1].xpath('./text()'), # [u'分包装企业名称'] 'packaging_company_address': table[27].xpath('./td')[1].xpath('./text()'), # [u'分包装企业地址'] 'category': table[31].xpath('./td')[1].xpath('./text()'), # [u'产品类别'] 'standard_code': table[32].xpath('./td')[1].xpath('./text()'), # [u'药品本位码'] 'source' : [url], #设为list格式与之前字段统一,在下面的循环里一并取出 } for k,v in item.iteritems(): if len(v) > 0: item[k] = v[0] else : item[k] = None return process._cache.post(url, json.dumps(item, ensure_ascii = False))
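# --- Note: the list-to-scalar normalisation at the end of the detail branch ---
# Every field above is collected as an xpath text() list (including 'source',
# wrapped in a list on purpose) so that one loop can unpack them all. The same
# idea as a small helper, with an illustrative name:
def first_or_none(values):
    return values[0] if values else None

# item = {k: first_or_none(v) for k, v in item.iteritems()}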