示例#1
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    print(url)
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper(None, headers, REGION_NAME))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'homepage':
                re.compile('http://www.sfda.gov.cn/WS01/(.*?)/$'),
                'detail':
                re.compile('http://www.sfda.gov.cn/WS01/(.*?)/(.*?).html')
            })
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        print label
        if label == 'homepage':
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             refresh=True)
            dom = lxml.html.fromstring(content)
            total_content = dom.xpath('//td[@class="pageTdSTR15"]//text()')[0]
            total_page = int(re.findall(u'共(\d+)页', total_content)[0])

            for page in range(2, total_page):
                print(page)
                hrefs = dom.xpath('//td[@class="ListColumnClass15"]/a/@href')
                urls = []
                for href in hrefs:
                    href = re.sub(u'\.\.', u'', href)  # 网址是以..开头的相对路径
                    href = 'http://www.sfda.gov.cn/WS01' + href
                    urls.append(href)
                    manager.put_urls_enqueue(batch_id, urls)
                page_url = '{}index_{}.html'.format(url, page)
                content = process._downloader.downloader_wrapper(
                    page_url, batch_id, gap, timeout=timeout, refresh=True)
                dom = lxml.html.fromstring(content)
            return True
        elif label == 'detail':
            return parse_page(url)
示例#2
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Download a search page for a url-quoted word and cache its parsed JSON.

    url must match process._regs ('search?word=<word>'); the unquoted word is
    fed to parse_search_json_v0707 together with the page content.  Up to five
    download/parse attempts are made, forcing refresh after any failure.

    Returns the cache post result, or False when the page is empty or every
    attempt failed.
    """
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper(None, headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        # batch id "<head>-<tail>" maps to the "<head>-json-<tail>" bucket.
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CacheS3(head + '-json-' + tail))

    if not hasattr(process, '_regs'):
        setattr(process, '_regs',
                re.compile(urlparse.urljoin(SITE, 'search\?word=(.+)')))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    today_str = datetime.now().strftime('%Y%m%d')
    word = urllib.unquote(process._regs.match(url).group(1))

    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start download')

    refresh = False
    result = None
    for _ in range(5):
        try:
            content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                timeout=timeout,
                encoding='gb18030',
                refresh=refresh)

            if content == '':
                return False

            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str,
                           '/opt/service/log/').info('start parsing url')

            result = parse_search_json_v0707(content, word)
            break
        except Exception:
            # BUG FIX: narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit).  Retry with a forced refresh.
            refresh = True

    if result is None:
        # BUG FIX: when all five attempts raised, the original fell through
        # and crashed with a NameError on the unbound `result`.
        return False

    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start post json')

    return process._cache.post(url, json.dumps(result))
示例#3
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Fetch an eastmoney announcement list page and enqueue its content urls.

    Extracts '//td[@class="title"]/a/@href' links, retries once with
    refresh=True when the (possibly cached) page yields none, resolves them
    against http://data.eastmoney.com/ and pushes them onto the fixed
    'dongcaigonggao-content-20160620' queue.

    Returns False when the download fails, True otherwise.
    """
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper('s3', headers, REGION_NAME))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout,
                                                     encoding='gb18030')

    if kwargs and kwargs.get("debug"):
        print(len(content), "\n", content[:1000])

    if content is False:
        return False

    get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing')
    tree = lxml.html.fromstring(content)
    urls = tree.xpath('//td[@class="title"]/a/@href')
    if urls == []:
        # No links usually means a stale cache entry: retry once with a
        # forced fresh download.
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start download2')
        content = process._downloader.downloader_wrapper(url,
                                                         batch_id,
                                                         gap,
                                                         timeout=timeout,
                                                         encoding='gb18030',
                                                         refresh=True)
        if content is False:
            return False
        tree = lxml.html.fromstring(content)
        urls = tree.xpath('//td[@class="title"]/a/@href')

    # BUG FIX: the original loop variable (`for url in urls`) shadowed the
    # `url` parameter; use a distinct name and a comprehension.
    content_urls = [
        urlparse.urljoin('http://data.eastmoney.com/', href) for href in urls
    ]

    get_logger(batch_id, today_str,
               '/opt/service/log/').info('start put content')
    manager.put_urls_enqueue('dongcaigonggao-content-20160620', content_urls)

    return True
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name =  Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))
    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'home_page' : re.compile('http://www.zyctd.com/zixun/'),
            'list_page' : re.compile('http://www.zyctd.com/zixun-(\d+).html')
        })
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout= int(timeout)
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        print label
        if label == 'home_page':
            url_pattern = 'http://www.zyctd.com/zixun-{}.html'
            content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                timeout = timeout,
                encoding = 'utf-8',
                refresh = True)
            page_content = re.findall('var pageCount = parseInt\(\'(\d+)\'\)', content)  # 在网页元素中没有,通过js代码段获得
            if not page_content:
                return False
            page_num = int(page_content[0])
            
            urls = []
            for page in range(2,page_num):       # 根据页码将所有页加入队列
                urls.append(url_pattern.format(page))
            manager.put_urls_enqueue(batch_id, urls)

            result_list = parse_list_page(content)               # 首页本身作为第一页也有新闻信息,也要进行分析
            return process._cache.post(url, json.dumps(result_list, ensure_ascii=False), refresh=True)

        elif label == 'list_page':
            content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                timeout = timeout,
                encoding = 'utf-8',
                refresh = True)
            result_list = parse_list_page(content)
            return process._cache.post(url, json.dumps(result_list, ensure_ascii=False), refresh=True)
示例#5
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Download one notice page, extract title/date/body, and post as JSON.

    The HTML is narrowed to the '<div class="mainbox">' .. '<div id="footer">'
    region before parsing; title, public_date and body fall back to None when
    their xpath yields nothing.

    Returns the cache post result, or False when the download fails.
    """
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {"Host": domain_name}
        setattr(process, '_downloader',
                DownloadWrapper('s3', headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CacheS3(batch_id.split('-', 1)[0] + '-json'))

    def _first(xpath_result):
        # xpath() always returns a list; take its head, or None when empty.
        # (Replaces three copies of the same isinstance/len dance.)
        return xpath_result[0] if xpath_result else None

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('start download content')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout,
                                                     encoding='gb18030')

    if kwargs and kwargs.get("debug"):
        print(len(content), "\n", content[:1000])

    if content is False:
        return False

    get_logger(batch_id, today_str,
               '/opt/service/log/').info('start parsing content')
    begin = content.find('<div class="mainbox">')
    end = content.find('<div id="footer">', begin)
    # BUG FIX: when a marker is missing, find() returns -1 and the original
    # sliced from the last byte / dropped the tail; fall back to the full
    # document instead.
    if begin == -1:
        begin = 0
    tree = lxml.html.fromstring(
        content[begin:end] if end != -1 else content[begin:])
    title = _first(tree.xpath('//div[@class="content"]/h4/text()'))
    public_date = _first(tree.xpath('//div[@class="content"]/h5/text()'))
    body = _first(tree.xpath('//div[@class="content"]//pre/text()'))
    notice_content = json.dumps({'url': url, 'title': title,
                                 'public_date': public_date, 'body': body})

    get_logger(batch_id, today_str,
               '/opt/service/log/').info('start post json')
    ret = process._cache.post(url, notice_content)
    return ret
示例#6
0
def process(url, batch_id, parameter, *args, **kwargs):
    """Fetch *url* once through the shared downloader.

    parameter is the usual "method:gap:js:timeout:data" string; only the gap
    and timeout fields are used here.  Returns False when the download fails,
    True otherwise.
    """
    # A single DownloadWrapper is cached on the function object and reused
    # by every subsequent call.
    if not hasattr(process, '_downloader'):
        wrapper = DownloadWrapper(CACHE_SERVER,
                                  THE_CONFIG['crawl_http_headers'])
        setattr(process, '_downloader', wrapper)

    _method, gap, _js, timeout, _data = parameter.split(':')

    content = process._downloader.downloader_wrapper(
        url,
        batch_id,
        int(gap),
        timeout=int(timeout),
        encoding=THE_CONFIG['crawl_result_content_encoding'],
        refresh=THE_CONFIG['crawl_refresh'])

    return content is not False
示例#7
0
class ZhidaoPrefetch(object):
    """Batch crawler for Baidu Zhidao (zhidao.baidu.com).

    Drives search, question and answer downloads through a shared
    DownloadWrapper and posts parsed JSON into a Cache.  Behaviour is driven
    entirely by the `config` dict: batch ids, cache server, http headers and
    crawler gap/timeout/encoding settings.
    """

    def __init__(self, config):
        # NOTE(review): expects config keys "batch_ids" (with "json",
        # "question", "answer", "search"), "cache_server", "http_headers"
        # and "crawler" -- confirm against the caller.
        print config, "-----"
        self.config = config
        self.counter = collections.Counter()
        self.cache = Cache(self.config["batch_ids"]["json"],
                           self.config["cache_server"])
        self.downloader = DownloadWrapper(self.config["cache_server"],
                                          self.config["http_headers"])

    def is_debug(self):
        """Return the config's "debug" flag (False when absent)."""
        return self.config.get("debug", False)

    def zhidao_results(self, qids):
        """Fetch the question JSON for each qid, attaching up to 3 answers.

        Questions or answers that fail to download/parse are skipped.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid)
            if q_json is False:
                continue
            q_json["list_answers"] = []

            # Only the first three answer ids are fetched per question.
            for rid in q_json["answer_ids"][:3]:
                a_json = self.zhidao_answer(qid, rid)
                if a_json is False:
                    continue
                q_json["list_answers"].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid):
        """Download one question page, cache its parsed JSON, and return it.

        Returns False on download failure or when parsing yields nothing.
        """
        question_url = "http://zhidao.baidu.com/question/{}.html".format(qid)
        if self.is_debug():
            print question_url
        ret = self.downloader.downloader_wrapper(
            question_url,
            self.config["batch_ids"]["question"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None or q_json == {}:
            return False
        # Cache write result is not checked (best-effort).
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid):
        """Download one answer via the mini API and cache its parsed JSON.

        Returns the answer dict, or False on download or parse failure.
        """
        answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}"
                      "&rid={}&tag=timeliness".format(qid, rid))

        #print self.config["crawler"]
        if self.is_debug():
            print answer_url
        ret = self.downloader.downloader_wrapper(
            answer_url,
            self.config["batch_ids"]["answer"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            return False

        # Cache write result is not checked (best-effort).
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, query, page_number=None, start_result_index=0):
        """Run one search-result page for *query* and return the parsed items.

        page_number None/0 means the first page; otherwise pn=page*10 paging
        is used.  Returns False when the download fails.
        """
        if isinstance(query, unicode):
            query = query.encode("utf-8")

        if page_number is None or page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={}".format(
                urllib.quote(query))
        else:
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))
        if self.is_debug():
            print query_url
        # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word)

        #print query
        #print query_url
        ret = self.downloader.downloader_wrapper(
            query_url,
            self.config["batch_ids"]["search"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"],
            refresh=False)
        # resp.headers: "content-type": "text/html;charset=UTF-8",
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        else:
            return parse_search_json_v0615(
                ret, start_result_index=start_result_index)

    def run_query(self, query, max_page):
        """Search *query* across up to *max_page* pages, collecting items.

        Updates self.counter, marks "good" items (recommended or >=3 likes)
        and returns the accumulated result list.
        """
        self.counter["query"] += 1
        qids_select = set()
        result_all = []
        for page_number in range(max_page):
            print "==== page ", page_number, query
            self.counter["page"] += 1

            # start_result_index continues numbering across pages.
            result_local = self.zhidao_search(query, page_number,
                                              len(result_all))
            #print json.dumps( result_local, ensure_ascii=False, indent=4, sort_keys=True)
            result_all.extend(result_local)
            self.counter["q_total"] += len(result_local)

            for item in result_local:
                item["query"] = query
                if type(query) != unicode:
                    item["query"] = query.decode("utf-8")
                #print item
                if item["source"] == "recommend" or (item["cnt_like"] >= 3):
                    self.counter["q_good"] += 1
                    qids_select.add(item["question_id"])
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question'], "<----", item[
                            'answers']
                else:
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question']
            print datetime.datetime.now().isoformat(), self.counter
        return result_all
        #qajson = self.zhidao_results(qids_select)
        #print json.dumps(qajson, ensure_ascii=False, indent=4)

    def run_query_entity(self):
        """Run 10-page queries for every seed entity in the human seed file.

        Lines starting with '#' and blank lines are skipped.
        """
        filename = getTheFile("seed_entity.human.txt")
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue

                self.run_query(line, 10)

    def run_query_batch(self, filename, limit):
        """Run *limit*-page queries for every non-comment line in *filename*."""
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, limit)

    def run_gen_url_search_realtime(self, filename):
        """Generate de-duplicated search urls for each input line and save them.

        Output goes to a local "*_urls.txt" file derived from *filename*.
        """
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            # Only query_parser variant 0 is generated at the moment.
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)

                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url

        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)

    def run_test_search_realtime(self, filename, limit):
        """Run queries from *filename* and dump all result items to an xls.

        Also tallies per-source / per-result-index counts and re-decodes
        gb18030 question/answer fields to unicode before writing.
        """
        results = []
        counter = collections.Counter()

        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter

    def run_get_best_search_realtime(self, filename):
        """Fetch the single best Q/A pair for each input line and dump to xls.

        Uses search_zhidao_best and keeps only its "best_qapair"; progress is
        printed every 10 queries.
        """
        results = []
        counter = collections.Counter()

        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(
                    ), counter[cnt_label], line
                counter[cnt_label] += 1

                ret_one = search_zhidao_best(line,
                                             query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]

                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]

                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
示例#8
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    """Crawl china.chemnet.com chemical pages.

    Three page types are dispatched via process._regs:
      * 'main'  -- hot-product index; enqueues every chemical detail url.
      * 'prd'   -- chemical detail page; posts its property table as JSON and
                   enqueues the paginated supplier-search urls.
      * 'comps' -- supplier-search result page; posts one JSON line per
                   supplier found.

    Returns True / the cache post result on success, False when the download
    yields no content, or None when no pattern matches.
    """
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        # (Removed an unused `head, tail = batch_id.split('-')` from here.)
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'main': re.compile(r'http://china.chemnet.com/hot-product/(\w|\d+).html'),
            'prd': re.compile(r'http://china.chemnet.com/product/pclist--(.+?)--0.html'),
            'comps': re.compile(r'http://china.chemnet.com/product/search.cgi')
        })

    def safe_state(statement):
        # Head of an xpath result list, or '' when the list is empty.
        return statement[0] if statement else ''

    def xpath_string(n):
        # Xpath for the value cell of row *n* of the property table.
        return "//*[@id=\"main\"]/div[1]/div[1]/table/tr[" + str(n) + "]/td[2]/text()"

    method, gap, js, timeout, data = parameter.split(':')
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    compspat = 'http://china.chemnet.com/product/search.cgi?skey={};use_cas=0;f=pclist;p={}'
    today_str = datetime.now().strftime('%Y%m%d')

    content = process._downloader.downloader_wrapper(url,
        batch_id,
        gap,
        timeout=timeout,
        refresh=True
        )
    if content == '':
        get_logger(batch_id, today_str, '/opt/service/log/').info(url + ' no content')
        return False

    get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url')

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        # <sub> tags inside chemical formulas break plain text extraction.
        page = etree.HTML(content.replace('<sub>', '').replace('</sub>', ''))
        if label == 'main':
            # Links for every chemical listed on the main page.
            chems = page.xpath("//*[@id=\"main\"]/div[1]/div[2]/dl/dd/ul/li/p[2]/a/@href")
            chems = [urlparse.urljoin(SITE, chem) for chem in chems]
            get_logger(batch_id, today_str, '/opt/service/log/').info('adding chems urls into queue')
            manager.put_urls_enqueue(batch_id, chems)
            return True

        elif label == 'prd':
            chem_uri = m.group(1)
            chem_name = page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[1]/td[2]/text()")[0]
            get_logger(batch_id, today_str, '/opt/service/log/').info(chem_name + " main page")

            comps = page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form/table/tr[1]/td[2]/a[1]")
            pagetext = page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/h6/div/text()[1]")
            # "共有N条记录" = "N records in total"; 10 records per page.
            total = int(re.compile(r'共有(\d+)条记录').search(pagetext[0].encode('utf-8')).group(1))
            total = total // 10 + 1 if total % 10 != 0 else total // 10
            # Keys are the Chinese field names used downstream (do not
            # translate: they are part of the stored JSON schema).
            dic = {
                u'source': url,
                u'中文名称': page.xpath(xpath_string(1))[0] if page.xpath(xpath_string(1)) else '',
                u'英文名称': page.xpath(xpath_string(2))[0] if page.xpath(xpath_string(2)) else '',
                u'中文别名': page.xpath(xpath_string(3))[0] if page.xpath(xpath_string(3)) else '',
                u'CAS_RN': page.xpath(xpath_string(4))[0] if page.xpath(xpath_string(4)) else '',
                u'EINECS': page.xpath(xpath_string(5))[0] if page.xpath(xpath_string(5)) else '',
                u'分子式': page.xpath(xpath_string(6))[0] if page.xpath(xpath_string(6)) else '',
                u'分子量': page.xpath(xpath_string(7))[0] if page.xpath(xpath_string(7)) else '',
                u'危险品标志': page.xpath(xpath_string(8))[0].strip() if page.xpath(xpath_string(8)) else '',
                u'风险术语': page.xpath(xpath_string(9))[0].strip() if page.xpath(xpath_string(9)) else '',
                u'安全术语': page.xpath(xpath_string(10))[0].strip() if page.xpath(xpath_string(10)) else '',
                u'物化性质': page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[11]/td[2]/p/text()") if page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[11]/td[2]/p/text()") else [],
                u'用途': page.xpath(xpath_string(12))[0] if page.xpath(xpath_string(12)) else '',
                u'上游原料': page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[14]/td[2]/a/text()') if page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[14]/td[2]/a/text()') else [],
                u'下游产品': page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[15]/td[2]/a/text()') if page.xpath('//*[@id=\"main\"]/div[1]/div[1]/table/tr[15]/td[2]/a/text()') else [],
            }
            data = json.dumps(dic, encoding='utf-8', ensure_ascii=False)
            new_urls = []
            for t in range(total):
                new_url = compspat.format(chem_uri, str(t))
                get_logger(batch_id, today_str, '/opt/service/log/').info("new url" + new_url)
                new_urls.append(new_url)
            manager.put_urls_enqueue(batch_id, new_urls)
            get_logger(batch_id, today_str, '/opt/service/log/').info('start posting prd page to cache')
            return process._cache.post(url, data)

        else:
            chem_name = page.xpath("//*[@id=\"main\"]/div[1]/div[1]/table/tr[1]/td[2]/text()")[0]
            # Total number of suppliers on this page.
            total = len(page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form"))
            dic = ''
            for i in range(1, total + 1):
                c = safe_state(page.xpath("//*[@id=\"main\"]/div[2]/div[2]/dl/dd/form[{}]".format(str(i))))
                # BUG FIX: the original tested `c is ''`, an identity check
                # that CPython does not guarantee for arbitrary strings;
                # compare by value instead.
                if c == '':
                    break
                comp = {}
                comp[u'source'] = url
                comp[u'chem_name'] = chem_name
                comp[u'name'] = safe_state(c.xpath(".//table/tr[1]/td[2]/a[1]/text()"))
                comp[u'tel'] = safe_state(c.xpath(".//table/tr[2]/td[2]/text()"))
                comp[u'fax'] = safe_state(c.xpath(".//table/tr[3]/td[2]/text()"))
                comp[u'website'] = safe_state(c.xpath(".//table/tr[4]/td[2]/a/text()"))

                dic += json.dumps(comp, encoding='utf-8', ensure_ascii=False) + '\n'
            dic = dic.strip()
            get_logger(batch_id, today_str, '/opt/service/log/').info('start posting companies to cache')
            return process._cache.post(url, dic)
示例#9
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl yt1998.com herb price pages.

    'list_view' urls return a paged JSON price list: every row is expanded
    into a 'detail_view' url and enqueued; page 0 additionally enqueues all
    remaining list pages.  'detail_view' urls return a price-history JSON
    which is reshaped into a result item and posted to the cache.

    Returns True for a handled list page, the cache post result for a detail
    page, or None when no pattern matches.
    """
    # The herb detail page has two parts: the price history and a sidebar.
    # The ytw/second/ url below is the price-history endpoint and returns one
    # large json.  The sidebar would require one more request to another url;
    # since both end up in the same result.json, that request is made
    # directly inside process instead of going through the queue.

    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'list_view':
                re.compile(
                    'http://www.yt1998.com/price/nowDayPriceQ\!getPriceList.do\?pageIndex=(\d+)&pageSize=(\d+)'
                ),
                'detail_view':
                re.compile(
                    'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp\?ycnam=(.*)&guige=(.*)&chandi=(.*)&market=(.*)'
                )
            })

    # Index 0 is a placeholder so the 1-based `market` field can be used
    # directly as a list index below.
    if not hasattr(process, '_sellerMarket_list'):
        setattr(process, '_sellerMarket_list',
                ['', u'亳州市场', u'安国市场', u'玉林市场', u'成都市场'])

    # http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex=0&pageSize=500
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('label : {}'.format(label))
        if label == 'list_view':
            get_logger(batch_id, today_str, '/opt/service/log/').info(label)
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('download ok')
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(len(content))
            list_item = json.loads(content)
            urls = []
            # Each data row becomes one quoted detail_view url.
            for detail_item in list_item[u'data']:
                detail_url_pattern = 'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp?ycnam={}&guige={}&chandi={}&market={}'
                ycnam = str(detail_item[u'ycnam'])
                chandi = str(detail_item[u'chandi'])
                market = str(detail_item[u'market'])
                guige = str(detail_item[u'guige'])
                detail_url = detail_url_pattern.format(urllib.quote(ycnam),
                                                       urllib.quote(guige),
                                                       urllib.quote(chandi),
                                                       urllib.quote(market))
                urls.append(detail_url)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('len urls')
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(len(urls))
            manager.put_urls_enqueue(batch_id, urls)

            total_num = int(list_item[u'total'])
            pageIndex = int(m.group(1))
            pageSize = int(m.group(2))
            # Only the first page fans out the remaining list pages, so they
            # are enqueued exactly once per batch.
            if pageIndex == 0:
                print(total_num // pageSize)
                for index in range(1, total_num // pageSize + 1):
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info('iiiiiindex')
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info(index)
                    list_pattern = 'http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex={}&pageSize={}'
                    list_url = list_pattern.format(index, pageSize)
                    manager.put_urls_enqueue(batch_id, [list_url])
            return True
        elif label == 'detail_view':
            get_logger(batch_id, today_str, '/opt/service/log/').info(label)
            # Recover the original (unquoted) query parameters from the url.
            ycnam = urllib.unquote(m.group(1))
            guige = urllib.unquote(m.group(2))
            chandi = urllib.unquote(m.group(3))
            market = urllib.unquote(m.group(4))
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(len(content))
            history_item = json.loads(content)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('downloaded')
            # date -> price mapping built from the daily history rows.
            price_history = {}
            for raw_daily_data in history_item[u'DayPriceData']:
                date = raw_daily_data[u'Date_time']
                price = raw_daily_data[u'DayCapilization']
                price_history[date] = price
            source_url = 'http://www.yt1998.com/priceHistory.html?keywords={}&guige={}&chandi={}&market={}'
            get_logger(batch_id, today_str, '/opt/service/log/').info('source')
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                len(process._sellerMarket_list))
            result_item = {
                'name': ycnam,
                'productGrade': guige,
                'productPlaceOfOrigin': chandi,
                'sellerMarket': process._sellerMarket_list[int(market)],
                'price_history': price_history,
                'source': source_url.format(ycnam, guige, chandi, market),
            }
            print(result_item)
            result_item['access_time'] = datetime.utcnow().isoformat(
            )  # each item corresponds to a distinct parameter tuple (see the source url above)
            return process._cache.post(url,
                                       json.dumps(result_item,
                                                  ensure_ascii=False),
                                       refresh=True)
示例#10
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl zysj.com.cn Chinese-medicine pages.

    'main' listing pages collect per-medicine links and enqueue them;
    'prd' detail pages are split per source book, parsed into
    【tag】-keyed properties, and posted to the cache as JSON.

    Returns True after enqueueing, False when the download is empty,
    or the cache.post() result for detail pages.
    """
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'main':
                re.compile(
                    r'http://www.zysj.com.cn/zhongyaocai/index__\d+.html'),
                'prd':
                re.compile(
                    r'http://www.zysj.com.cn/zhongyaocai/yaocai_\w/(.+?).html')
            })

    method, gap, js, timeout, data = parameter.split(':')
    # Deduct time already spent by other batches from the politeness gap.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout)
    if content == '':
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info(url + ' no content')
        return False

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        page = etree.HTML(content)
        if label == 'main':
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info("adding Chinese Meds")
            # Links for individual medicines on the listing page.
            meds = page.xpath("//*[@id=\"list\"]/ul/li/a/@href")
            meds = [urlparse.urljoin(SITE, med) for med in meds]
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('adding Meds urls into queue')
            manager.put_urls_enqueue(batch_id, meds)
            return True

        elif label == 'prd':
            med_name = page.xpath("//*[@id=\"article\"]/h1/text()")[0]
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(med_name + " main page")
            book_list = []
            dictionary = {}
            books = content.split('<hr />')  # <hr /> separates source books
            if len(books) == 2:  # only one source book
                books = [books[0]]
            else:  # several books: drop the header and footer fragments
                books = books[1:-1]
            tag_reg = re.compile(r'【.+?】')  # hoisted out of the loops
            for book in books:
                page = etree.HTML(
                    book.replace('<strong>',
                                 '').replace('</strong>', '').replace(
                                     '<sub>', '').replace('</sub>', ''))

                med_info = page.xpath("//p/text()")
                data = {}
                dictionary['source'] = url
                dictionary['access_time'] = datetime.utcnow().isoformat()
                data_list = []
                prop = None  # current 【...】 property; None until first tag seen
                for info in med_info:
                    m = tag_reg.match(info.encode('utf-8'))
                    if m:
                        prop = m.group(0)[3:-3]
                        cleaned = tag_reg.sub('', info.encode('utf-8'))
                        data[prop] = cleaned
                        data_list.append({prop: cleaned})
                    elif prop is None:
                        # BUG FIX: a leading paragraph without a 【...】 tag
                        # used to raise NameError/IndexError; skip it instead.
                        continue
                    else:
                        # Continuation line of the current property.
                        data[prop] += '\n' + info.encode('utf-8')
                        data_list[-1][prop] += '\n' + info.encode('utf-8')
                # NOTE(review): assumes every book carries a 摘录 entry —
                # confirm against real pages; a KeyError here aborts the item.
                book_name = data['摘录']
                # A list keeps the original ordering of the books.
                book_list.append({book_name: data_list})

            dictionary[data['药材名称']] = book_list
            dictionary = json.dumps(dictionary,
                                    encoding='utf-8',
                                    ensure_ascii=False)
            get_logger(
                batch_id, today_str,
                '/opt/service/log/').info('start posting prd page to cache')
            return process._cache.post(url, dictionary)
示例#11
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Crawl yt1998.com index pages (home / kind / history).

    A medicine's detail involves two parts: the price history and the
    sidebar.  The ytw/second/ url is the price-history endpoint (one big
    JSON); the 'history' handler derives the sidebar url from its query
    parameters and fetches the current price directly — no re-queueing —
    so everything can live in one result record.

    Returns True after a handled 'history' page, False when the sidebar
    table is malformed or no price row is found, None for 'home'/'kind'
    pages (follow-up urls are enqueued instead).
    """

    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper(None, headers, REGION_NAME))

    if not hasattr(process, '_regs'):
        setattr(
            process,
            '_regs',
            {
                'home':
                re.compile('http://www.yt1998.com/variteyIndexInfo.html'),
                'kind':
                re.compile('http://www.yt1998.com/issueIndexInfo.html\?code='),
                'history':
                re.compile(
                    'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp\?code=(\d+)&type=1&varitey_name=(.*)'
                )  # the price-history url
            })

    if not hasattr(process, '_next_patterns'):
        setattr(
            process,
            '_next_patterns',
            {
                'home':
                'http://www.yt1998.com/issueIndexInfo.html?code={}',  # the format of kind
                'kind':
                'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp?code={}&type=1&varitey_name={}',  # the format of history
                'history':
                'http://www.yt1998.com/variteyIndexInfo.html?varitey_code={}'  # the format of sidebar
            })

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue

        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('label : {}'.format(label))
        if label in [
                'home', 'kind'
        ]:  # home and kind pages share the same markup, one code block handles both
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            dom = lxml.html.fromstring(content)
            dd_labels = dom.xpath('//dd')
            urls = []

            for single_dd in dd_labels:
                rels = single_dd.xpath('.//@rel')
                if not rels:
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'wrong rels content : {}'.format(rels))
                    continue
                for rel in rels:
                    # On the home page rel looks like 'Z,家种类' (code 'Z');
                    # on a kind page it is 'Z,家种类,000001,枸杞' (code '000001').
                    code = rel.split(',')[-2]
                    if label == 'home':
                        urls.append(process._next_patterns[label].format(code))
                    else:  # label == 'kind'
                        name = str(rel.split(',')[-1])
                        urls.append(process._next_patterns[label].format(
                            code, urllib.quote(name)))

            manager.put_urls_enqueue(batch_id, urls)

        elif label == 'history':  # extract data for a single medicine
            # The queued url is still the history url (kept for backward
            # compatibility); re-derive the sidebar url from its parameters
            # and fetch today's price from there.
            code = m.group(1)
            name = urllib.unquote(m.group(2))
            sidebar_url = process._next_patterns[label].format(code)
            sidebar_content = process._downloader.downloader_wrapper(
                sidebar_url,
                batch_id,
                gap,
                timeout=timeout,
                encoding='utf-8',
                refresh=True)

            sidebar_dom = lxml.html.fromstring(sidebar_content)
            sidebar_label = sidebar_dom.xpath(
                '//div[@class="box-con-r fr"]/table//tr')
            if not isinstance(sidebar_label, list) or len(sidebar_label) != 19:
                get_logger(batch_id, today_str,
                           '/opt/service/log/').info('not legal list!')
                return False

            today_price = None  # BUG FIX: was unbound when no row matched
            for index in range(1, 16):
                # line content looks like 权重比:0.0278, 市净率:2.00, ...
                line_content = sidebar_label[index].xpath('./td/text()')
                # Chinese colon: left part is the key, right part the value.
                parts = line_content[0].split(':')

                if parts[0] == u'当前价格':
                    today_price = parts[1]
                    break

            if today_price is None:
                # Previously this fell through to a NameError; fail cleanly.
                get_logger(batch_id, today_str,
                           '/opt/service/log/').info('no price row found')
                return False

            result_item = {}
            result_item['today_price'] = today_price
            result_item['name'] = name
            result_item['url'] = sidebar_url

            return True  # to be switched to the new cache later
示例#12
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl yt1998.com price data (home / kind / history pages).

    A medicine's detail involves two parts: the price history and the
    sidebar.  The ytw/second/ url is the price-history endpoint (one big
    JSON); at the end one extra request fetches the sidebar so both are
    stored in the same result record instead of re-queueing.

    Returns the cache.post() result for history pages, False when the
    sidebar table is malformed or the history download is empty, None
    for home/kind pages (follow-up urls are enqueued instead).
    """

    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_regs'):
        setattr(
            process,
            '_regs',
            {
                'home':
                re.compile('http://www.yt1998.com/variteyIndexInfo.html'),
                'kind':
                re.compile('http://www.yt1998.com/issueIndexInfo.html\?code='),
                'history':
                re.compile(
                    'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp\?code=(\d+)&type=1&varitey_name=(.*)'
                )  # the price-history url
            })

    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_next_patterns'):
        setattr(
            process,
            '_next_patterns',
            {
                'home':
                'http://www.yt1998.com/issueIndexInfo.html?code={}',  # the format of kind
                'kind':
                'http://www.yt1998.com/ytw/second/indexMgr/getIndexInfo.jsp?code={}&type=1&varitey_name={}',  # the format of history
                'history':
                'http://www.yt1998.com/variteyIndexInfo.html?varitey_code={}'  # the format of sidebar
            })

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    # Deduct time already spent by other batches from the politeness gap.
    gap = max(gap - other_batch_process_time, 0)

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('label : {}'.format(label))
        if label in [
                'home', 'kind'
        ]:  # home and kind pages share the same markup, one code block handles both
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)

            dom = lxml.html.fromstring(content)
            dd_labels = dom.xpath('//dd')
            urls = []
            for single_dd in dd_labels:
                rels = single_dd.xpath('.//@rel')
                if not rels:
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'wrong rels content : {}'.format(rels))
                    continue
                for rel in rels:
                    code = rel.split(
                        ','
                    )[-2]  # on the home page rel is 'Z,家种类' (code 'Z'); on a kind page 'Z,家种类,000001,枸杞' (code '000001')
                    if label == 'home':
                        urls.append(process._next_patterns[label].format(code))
                    else:  # label == 'kind'
                        name = str(rel.split(',')[-1])
                        urls.append(process._next_patterns[label].format(
                            code, urllib.quote(name)))
            manager.put_urls_enqueue(batch_id, urls)

        elif label == 'history':  # extract data for a single medicine
            code = m.group(1)
            name = urllib.unquote(m.group(2))
            sidebar_url = process._next_patterns[label].format(code)
            sidebar_content = process._downloader.downloader_wrapper(
                sidebar_url,
                batch_id,
                gap,
                timeout=timeout,
                encoding='utf-8',
                refresh=True)

            sidebar_dom = lxml.html.fromstring(sidebar_content)
            sidebar_label = sidebar_dom.xpath(
                '//div[@class="box-con-r fr"]/table//tr')
            if not isinstance(sidebar_label, list) or len(sidebar_label) != 19:
                get_logger(batch_id, today_str,
                           '/opt/service/log/').info('not legal list!')
                return False

            sidebar_item = {}  # sidebar info
            for index in range(1, 16):
                line_content = sidebar_label[index].xpath(
                    './td/text()')  # line content looks like 权重比:0.0278, 市净率:2.00, ...
                parts = line_content[0].split(
                    ':')  # chinese colon: left part as key, right part as value
                sidebar_item[parts[0]] = parts[1]

            line_content = sidebar_label[16].xpath(
                './th/text()')  # the last-updated row uses th, unlike the others
            parts = line_content[0].split(':')
            sidebar_item[parts[0]] = parts[1]

            history_content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                timeout=timeout,
                encoding='utf-8',
                refresh=True)

            if history_content == '':
                return False
            get_logger(
                batch_id, today_str,
                '/opt/service/log/').info('history downloading finished')

            history_item = json.loads(history_content)[
                u'DayMonthData']  # extract the per-day data from the response
            price_history = {}  # price history keyed by date
            for raw_daily_data in history_item:
                date = raw_daily_data[u'Date_time']
                price = raw_daily_data[u'DayCapilization']
                price_history[date] = price

            result_item = {}
            result_item['name'] = name
            result_item['info'] = sidebar_item
            result_item['price_history'] = price_history
            result_item['source'] = sidebar_url
            return process._cache.post(url,
                                       json.dumps(result_item,
                                                  ensure_ascii=False),
                                       refresh=True)
示例#13
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl yt1998.com news columns.

    A bare numeric column-id url ('column_id') is expanded into paged
    query urls that are enqueued; each query-result page ('pages_view')
    is parsed into a list of news items and posted to the cache.
    """
    print(url)
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper(None, headers, REGION_NAME))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'column_id':
                re.compile('(\d+)'),
                'pages_view':
                re.compile(
                    'http://www.yt1998.com/ytw/second/marketMgr/query.jsp\?lmid=(\d+?)&(.*)'
                )
            })
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    # Deduct time already spent by other batches from the politeness gap.
    gap = max(gap - other_batch_process_time, 0)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        if label == 'column_id':
            query_url = 'http://www.yt1998.com/ytw/second/marketMgr/query.jsp'
            column_id = url
            page_size = 10
            data = {
                'lmid': column_id,  # column id ('lm' = 栏目): 9 = origin info, 1 = variety analysis, 3 = daily market news
                # 'scid':'1',  # for daily news scid selects one market; omitting it returns all markets, and the market field is still present in each item
                'pageIndex': '0',
                'pageSize': page_size,
                'times': '1',  # optional parameter
            }
            content = process._downloader.downloader_wrapper(query_url,
                                                             batch_id,
                                                             gap,
                                                             method='post',
                                                             data=data,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            news_info = json.loads(content)
            total = int(news_info[u'total'])  # total news count, used to generate sub-tasks
            url_pattern = 'http://www.yt1998.com/ytw/second/marketMgr/query.jsp?lmid={}&times=1&pageIndex={}&pageSize={}'
            urls = []
            # Python 2 integer division: pages 0 .. total // page_size.
            for index in range(0, total / page_size + 1):
                url = url_pattern.format(column_id, index, page_size)
                if not check_date_ok(url):
                    break
                urls.append(url)
            manager.put_urls_enqueue(batch_id, urls)

        elif label == 'pages_view':
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             method='get',
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            item = json.loads(content)
            news_data = item[u'data']
            menu_dic = {
                '1': u'品种分析',
                '3': u'天天行情',
                '9': u'产地信息',
            }
            result_list = []
            detail_pattern = 'http://www.yt1998.com/hqMinute--{}.html'
            for news in news_data:
                result = {
                    'news_title': news[u'title'],
                    'news_url': detail_pattern.format(news[u'acid']),
                    'news_desc': news[u'cont'].strip(),
                    'news_date': news[u'dtm'],
                    'news_keyword_list':
                    [news[u'ycnam']],  # ycnam = 药材 name; the naming logic is convoluted
                    # NOTE(review): sibling handlers in this codebase call
                    # datetime.now() directly; confirm this module imports the
                    # datetime *module* rather than the class, or this raises.
                    'access_time': datetime.datetime.utcnow().isoformat(),
                    'market': news[u'market'],
                }
                result['news_type'] = menu_dic[news[u'lmid']]
                if news[u'lmid'] == '3':  # daily market news are short bulletins; no body fetch needed
                    result['news_content'] = result['news_desc']
                else:  # other columns: fetch the article body
                    result['news_content'] = get_news_content(
                        result['news_url'], batch_id, gap, timeout)
                result_list.append(result)
            return process._cache.post(url,
                                       json.dumps(result_list,
                                                  ensure_ascii=False),
                                       refresh=True)
示例#14
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl the SFDA domestic-drug database.

    The home page is walked with paged POST requests to collect drug ids,
    each of which is enqueued as a detail url; detail pages are parsed
    into a flat item dict and posted to the cache.

    Returns None after paging the home page, False on an empty detail
    download, or the cache.post() result for detail pages.
    """
    home_page = 'http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=25&tableName=TABLE25&title=%B9%FA%B2%FA%D2%A9%C6%B7&bcId=124356560303886909015737447882'
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_reg'):
        setattr(
            process, '_reg', {
                'detail':
                re.compile(
                    'http://app1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id=(\d+)'
                ),
            })

    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    # Deduct time already spent by other batches from the politeness gap.
    gap = max(gap - other_batch_process_time, 0)

    today_str = datetime.now().strftime('%Y%m%d')

    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start download')

    # BUG FIX: the literal used to repeat 'State': '1' seven times; duplicate
    # keys in a dict literal silently collapse, so one entry is equivalent.
    data = {
        'tableId': '25',
        'State': '1',
        'bcId': '124356560303886909015737447882',
        'curstart': 1,  # page cursor, overwritten for every page below
        'tableName': 'TABLE25',
        'viewtitleName': 'COLUMN167',
        'viewsubTitleName': 'COLUMN166,COLUMN170,COLUMN821',
        'tableView': '%E5%9B%BD%E4%BA%A7%E8%8D%AF%E5%93%81',
    }

    if url == home_page:
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('start parsing url')
        page = 1
        while 1:
            data['curstart'] = page
            content = process._downloader.downloader_wrapper(
                'http://app1.sfda.gov.cn/datasearch/face3/search.jsp',
                batch_id,
                gap,
                method='post',
                timeout=timeout,
                refresh=True,
                data=data)
            # No ids on the page means we walked past the last page.
            ids = re.findall(u'国产药品&Id=(\d+)', content)
            if not ids:
                break
            url_pattern = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id={}'
            urls = []
            for drug_id in ids:
                url = url_pattern.format(drug_id)
                urls.append(url)
            manager.put_urls_enqueue(batch_id, urls)
            page += 1
            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info(
                    'going to page{}'.format(page))

        return

    elif process._reg['detail'].match(url):

        content = process._downloader.downloader_wrapper(
            url,
            batch_id,
            gap,
            timeout=timeout,
        )
        if content == '':
            return False
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('start parsing url')
        dom = lxml.html.fromstring(content)
        table = dom.xpath('//tr')

        # Each value is a (possibly empty) list of text nodes; flattened below.
        item = {
            'license_number':
            table[1].xpath('./td')[1].xpath('./text()'),  #[u'批准文号'],
            'product_name_chs':
            table[2].xpath('./td')[1].xpath('./text()'),  #[u'产品名称'],
            'product_name_eng':
            table[3].xpath('./td')[1].xpath('./text()'),  #[u'英文名称'],
            'commodity_name_chs':
            table[4].xpath('./td')[1].xpath('./text()'),  #[u'商品名'],
            'drug_form':
            table[5].xpath('./td')[1].xpath('./text()'),  #[u'剂型'],
            'specification':
            table[6].xpath('./td')[1].xpath('./text()'),  #[u'规格'],
            'manufacturer_chs':
            table[7].xpath('./td')[1].xpath('./text()'),  #[u'生产单位'],
            'manuf_address_chs':
            table[8].xpath('./td')[1].xpath('./text()'),  #[u'生产地址'],
            'category':
            table[9].xpath('./td')[1].xpath('./text()'),  #[u'产品类别'],
            'license_data':
            table[11].xpath('./td')[1].xpath('./text()'),  #[u'批准日期'],
            'standard_code':
            table[12].xpath('./td')[1].xpath('./text()'),  #[u'药品本位码'],
            'standard_code_remark':
            table[13].xpath('./td')[1].xpath('./text()'),  #[u'药品本位码备注'],
            'source': [url],
        }
        for k, v in item.iteritems():
            # Flatten single-element text lists; missing cells become None.
            if len(v) > 0:
                item[k] = v[0]
            else:
                item[k] = None

        return process._cache.post(url, json.dumps(item, ensure_ascii=False))
示例#15
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    """Crawl kmzyw.com.cn price-index pages.

    The 'main' search page is paged through with POST requests and each
    drug's price-history url is enqueued; a 'prd' history page is parsed
    (from an embedded Highcharts series) into per-spec price records,
    posted to the cache as one JSON line per spec.
    """
    if not hasattr(process, '_downloader'):
        domain_name =  Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'main': re.compile(r'http://www.kmzyw.com.cn/bzjsp/biz_price_search/price_index_search.jsp'),
            'prd': re.compile(r'http://www.kmzyw.com.cn/bzjsp/Biz_price_history/price_history_search.jsp\?name=(.*?)')
        })

    def timestamp2datetime(timestamp):
        # Convert a unix timestamp (seconds) into a zero-padded 'YYYY-MM' string.
        if isinstance(timestamp, (int, long, float)):
            dt = datetime.utcfromtimestamp(timestamp)
        else:
            return "Not a valid timestamp"
        mid = '-0' if dt.month < 10 else '-'
        return str(dt.year) + mid + str(dt.month) 

    post_form = {
                'pagecode': None,  # page number, filled in per request below
                # 'search_site': '%25E4%25BA%25B3%25E5%25B7%259E',

    }

    method, gap, js, timeout, data = parameter.split(':')
    # Deduct time already spent by other batches from the politeness gap.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')

    get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url')
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue

        if label == 'main':
            total_page = 10 # start with a small guess; updated from the first response below
            page_num = 1
            while page_num < total_page + 1:

                post_form['pagecode'] = page_num 
                content = process._downloader.downloader_wrapper(url,
                batch_id,
                gap,
                method='post',
                data=post_form,
                timeout=timeout,
                refresh=True
                )
                data = json.loads(content)
                total_page = data['page']   # total page count read from the json
                drugs = []
                drug_url = 'http://www.kmzyw.com.cn/bzjsp/Biz_price_history/price_history_search.jsp?name={}'
                for row in data['rows']:
                    drugs.append(drug_url.format(urllib.quote(str(row['drug_name'])).replace('%', '%25')))
                manager.put_urls_enqueue(batch_id, drugs)
                page_num += 1

            return True

        elif label == 'prd':

            content = process._downloader.downloader_wrapper(
            url,
            batch_id,
            gap,
            timeout=timeout,
            refresh=True
            )
            page = etree.HTML(content)
            prd = page.xpath("/html/body/section[2]/h1/text()")[0]
            idx = prd.index(u'品种')
            prd = prd[:idx]
            get_logger(batch_id, today_str, '/opt/service/log/').info(prd + " main page")
            price_hist  = page.xpath("/html/head/script[12]/text()")[0]
            data_pat = re.compile(r'series : \[(.*),marker')
            m = data_pat.findall(price_hist)
            dics = ''
            if m:
                # Rebinding 'data' below is safe: the for loop holds its own
                # iterator over this original list object.
                data = m[0].split(',marker : { enabled : false ,radius : 3 } ,tooltip : { valueDecimals : 2 }},')
                for d in data:
                    # 'name' and 'data' must be bound locally: the chart snippet
                    # uses them as bare (unquoted) keys inside the eval'd literal.
                    name = 'name'
                    data = 'data'
                    # NOTE(review): eval() on downloaded page content is a code
                    # injection risk; consider a proper parser instead.
                    dic = eval(d + '}')
                    cleaned = {}
                    cleaned['source'] = url
                    cleaned['specs'] = dic['name']
                    cleaned['name'] = prd
                    # Chart timestamps are in milliseconds; convert to 'YYYY-MM'.
                    cleaned['data'] = [ (timestamp2datetime(int(price[0]) // 1000), price[1]) for price in dic['data'] ]
                    cleaned['access_time'] = datetime.utcnow().isoformat()
                    dics += json.dumps(cleaned, encoding='utf-8') + '\n'


            else:
                get_logger(batch_id, today_str, '/opt/service/log/').info('not match')

            get_logger(batch_id, today_str, '/opt/service/log/').info('start posting prd page to cache')
            return process._cache.post(url, dics)
示例#16
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl Chinese-herb price data from zyctd.com.

    ``url`` is either a single capital letter A-Z (list every herb whose
    name starts with that letter and enqueue their MBIDs) or a numeric
    MBID string (fetch every spec/origin variant of that herb and post
    its price history to the cache).

    Returns True on success, False when any HTTP request fails.
    """
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))
    # Lazily created singletons are cached as attributes on the function
    # object itself so repeated calls reuse them.
    if not hasattr(process, '_downloader'):
        headers = {}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_regs'):
        setattr(process, '_regs', {
            'first_letter': re.compile('^[A-Z]$'),
            'drug': re.compile('(\d+)')
        })

    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # ``parameter`` packs five crawl settings separated by colons.
    method, gap, js, timeout, data = parameter.split(':')
    gap = float(gap)
    timeout = int(timeout)
    # Discount time already spent on other batches, never below zero.
    gap = max(gap - other_batch_process_time, 0)

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        if label == 'first_letter':
            first_letter = url
            data = {
                'Data':
                '{{\"url\": \"\", \"letter\": \"{}\"}}'.format(
                    first_letter)  # form-data example: Data:{"letter":"J","url":""}
            }

            query_url = 'http://yaocai.zyctd.com/Ajax/AjaxHandle.ashx?CommandName=common/MCodexService/GetCodexNameByLetter'
            content = process._downloader.downloader_wrapper(query_url,
                                                             batch_id,
                                                             gap,
                                                             method='post',
                                                             data=data,
                                                             timeout=timeout,
                                                             refresh=True)
            if not content:
                return False

            drug_list = json.loads(content)

            MBID_list = []
            for drug in drug_list[u'Data']:
                MBID_list.append(str(drug[u'MBID']))
            manager.put_urls_enqueue(batch_id, MBID_list)  # each MBID is a plain number

        elif label == 'drug':
            mbid = url
            query_url = 'http://www.zyctd.com/Breeds/GetMCodexPoolListByMBID'
            data = {'mbid': '{}'.format(mbid), 'IsMarket': 'true'}
            content = process._downloader.downloader_wrapper(query_url,
                                                             batch_id,
                                                             gap,
                                                             method='post',
                                                             data=data,
                                                             timeout=timeout,
                                                             refresh=True)
            if not content:
                return False

            item = json.loads(content)
            sub_drug_list = item[u'Data']
            if not sub_drug_list:  # request succeeded but list is empty: this herb has no price data, which is normal
                return True

            for sub_drug in sub_drug_list:  # e.g. one herb is split into sub_drugs by spec and origin, each with its own MBSID
                price_history_url = 'http://www.zyctd.com/Breeds/GetPriceTrend'
                data = {'MBSID': sub_drug['MBSID'], 'IsMarket': 'true'}
                price_content = process._downloader.downloader_wrapper(
                    price_history_url,
                    batch_id,
                    gap,
                    method='post',
                    data=data,
                    timeout=timeout,
                    refresh=True)
                if not price_content:
                    return False

                spec_info = sub_drug['MSpec'].split(' ')
                productGrade = spec_info[0]
                if len(spec_info) == 2:
                    productPlaceOfOrigin = spec_info[
                        1]  # typical MSpec: "<grade> <origin>"
                else:  # special MSpec with the grade only
                    productPlaceOfOrigin = ''
                price_item = json.loads(price_content)[u'Data']
                price_data = price_item[
                    u'PriceChartData']  # NOTE: a string that must be loads()-ed again into a list: one entry per market, with nested price lists whose dates are timestamps
                if price_data == '[]':  # even an existing spec may lack any price history
                    return True
                formatted_price_data = deal_with_price(price_data)
                result_item = {
                    'name': sub_drug['MName'],
                    'productGrade': productGrade,
                    'productPlaceOfOrigin': productPlaceOfOrigin,
                    'source':
                    'http://www.zyctd.com/jiage/xq{}.html'.format(mbid),
                    'access_time': datetime.utcnow().isoformat(),
                    'price_data': formatted_price_data
                }
                if not process._cache.post(str(sub_drug['MBSID']),
                                           json.dumps(result_item,
                                                      ensure_ascii=False),
                                           refresh=True):
                    return False
            return True
示例#17
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl grain prices from datacenter.cngrain.com.

    Two URL kinds are handled:
      * 'home' (NewPrice.aspx): an ASP.NET WebForms listing paged via
        __doPostBack; every page's market links are enqueued.
      * 'market' (PriceMainMark.aspx?MarketId=...): parsed into a product
        list (each product with its price history fetched from an XML
        endpoint) and posted to the cache.

    Returns True/False for the home page, or the cache-post result for
    a market page.
    """
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))

    # Singletons cached on the function object across calls.
    if not hasattr(process, '_downloader'):
        headers = {
            'Cookie':
            'AJSTAT_ok_times=1; ant_stream_5762b612883d9=1470748235/1519574204; ASP.NET_SessionId=rpdjsrnmq3ybp0f4cnbdewm1; __utmt=1; bow_stream_5762b612883d9=13; __utma=240343830.1666180114.1470705813.1470719553.1470752966.3; __utmb=240343830.6.10.1470752966; __utmc=240343830; __utmz=240343830.1470705813.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
            'Host': 'datacenter.cngrain.com',
        }
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'home':
                re.compile('http://datacenter.cngrain.com/NewPrice.aspx'),
                'market':
                re.compile(
                    'http://datacenter.cngrain.com/PriceMainMark.aspx\?MarketId=(.*)'
                )
            })

    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_pattern'):
        # URL templates: market pages and the price-history XML endpoint.
        setattr(
            process, '_pattern', {
                'market':
                'http://datacenter.cngrain.com/PriceMainMark.aspx',
                'history':
                'http://datacenter.cngrain.com/amline/PriceMarkXml.aspx{}'
            })

    # ``parameter`` packs five crawl settings separated by colons.
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        if label == 'home':
            data = {
                #'__EVENTTARGET':,  # its last two characters encode the page number; must not exceed 11
                '__EVENTARGUMENT': '',
                '__ZIPSTATE':
                'H4sIAAAAAAAEAOy9B2AcSZYlJi9tynt/SvVK1+B0oQiAYBMk2JBAEOzBiM3mkuwdaUcjKasqgcplVmVdZhZAzO2dvPfee++999577733ujudTif33/8/XGZkAWz2zkrayZ4hgKrIHz9+fB8/Iv7vX/M3/e1+7V/34bQtd3Z+t5Nq2ebL9mWZTfNvV+Usr3d/t+9+dfL7H9dtMS3z50XTHi+z8ropmt3f7fOffJq12W96Mft1dze/fbL8vM6K5dnyvLJvPf7k1/iNfs3f4Nf6NW982eu6//Jv+uv+2IPd+3s7Bwc7u7/p7Lf5tc7xz6/1a9O/vx79S81+7Nf8NX7T3+bX+QW/AV77vfLr5rf5NX7Bb/j7/65nbb44qdbL9tf6DWe/zW8jL/2avyka/jpv8nftr/vr/Rd/y9/6n//xf+Yv+A1fZJfFRdbmX9Xlr/s73D1bzvJ3L+vqslhO83HWrN79Hi/PZp/t7sxm9LoF8rv8Wr/ur/ef/81/83/2d/15v8uvPfzW3i699WuFb/0Xf/Nf+F/9sX/iprd279Nbv3anr7/hj/7P/4G/n9767Qffopd+nU5Xf/ef8Z//cX/mxq726K1ft/MW0YXfGuzqIb3064Uv/Vf/wN//n/8Nf/DGrh7QW79+Z1R/0h/9X/wFmxHcp7d+g85bf+6f+5//nX/+RrqDgj/WJ8Yfv7mve/RWErz1Y//13/sn/9f/wJ9Nc7bxxV2m4m/62/xav+DX/8miKSZlPgeUXx8f0Yu/8d3n1QXJxhm/osT7rbiTX4s6+R3/8z/pD/vP/9Q/+L/8K//4/+LP+4P+iz/jb/ov/pi//D/7e//O/+zv+oP+q7/yj/I4wXL67/Jr/Da/xu/ya/5av8vstzny2NpM3Q1j/BEf/4iPN/Lxb9jp7m+/6a17YKnfKHzrv/zT/qz/6g/5QzfRHvT4jTv0+OP+zP/s7/nrNr2EWf5Nwpf+6z/yj//P/54/YiN+GNVv2sHvr/gT//O/9+/Z1BVo+Jt18Pu7/0GRk8GXMF2/ee+l/+ov/wc3vfQpvfRbdLjwb/vb//O/8y/Y9BKY8LfsCyT3NDy/6Oq36gn/DW/tgeq/dYfqf/afftNbIOBvE771n/1df8x/8bf/nRvfAlv8th0M/7K/+r/42/7yjW9hXL9dh4R/+T/4X/2ZG9XTHmj423cw/Hv+5BuEaw9K43foUOPP/1NuGhdk5HfsqsI/+D//yzYrUPT1O4WC/J//EX/4f/Wn/Nn/+Z/4l2188YBe/J27gvx3ytAGeQov/YLOS3/G3/Rf/hkbpWsPb7Ht+11+nTlrRjFqv3XUqLE6u6HBbwhL9xvC0v0Gaul+q1/wYy+zi1y8uf1f8Fv//r8rXL3X1bqe5s7N++N+g5k4h1tsqn6r0Db+rkS4//rv/cP+8z/hH/zP/4g/7z/7e/5K6vq/+tv++v/6T/uD/vM/6Y/6z/7+P++//Mv+ARrlL7z7si6m+RfkyH6R1W9llPgtb2mo/8Wf/1f9F3/2n0pqdAYTKqP4bQnmf/kn/NH/5d/8t6bBM4O9lCbE9n/0f/n3/JUz2EL55NfffbCzM94BTzhL1/ni19WPfuy/+NP+DkJUSPPr6Ye/zt4OG1dD7B+zg5vByihq9sMeamYOEoLz6fbOwfbOQ0bvh0W13/y//LP//v/8b/oT/+u/56/Y+i/+7L/hv/6D/pw7Ab3+nj+uQ6+9nf04vfZ2HtyWXtAR3yy9fp0+vX478cD+8z/8jya9SpD+y7/lb/iv/9K/8D//m/8gIlO6kUzl/Kc9Av3Yf/G3/N3/1Z/45/2Xf/Pfv5mR9vcGGMl98V6E+Y18/D3aBOO6FXF+3Z894vz2wjr/5d/yR4cTFUjdp//lH/13/+d/1N9BXjSRrNtu+Am47uGnA1xnv3DEpd7+xD/5h0XcX+9nj7i/w3/xZ/85/+Xf/Pf9l3/V3/tf/U1/Z482G1hx79MBVtz7dO+2rOj
rtG+OWr/+zx61flul1gb1v0+M+F/+9T1W3fz4hL330NCvQ1j3xc8RG/4GP4uEFe1HarBHGEtYip/eS7bx+IT99OEAx3560FeeP1TC/tjPHmFvML19sd45GDK95osbxfpnh0hJn0i/haQy/vN/4O+k8Om//Cv+7v/8L/wTiDy/+0byXGXLi+tieeHR6Nf7L/7OP4isi0eY30hoZT935NkdEE73hSXPf/5H/j3/+Z/wh2wmz68nuHuE+W3kk5CNN5DlN/xZI8utfN2e77Z7MGBF3Rc3MpBvFz6cQr9Rn0K/QLjvv/4j/+T/8m/5U0g2/rO/68/5L/68P/q//rP/pP/8D//r/vM/8e8iav34Rmpd/zTly6dxMfvP/r4/67/62/4qX8z2/+s/9E+mb+59+mkX5w1PwHX7Q7bWfnEjTWNC+ff+HRRpR4SSP+8gE6ftb/xzTdvkP/+7/qj/4i/6B3b2tpH28Sg2FEHYL36OKPab/FxT7Deg4Ju+4STK/wfo9Zv+cOkVNZJ/UNdI3huKFOwXP0fU+s361Ppd//O/7y/6z/+Gv+M//xP/0v/8j/iDKOdKngS5WeLg/Od/1x/yn/95yIt+tJFgP/jpSbny6PV1LcMAj0UsA2zn3/On3pQFMQPzSPbb2g97qMUp9pv/cCgGe0yQNgUO/8Vf9EcSq4Xf3/T49L2/M2Al3Bc3+reB5RWcP4S4v8UPhbi/+X/+l/2V/9Xf/EdswUH+U/9g0gY3pJf2H94fyKLYL37YhPot+4T6LWSV6b/88/7g//pP/wf/iz/rLyfiEGV+t82UKd/Nd/fuebT5rf+rP+Zv/i//tL/wP/t7/+AOHsPq7TfY3R/gJP8bS6H/+s/9O/7zP+1vuMmJ47F4FPptZFAheQYIdDCTZYyfFQLdRjIR0n9Q5Dkkme6LHzI5f+ufNXL+l3/PX0KmlMSxh8n/xzgOru2vQ4jITD08eDD69OEBcOYPfu39+ztmmZHXvM/tP78Wr93/5r82VjR+wW94XJbVFS1nUGw1/wW/AdY1Xhc/yH+tX/MX/IayrPGEFjNmF7Tq8RvMfs3f7Lf5NfDfb/jb/Dq/4Dc4aZqTMmuaX/c3yuq2mJb5718WTfsLfp3f/3d9/eTX+rWkK5pBYMMLMVjk//XcjDIN9ncf3jvY0S/dTH8Cn+nv/Tv/iz/9b/vP/v4/9b/44/7g//Kv+ltTuBl/3J/5X/xdf6PYdfr6v/o7/tj//O/7m4kF7sta0LGg8ZNFfiVc8AYrT7+Q/pFufuExL3ztPKT/7u3AG/+1gl5B3Xvbu3vbe/cE419rCOPdPsb/5T/wJxBL/dd/3h/5X/3xf/F/9Tf9oYR6+p//uX8/ZfCROP/D/pD/6m//m/7zP+Yf/C//jL/lZozvWYx33xvjX3sA4wcHm2n8X/69fxtorJqGCUwy8p//3X/4f/HH/HX/+R//R96Wxg8Obo3x7qeC8a8zhPHDHsZjIigt+RCZdcnnL//7/6u//69LhcBimckR/S//3r9eZuG2ZH7w8L2R/nWHkP60T+aApn/uX/Bf/Dl/Rkq5bPwhGAuZ/9a/5D//Y/6qW5P501tjvKOM8esNYfygjzGR+b/+s/9wIjOxB1bI/46/KmBlYpT/7O/66ymeuZkxHI0fvDfGv/4Qxvt9jIWV/+u/9s/6z/7+P54Y+r/8k/6IgJXB5n/R3/ef/X3QI7em8f4tMd7d3j0QjH+DIYzv9zAek7qg9eX//E/+Y/7zP+qv/s//sj9H/MOAzL4quTWZ77830mJa+Fe4eKS1jWn4dX6XH+tp/d8lCbX97/IbDmn6Xye03R/9F3/eX/ef/91/23/+9/wV5AP/l3/aX0VjJFH+r//IP3Hv/t7Bf/Z3/ZH/+Z8E7n88b9vVo7t3r6v1D+bFeLq8qMmsj6fV4u68XZR3dfTNXdhBSvc+vLu7v3//3qdjfBsQObSWD5UOHaQ+Pvgv/rw/anfvv/gz/3IQ/m+gtbm/7b/4M/+O//yv/7P+8z/kr5IZ+S//2r9gPB5/bcz2Ht4Gs1+7j9nWA8Lsv/ij/mTgxYj8p3/QH2wJ91/9g38ugpM/8Y/7z/+Sv+Dro3b/Nqj9On3U7vznf9zf85//hX/hf/UX/3H/xR/6h/+Pf9+fAxo+BAn/7n/wv/rL/8H//M/7m+h7cuk/hGwPdm+D26/7c4Pbzm1w+/XeCzfoqm8Ct09vxW6//s8Nbge3we03eC/cvqk5/fTBbXDzNOVvLb/9Nps861/vd/n157/Lb/Br/Zp44wYNOvt/AgAA//+cf8MQDy8AAA==',
                '__VIEWSTATE': '',
                #'__EVENTVALIDATION':,
                'ctl00$txtTitle': '',
            }

            content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                method='get',
                timeout=timeout,
                refresh=True,
                encoding='utf-8'
            )  # first page uses GET to obtain __ZIPSTATE; later pages are turned inside the loop via __EVENTTARGET while refreshing __ZIPSTATE
            if content == '':
                return False
            while 1:
                dom = lxml.html.fromstring(content)  # parse the current page
                market_suffixes = dom.xpath(
                    '//a[contains(@href,"MarketId")]/@href')
                if not market_suffixes:
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info('No market_suffixes')
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info(content)
                    return False

                market_suffixes_set = set(
                    market_suffixes)  # dedupe; ineffective when a market name spills onto the next page, so it may be crawled once more
                market_url_list = [
                    urlparse.urljoin(process._pattern['market'], suffix)
                    for suffix in market_suffixes_set
                ]
                manager.put_urls_enqueue(
                    batch_id, market_url_list)  # this page is done, market URLs enqueued; everything below only handles pagination

                page_label = dom.xpath('//td[@colspan="10"]//span')[
                    0]  # among all page numbers only the current one is a <span>; locate it
                page = page_label.xpath('.//text()')
                if page == ['40']:
                    return True
                next_sibling_list = (
                    page_label.xpath('./following-sibling::a')
                )  # locate the next page; stop when absent (the '...' button also counts as a next page under this check)
                if not next_sibling_list:
                    return True

                next_sibling = next_sibling_list[0]
                next_raw_js = next_sibling.xpath(
                    './@href'
                )[0]  # of the form: "javascript:__doPostBack('ctl00$ContentPlaceHolder1$DG_FullLatestPrice$ctl24$ctl01','')"
                eventtarget = re.findall('\(\'(.*)\',', next_raw_js)[0]
                data['__EVENTTARGET'] = eventtarget

                last = data[
                    '__ZIPSTATE']  # keep the previous __ZIPSTATE; if the fresh one fails, fall back to it -- in practice a page's token occasionally fails for the next page
                data['__ZIPSTATE'] = (
                    dom.xpath('//input[@name="__ZIPSTATE"]/@value'))[0]
                data['__EVENTVALIDATION'] = (dom.xpath(
                    '//input[@name="__EVENTVALIDATION"]/@value'))[0]  # refresh the postback tokens

                for _ in range(0, 3):  # request the next page (up to 3 tries); most failures happen here, so be careful
                    try:
                        content = process._downloader.downloader_wrapper(
                            url,
                            batch_id,
                            gap,
                            method='post',
                            timeout=timeout,
                            refresh=True,
                            data=data,
                            encoding='utf-8')

                        if content == '' or 'sent a request that this server could not understand' in content or 'bad request' in content:
                            get_logger(
                                batch_id, today_str,
                                '/opt/service/log/').info('change ZIPSTATE')
                            get_logger(
                                batch_id, today_str,
                                '/opt/service/log/').info('change ZIPSTATE')
                            data[
                                '__ZIPSTATE'] = last  # fall back to the previous token; two consecutive failures are not handled (never observed while debugging)
                            continue
                    except Exception, e:
                        get_logger(batch_id, today_str,
                                   '/opt/service/log/').info(e)
                        continue
                    break
                else:
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info('failed 3 times')
                    return False

        elif label == 'market':  # market page: parse products and their price history into the result
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('in market page')
            market_id = url[url.find('=') + 1:]
            url = url.replace(market_id, urllib.quote(market_id))
            content = process._downloader.downloader_wrapper(
                url,
                batch_id,
                gap,
                timeout=timeout,
                refresh=True,
                encoding='utf-8',
                redirect_check=False)

            dom = lxml.html.fromstring(content)
            title = dom.xpath('//a[@class="this_tab"]//text()')
            if title:
                title = title[0]
            result = {}
            result['market'] = title.strip()
            result['product_list'] = []

            table_node = dom.xpath('//table[@class="data_table"]')[0]  # the product table
            products_nodes = table_node.xpath('.//tr')[1:-1]  # drop the header and the trailing row

            newest_time = None
            for product_node in products_nodes:  # the page repeats products across report dates; keep only the newest batch
                report_time = product_node.xpath('./td[9]/text()')
                if not newest_time:
                    newest_time = report_time
                if newest_time != report_time:
                    break

                relative_path = product_node.xpath('./td[10]/a/@href')[0]
                history_url = process._pattern['history'].format(relative_path)

                get_logger(batch_id, today_str, '/opt/service/log/').info(
                    'The history_url is :{}'.format(history_url))
                content = process._downloader.downloader_wrapper(
                    history_url,
                    batch_id,
                    gap,
                    timeout=timeout,
                    refresh=True,
                    encoding='utf-8')

                if content:  # some price histories only say "data still being prepared"
                    dom_history = lxml.html.fromstring(content)
                    date_list = dom_history.xpath('//series//value/text()')
                    price_list = dom_history.xpath('//graph//value/text()')
                    history_dic = dict(zip(date_list, price_list))
                else:
                    history_dic = {}

                product_item = {
                    'variety':
                    product_node.xpath('./td[1]/text()')[0].strip(),
                    'level':
                    product_node.xpath('./td[2]/text()')[0].strip(),
                    'price_type':
                    product_node.xpath('./td[5]/text()')[0].strip(),
                    'produce_year':
                    product_node.xpath('./td[6]/text()')[0].strip(),
                    'produce_area':
                    product_node.xpath('./td[7]/text()')[0].strip(),
                    'deliver_area':
                    product_node.xpath('./td[8]/text()')[0].strip(),
                    'source':
                    'http://datacenter.cngrain.com{}'.format(relative_path),
                    'access_time':
                    datetime.utcnow().isoformat(),
                    'price_history':
                    history_dic,
                }
                result['product_list'].append(product_item)

            result['market_source'] = url
            # print (json.dumps(result, ensure_ascii = False))
            return process._cache.post(url,
                                       json.dumps(result, ensure_ascii=False))
示例#18
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl SFDA GMP-certification records (qy1.sfda.gov.cn).

    When ``url`` is the listing home page, every result page is walked
    via POST requests and the matching detail URLs are enqueued.  When
    ``url`` is a detail page, its two-column table is parsed into a
    key/value dict and posted to the cache.

    Returns True after enumerating the listing, False on a failed
    download, or the cache-post result for a detail page.
    """
    home_page = 'http://qy1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=23&tableName=TABLE23&title=GMP%C8%CF%D6%A4&bcId=118715589530474392063703010776'
    # Singletons cached on the function object across calls.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_reg'):
        setattr(
            process, '_reg', {
                'detail':
                re.compile(
                    'http://qy1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=23&tableName=TABLE23&tableView=GMP%C8%CF%D6%A4&Id=(\d+)'
                ),
            })

    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # ``parameter`` packs five crawl settings separated by colons.
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    gap = max(gap - other_batch_process_time, 0)

    today_str = datetime.now().strftime('%Y%m%d')

    # NOTE: the original literal repeated 'State': '1' five times; duplicate
    # keys in a dict literal collapse, so a single entry is equivalent.
    data = {
        'tableId': '23',
        'State': '1',
        'bcId': '118715589530474392063703010776',
        'curstart': '4',  # placeholder; overwritten with the page number below
        'tableName': 'TABLE23',
        'viewtitleName': 'COLUMN152',
        'viewsubTitleName': 'COLUMN151',
        'tableView': 'GMP%E8%AE%A4%E8%AF%81',
    }
    if url == home_page:
        page = 1
        while 1:
            data['curstart'] = page
            content = process._downloader.downloader_wrapper(
                'http://qy1.sfda.gov.cn/datasearch/face3/search.jsp',
                batch_id,
                gap,
                method='post',
                timeout=timeout,
                refresh=True,
                data=data)
            ids = re.findall(u'GMP认证&Id=(\d+)', content)
            if not ids:  # no hits on this page: past the last page
                break
            url_pattern = 'http://qy1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=23&tableName=TABLE23&tableView=GMP%C8%CF%D6%A4&Id={}'
            urls = [url_pattern.format(drug_id) for drug_id in ids]
            manager.put_urls_enqueue(batch_id, urls)
            page += 1
        return True
    elif process._reg['detail'].match(url):
        content = process._downloader.downloader_wrapper(
            url,
            batch_id,
            gap,
            timeout=timeout,
        )
        if content == '':
            return False
        dom = lxml.html.fromstring(content)

        item = {'source': url, 'access_time': datetime.utcnow().isoformat()}
        tr_labels = dom.xpath('//tr')
        for tr_label in tr_labels[1:]:  # skip the header row
            key = ''.join(tr_label.xpath('.//td[1]//text()')).strip()
            value = ''.join(tr_label.xpath('.//td[2]//text()')).strip()
            if value and key != u'注':  # drop empty values and the remark row
                item[key] = value
        return process._cache.post(url, json.dumps(item, ensure_ascii=False))
示例#19
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Crawl the CN-DBpedia HTTP API.

    An 'entity' URL expands a mention into follow-up AVP/information/tag
    API URLs and returns them as a list to the caller (note: ``manager``
    is not used here).  Any other matched URL's JSON payload is posted
    to the cache keyed by the decoded entity name.
    """
    # Singletons cached on the function object across calls.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper(None, headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CacheS3(head + '-json-' + tail))

    if not hasattr(process, '_regs'):
        # One regex per API endpoint; group(1) is the URL-quoted mention.
        setattr(
            process, '_regs', {
                'entity':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entity\?mention=(.+)')),
                'avp':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entityAVP\?entity=(.+)')),
                'info':
                re.compile(
                    urlparse.urljoin(
                        SITE, 'cndbpedia/api/entityInformation\?entity=(.+)')),
                'tags':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entityTag\?entity=(.+)')),
            })

    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     0,
                                                     timeout=10,
                                                     encoding='utf-8')

    if content == '':
        return False

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue

        entity = urllib.unquote(m.group(1))
        if label == 'entity':
            # Fan out: one AVP, one information and one tag URL per entity.
            urls = []
            avpair_api = urlparse.urljoin(SITE,
                                          'cndbpedia/api/entityAVP?entity={}')
            info_api = urlparse.urljoin(
                SITE, 'cndbpedia/api/entityInformation?entity={}')
            tags_api = urlparse.urljoin(SITE,
                                        'cndbpedia/api/entityTag?entity={}')
            js = json.loads(content)
            for ent in js[u'entity']:
                if isinstance(ent, unicode):
                    ent = ent.encode('utf-8')
                ent = urllib.quote(ent)
                urls.append(avpair_api.format(ent))
                urls.append(info_api.format(ent))
                urls.append(tags_api.format(ent))

            return urls
        else:
            # avp/info/tags: store the payload keyed by the entity name.
            data = json.dumps({entity: json.loads(content)})
            return process._cache.post(url, data)
示例#20
0
class Scheduler(object):
    """Fetch Baidu Zhidao questions and answers for a query word and
    cache the parsed JSON results.
    """

    def __init__(self, cacheserver):
        # JSON result cache plus a downloader pinned to zhidao.baidu.com.
        self.cache = Cache(BATCH_ID['json'], cacheserver)
        self.downloader = DownloadWrapper(cacheserver,
                                          {'Host': 'zhidao.baidu.com'})

    @classmethod
    def instance(cls, *args):
        """Return the class-wide singleton, creating it on first use."""
        if not hasattr(cls, '_instance'):
            setattr(cls, '_instance', cls(*args))
        return cls._instance

    def zhidao_results(self, qids, gap, timeout=10):
        """Fetch each question plus its top-3 answers; failures are
        silently skipped.  Returns the list of question JSON dicts."""
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid, gap, timeout)
            if q_json is False:
                continue
            q_json['list_answers'] = []

            for rid in q_json['answer_ids'][:3]:  # top 3 answers only
                a_json = self.zhidao_answer(qid, rid, gap, timeout)
                if a_json is False:
                    continue
                q_json['list_answers'].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid, gap, timeout):
        """Download and parse one question page.

        Returns the question JSON dict, or False when the download or
        parse fails.  The parsed result is also cached best-effort.
        """
        question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid)
        ret = self.downloader.downloader_wrapper(question_url,
                                                 BATCH_ID['question'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 error_check=True)
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None:
            return False
        # Best-effort cache write; the result is deliberately ignored.
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid, gap, timeout):
        """Download and parse one answer via the mini API.

        Returns the answer JSON dict, or False on download/parse
        failure.  The parsed result is also cached best-effort.
        """
        answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}'
                      '&rid={}&tag=timeliness'.format(qid, rid))

        ret = self.downloader.downloader_wrapper(answer_url,
                                                 BATCH_ID['answer'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030')
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            return False

        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True):
        """Run a Zhidao search for ``qword`` and return the question
        list parsed from the result page, or False on failure."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        return zhidao_search_questions(ret)

    def zhidao_search_list_json(self,
                                qword,
                                batch_id,
                                gap=3,
                                timeout=10,
                                refresh=False):
        """Run a Zhidao search and return the parsed result items, each
        annotated with the (unicode) query word; False on failure."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False

        search_result_json = parse_search_json_v0615(ret)
        for item in search_result_json:
            item["query"] = qword
            if type(qword) != unicode:
                item["query"] = qword.decode("utf-8")

        return search_result_json

    def zhidao_search_select_best(self, qword, gap=3, timeout=2):
        """Return the first recommended search result, or False."""
        search_result_json = self.zhidao_search_list_json(
            qword, BATCH_ID['search'], gap, timeout)

        # get the best answer
        for item in search_result_json:
            if item["is_recommend"] == 1:
                return item

        return False

    def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2):
        """Return a list with the best question id, or an empty list."""
        ret = self.zhidao_search_select_best(qword, gap, timeout)
        if ret:
            return [ret["question_id"]]
        return []

    def run(self, qword, gap=3, timeout=10):
        """Search for ``qword`` and fetch the best question with its
        top answers."""
        # qids = self.zhidao_search(qword, BATCH_ID['search'], gap, timeout)
        qids = self.zhidao_search_select_best_qids(qword, gap, timeout)
        return self.zhidao_results(qids, gap, timeout)
示例#21
0
 def __init__(self, cacheserver):
     """Bind the JSON result cache and a downloader pinned to
     zhidao.baidu.com; ``cacheserver`` is forwarded to both."""
     self.cache = Cache(BATCH_ID['json'], cacheserver)
     self.downloader = DownloadWrapper(cacheserver,
                                       {'Host': 'zhidao.baidu.com'})
示例#22
0
class ZhidaoFetch():
    def __init__(self, config={}):
        self.debug = config.get("debug")
        self.api_nlp = ZhidaoNlp(self.debug)
        self.config = config
        if config:
            from downloader.downloader_wrapper import DownloadWrapper
            print self.config
            self.downloader = DownloadWrapper(
                self.config.get("cache_server"),
                self.config["crawl_http_headers"])

    def parse_query(self, query_unicode, query_parser=0):
        """Return the search word: NLP-segmented (space-joined) when
        query_parser == 1, otherwise the query unchanged."""
        if query_parser != 1:
            return query_unicode
        return u" ".join(self.api_nlp.cut_text(query_unicode))

    def get_search_url_qword(self,
                             query_unicode,
                             query_parser=0,
                             page_number=0):
        qword = self.parse_query(query_unicode, query_parser=query_parser)

        if page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={0}".format(
                urllib.quote(qword.encode("utf-8")))
        else:
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))

        return query_url, qword

    def select_best_qapair_0616(self, search_result_json):
        """Return {"best_qapair": item} for the first item flagged
        ``is_recommend == 1``; implicitly returns None when none is."""
        for item in search_result_json:
            if item["is_recommend"] == 1:
                # BUG FIX: ``ret`` was assigned into without ever being
                # defined, raising NameError whenever a recommended item
                # was found.
                ret = {"best_qapair": item}
                return ret

    def select_top_n_chat_0621(self, query, search_result_json,
                               num_answers_needed):
        """Score each QA item against *query* and return up to
        *num_answers_needed* items with match_score >= 0.6, preferring
        items whose answers contain no filter word ("good" before "bad").
        """

        good_answers = []
        bad_answers = []
        result_answers = []

        match_score_threshold = 0.6

        for item in search_result_json:
            #print type(query), type(item["question"])
            # NOTE(review): discount_skip_word is never used — presumably a
            # planned penalty for skip-word hits; confirm before removing.
            discount_skip_word = 0
            # Skip-word hits are only logged, not skipped (the `continue`
            # lines below are deliberately commented out).
            if self.api_nlp.detect_skip_words(item["question"]):
                print "did not skip min-gan-ci question"
                # continue

            if self.api_nlp.detect_skip_words(item["answers"]):
                print "did not skip min-gan-ci answers"
                # continue

            # Similarity of the candidate question to the user query.
            match_score = difflib.SequenceMatcher(None, query,
                                                  item["question"]).ratio()
            item["match_score"] = match_score

            if self.api_nlp.get_answer_filter_word(item["answers"]):
                bad_answers.append(item)
            else:
                good_answers.append(item)

        # Take "good" answers first, best match_score first (0 - score sorts
        # descending); stop at the first item below the threshold.
        for item in sorted(good_answers,
                           key=lambda elem: 0 - elem["match_score"]):
            match_score = item["match_score"]
            if match_score >= match_score_threshold and len(
                    result_answers) < num_answers_needed:
                result_answers.append(item)
            else:
                break

        # Top up with "bad" answers only if the quota is not yet met.
        if len(result_answers) < num_answers_needed:
            for item in sorted(bad_answers,
                               key=lambda elem: 0 - elem["match_score"]):
                match_score = item["match_score"]
                if match_score >= match_score_threshold and len(
                        result_answers) < num_answers_needed:
                    result_answers.append(item)
                else:
                    break

        return result_answers

    def select_top_n_chat_0622(self,
                               query,
                               search_result_json,
                               result_limit=3,
                               answer_len_limit=30,
                               question_len_limit=20,
                               question_match_limit=0.4):
        """Filter QA items for chat use and return at most *result_limit*
        of them, sorted by descending similarity to *query*.

        An item is dropped when it has no answer, its answer or question is
        too long, api_nlp.filter_chat rejects it, or its similarity to the
        query is below *question_match_limit*.
        """
        result_answers = []

        for item in search_result_json:
            if "answers" not in item:
                continue

            #skip long answers
            if len(item["answers"]) > answer_len_limit:
                #print "skip answer_len_limit", type(item["answers"]), len(item["answers"]), item["answers"]
                continue

            #too long question
            if len(item["question"]) > question_len_limit:
                #print "skip question_len_limit", len(item["question"])
                continue

            if self.api_nlp.filter_chat(item["question"], item["answers"]):
                continue

            # Similarity of the candidate question to the user query.
            question_match_score = difflib.SequenceMatcher(
                None, query, item["question"]).ratio()
            #            question_match_score_b = difflib.SequenceMatcher(None,  item["question"], query).ratio()
            item["match_score"] = question_match_score
            item["label"] = self.api_nlp.get_chat_label(
                item["question"], item["answers"])

            #skip not matching questions
            if (question_match_score < question_match_limit):
                #print "skip question_match_limit", question_match_score
                continue

            result_answers.append(item)

        # Best match first (0 - score sorts descending), truncated to limit.
        ret = sorted(result_answers, key=lambda x: 0 - x["match_score"])
        if len(ret) > result_limit:
            ret = ret[:result_limit]
        return ret

    def search_chat_top_n(self,
                          query,
                          num_answers_needed=3,
                          query_filter=2,
                          query_parser=0,
                          select_best=True):
        """Search zhidao for *query* and, when select_best, attach the top
        chat answers plus timing/diagnostic fields to the returned dict.

        Returns False when the query could not be prepared; otherwise the
        ``ret`` dict from prepare_query, enriched in place.
        """
        result = self.prepare_query(query,
                                    query_filter,
                                    query_parser,
                                    use_skip_words=False)
        if not result:
            return False

        ret = result["ret"]
        query_url = result["query_url"]
        query_unicode = ret["query"]
        #if self.api_nlp.is_question_baike( query_unicode , query_filter= query_filter):
        #    print "not skip query, baike", query_filter,  query_unicode
        # return False
        #print query

        ts_start = time.time()
        content = self.download(query_url)

        ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000)
        if content:
            ret["content_len"] = len(content)
            #print type(content)
            #print content

        if select_best and content:
            ts_start = time.time()
            search_result = parse_search_json_v0707(content)
            search_result_json = search_result["results"]
            ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000)
            ret["item_len"] = len(search_result_json)

            # NOTE(review): num_answers_needed is passed positionally, so it
            # binds to select_top_n_chat_0622's result_limit parameter —
            # confirm that is the intent.
            answer_items = self.select_top_n_chat_0622(query_unicode,
                                                       search_result_json,
                                                       num_answers_needed)
            #print "select_best", len(answer_items)
            ret["items"] = answer_items
            ret["results"] = search_result_json
            ret["total"] = search_result["total"]
            # if answer_items:
            #     index = 0
            #     for item in answer_items:
            #         ret ["qapair{}".format(index)] = item
            #         index += 1
            #     return ret
            #print json.dumps(search_result_json,ensure_ascii=False)

        return ret

    # def text2bigram(self, text):
    #     ret = set()
    #     if not text:
    #         return ret
    #     text = text.lower()
    #     symbols = list(self.api_nlp.cut_text(text))
    #
    #     for i in range(len(symbols)):
    #         if i==0:
    #             word = u'___{}'.format(symbols[i])
    #             ret.add(word)
    #             word = text[i:i+2]
    #             ret.add(word)
    #         elif i == len(text)-1:
    #             word = u'{}___'.format(symbols[i])
    #             ret.add(word)
    #         else:
    #             word = u"".join(symbols[i:i+2])
    #             ret.add(word)
    #     return ret
    #
    # def bigram_sim(self, q1, q2):
    #     b1 = self.text2bigram(q1)
    #     b2 = self.text2bigram(q2)
    #     b1 = set(self.api_nlp.cut_text(q1.lower()))
    #     b2 = set(self.api_nlp.cut_text(q2.lower()))
    #     b1d = set(b1)
    #     b1d.difference_update(b2)
    #
    #     sim = 1.0 * len(b1.intersection(b2))/ len(b1.union(b2))
    #     return sim
    def sim(self, q1, q2):
        """Similarity ratio (0.0-1.0) between the two cleaned questions."""
        cleaned_a = self.api_nlp.clean_question(q1)
        cleaned_b = self.api_nlp.clean_question(q2)
        return difflib.SequenceMatcher(None, cleaned_a, cleaned_b).ratio()

    def select_best_qapair_0630(self,
                                query,
                                search_result_json,
                                question_len_max=30,
                                answer_len_max=90,
                                answer_len_min=2):
        """Pick the single best QA item for *query*.

        Items from source "muzhi" or containing skip/filter words are
        rejected (with a Chinese debug_note explaining why). "baike" items
        always win; otherwise an item wins on similarity (starting floor
        0.6) combined with like-count heuristics. Returns the best item,
        or None — including when 4+ muzhi items were skipped, to avoid
        answering medical questions.
        """
        best_item = None
        best_score = 0.6          # similarity floor; raised as items win
        best_cnt_like = -1
        used_skip_sources = list()
        for item in search_result_json:
            print json.dumps(item, ensure_ascii=False)
            print "\n\n--------select_best_qapair_0630 "

            # Reject items from the "muzhi" (medical) source outright.
            if item["source"] in ["muzhi"]:
                used_skip_sources.append(item["source"])

                item["debug_note"] = u"[-]问答对-来自拇指"
                continue

            #match_score = self.bigram_sim(query, item["question"])
            match_score = self.sim(query, item["question"])
            item["match_score"] = match_score

            #print type(query), type(item["question"])
            temp = self.api_nlp.detect_skip_words(item["question"])
            if temp:
                print "skip min-gan-ci question", json.dumps(
                    list(temp), ensure_ascii=False)
                item["debug_note"] = u"[-]问答对-问题敏感词:{}".format(u"/".join(temp))
                continue

            temp = self.api_nlp.detect_skip_words(
                item["answers"],
                check_list=["skip_words_zhidao", "skip_words_all"])
            if temp:
                print "skip min-gan-ci answers", json.dumps(list(temp),
                                                            ensure_ascii=False)
                item["debug_note"] = u"[-]问答对-答案敏感词:{}".format(u"/".join(temp))
                continue

            #too long question
            #if len(item["question"]) > question_len_max:
            #    item["debug_note"]= u"[-]问题长度过长:{}".format(len(item["question"]) )
            #    continue

            # Reject answers that are too short to be useful.
            if len(item["answers"]) < answer_len_min:
                item["debug_note"] = u"[-]答案长度过短:{}".format(
                    len(item["answers"]))
                continue

            filter_word = self.api_nlp.get_answer_filter_word(item["answers"])
            if filter_word:
                print "skip bad answers"
                item["debug_note"] = u"[-]问答对-答案有符号:{}".format(filter_word)
                continue

            if self.api_nlp.debug:
                print match_score, item["answers"]

            #print query, item["question"] ,match_score, item["cnt_like"]
            this_answer_is_better = False
            if item["source"] == "baike":
                # "baike" items always take precedence.
                item["debug_note"] = u"[+]问答对-使用百科"
                this_answer_is_better = True
            elif not best_item or best_item["source"] != "baike":
                #skip long answers
                #if len(item["answers"]) > answer_len_max and item["cnt_like"] < 50:
                #    item["debug_note"]= u"[-]答案长度过长:{}".format(len(item["answers"]) )
                #    continue

                # Win by better similarity (likes not much worse), or by
                # near-equal similarity with clearly more likes.
                if match_score > best_score and item[
                        "cnt_like"] >= best_cnt_like * 0.2:
                    this_answer_is_better = True
                elif match_score > best_score * 0.95 and item[
                        "cnt_like"] > best_cnt_like * 1.5 + 2:
                    this_answer_is_better = True

            if this_answer_is_better:
                best_item = item
                best_score = max(match_score, best_score)
                best_cnt_like = item["cnt_like"]
                if not item.get("debug_note"):
                    item["debug_note"] = u"[?]问答对-maybe best={}".format(
                        best_score)
            else:
                if not item.get("debug_note"):
                    item["debug_note"] = u"[-]问答对-低于best={}".format(best_score)

        if best_item and best_item["source"] not in [
                "baike"
        ] and len(used_skip_sources) >= 4:
            if best_item:
                best_item["debug_note"] += u"--规避医疗类问题{}".format(
                    "/".join(used_skip_sources))
            # Maternity/medical questions must not be answered here;
            # leave those to professionals.
            return None

        return best_item

    def search_baike_best(self,
                          query,
                          query_filter=2,
                          query_parser=0,
                          debug_item=None,
                          keep_result=False):
        """Search zhidao for a baike-style *query* and attach the single best
        QA pair (select_best_qapair_0630) to the returned dict.

        Returns False when the query fails preparation or is not a baike
        question; otherwise the ``ret`` dict, with "best_qapair" set only
        when a best item was found.
        """
        query_unicode = query
        if not isinstance(query, unicode):
            query_unicode = query.decode("utf-8")

        query_unicode = self.api_nlp.rewrite_zhidao_query(query_unicode)
        result = self.prepare_query(query_unicode,
                                    query_filter,
                                    query_parser,
                                    debug_item=debug_item)
        if not result:
            return False

        ret = result["ret"]
        result["query"] = query
        query_url = result["query_url"]
        if not self.api_nlp.is_question_baike(query_unicode,
                                              query_filter=query_filter,
                                              debug_item=debug_item):
            print "skip query, not baike", query_filter, query_unicode
            return False

        ts_start = time.time()
        content = self.download(query_url)

        ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000)

        if content:
            ts_start = time.time()
            search_result = parse_search_json_v0707(content)
            search_result_json = search_result["results"]
            ret["total"] = search_result["total"]
            ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000)
            # Raw results are kept only for debugging / explicit request.
            if keep_result or self.debug:
                ret["results"] = search_result_json

            best_item = self.select_best_qapair_0630(query_unicode,
                                                     search_result_json)
            if best_item:
                ret["best_qapair"] = best_item
                return ret
            #print json.dumps(search_result_json,ensure_ascii=False)

        #print ">>>>>>", content
        return ret

    def search_all(self, query, query_filter=0, query_parser=0, limit=10):
        max_page_number = (limit - 1) / 10 + 1
        output = {
            "items": [],
            "metadata": [],
            "query": query,
            "limit": limit,
            "query_filter": query_filter,
            "query_parser": query_parser
        }
        for page_number in range(max_page_number):
            result = self.prepare_query(query,
                                        query_filter,
                                        query_parser,
                                        use_skip_words=False)

            if not result:
                print query
                break

            ret = result["ret"]
            query_url = result["query_url"]
            query_unicode = ret["query"]

            ts_start = time.time()
            content = self.download(query_url)

            ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000)

            if content:
                ts_start = time.time()
                search_result = parse_search_json_v0707(content)
                ret["milliseconds_parse"] = int(
                    (time.time() - ts_start) * 1000)
                output["items"].extend(search_result["results"])
                output["metadata"].extend(ret)
                output["total"] = search_result["total"]

        return output

    def prepare_query(self,
                      query,
                      query_filter,
                      query_parser,
                      use_skip_words=True,
                      page_number=0,
                      debug_item=None):
        if not query:
            print "skip query, empty"
            if debug_item is not None:
                debug_item["debug_note"] = u"[-]问题空:prepare_query"
            return False

        query_unicode = query
        if not isinstance(query_unicode, unicode):
            query_unicode = query_unicode.decode("utf-8")

        if use_skip_words:
            detected_words = self.api_nlp.detect_skip_words(query_unicode)
            if detected_words:
                print "skip bad query, empty", u"/".join(detected_words)
                if debug_item is not None:
                    debug_item["debug_note"] = u"[-]问题敏感词:{}".format(
                        u"/".join(detected_words))
                return False

        query_unicode = re.sub(u"?$", "", query_unicode)
        query_url, qword = self.get_search_url_qword(query_unicode,
                                                     query_parser,
                                                     page_number=page_number)

        ret = {
            "query": query_unicode,
        }

        if query_parser == 1:
            ret["qword"] = qword

        return {"ret": ret, "query_url": query_url}

    def search_chat_best(self, query, query_filter=2, query_parser=0):
        """Search zhidao for *query* and attach the best chat QA pair.

        Returns the ``ret`` dict with "best_qapair" on success, False
        otherwise.
        """

        result = self.prepare_query(query, query_filter, query_parser)
        if not result:
            return False

        ret = result["ret"]
        query_url = result["query_url"]
        query_unicode = ret["query"]
        if not self.api_nlp.is_question_baike(query_unicode,
                                              query_filter=query_filter):
            print "skip query, not baike", query_filter, query_unicode
            return False

        ts_start = time.time()
        content = self.download(query_url)
        ret["milliseconds_fetch"] = int((time.time() - ts_start) * 1000)

        if content:
            ts_start = time.time()
            search_result = parse_search_json_v0707(content)
            search_result_json = search_result["results"]
            ret["total"] = search_result["total"]
            ret["milliseconds_parse"] = int((time.time() - ts_start) * 1000)

            #deprecated
            # NOTE(review): select_best_chat_0621 is not defined on this
            # class in this file (only select_top_n_chat_0621 exists) —
            # this call likely raises AttributeError; confirm.
            best_item = self.select_best_chat_0621(query_unicode,
                                                   search_result_json)
            if best_item:
                ret["best_qapair"] = best_item
                return ret
            #print json.dumps(search_result_json,ensure_ascii=False)

        return False

    def download(self, query_url):
        """Download *query_url* through the configured cached downloader,
        or directly with requests when no config was provided."""
        if not self.config:
            return self.download_direct(query_url)
        return self.downloader.download_with_cache(
            query_url,
            self.config["batch_id"],
            self.config["crawl_gap"],
            self.config["crawl_http_method"],
            self.config["crawl_timeout"],
            encoding='gb18030',
            redirect_check=True,
            error_check=False,
            refresh=False)

    def download_direct(self, query_url):
        import requests
        #print query_url
        encoding = 'gb18030'
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, sdch',
            'Accept-Language':
            'zh-CN,en-US;q=0.8,en;q=0.6',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Upgrade-Insecure-Requests':
            1,
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
        }
        headers["Host"] = "zhidao.baidu.com"

        print query_url
        r = requests.get(query_url, timeout=10, headers=headers)

        if r:
            r.encoding = encoding
            return r.text
示例#23
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl one qichacha.com URL — a search page, a company detail page, or
    an outbound-investment page — enqueue follow-up URLs via *manager*, and
    POST parsed JSON records to the period cache.

    Returns True when there is nothing further to store, False on failure,
    or the cache-post result for parsed pages.
    """
    # One-time per-worker setup, stored as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        cookie = kwargs.get('cookie', None)
        # cookie = "gr_user_id=0fceb70d-e0ab-4c16-8f21-d49b5d242b0e; PHPSESSID=ltro2cjbvonlg6mu4hupe7dcv1; CNZZDATA1254842228=371101890-1469690209-null%7C1472547698"

        if cookie:
            headers.update({'Cookie': cookie})
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # URL classifiers: search results, company base info, investment list.
    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'search':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'search\?key=(.+?)&index=(\d+)&p=(\d+)')),
                'detail':
                re.compile(
                    urlparse.urljoin(
                        SITE,
                        'company_getinfos\?unique=(.+?)&companyname=(.+?)&tab=base'
                    )),
                'invest':
                re.compile(
                    urlparse.urljoin(
                        SITE,
                        'company_getinfos\?unique=(.+?)&companyname=(.+?)(?:&p=(\d+))?&tab=touzi(?:&box=touzi)?'
                    )),
            })

    method, gap, js, timeout, data = parameter.split(':')
    # Shrink the politeness gap by time already spent on other batches.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')

    # if kwargs and kwargs.get("debug"):
    #     get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    def reformat(info):  # reorder info fields to match the qichacha page order
        temp = info['info']
        del info['info']
        info['info'] = []
        info['info'].append(("统一社会信用码", temp['unified_social_credit_code']))
        info['info'].append(("注册号", temp['registration_id']))
        info['info'].append(("组织机构代码", temp['organization_code']))
        info['info'].append(("经营状态", temp['status']))
        info['info'].append(("公司类型", temp['business_type']))
        info['info'].append(("成立日期", temp['begin']))
        info['info'].append(("法定代表", temp['legal_person']))
        info['info'].append(("注册资本", temp['registered_capital']))
        info['info'].append(("营业期限", temp['end']))
        info['info'].append(("登记机关", temp['registration_authority']))
        info['info'].append(("发照日期", temp['approval_date']))
        info['info'].append(("企业地址", temp['address']))
        info['info'].append(("经营范围", temp['business_scope']))
        return info

    # Parse the outbound-investment page; collect subsidiaries under
    # the 'sub_companies' key.
    def parse_company_investment(tree):
        invest_dict = {'sub_companies': []}
        for sub_company in tree.cssselect('.list-group a.list-group-item'):
            sub_name = sub_company.cssselect(
                'span.clear .text-lg')[0].text_content().strip()
            href = sub_company.get('href')
            province, key_num = href.rsplit('_', 2)[-2:]
            invest_dict['sub_companies'].append({
                'name': sub_name,
                'key_num': key_num,
                'province': province,
                'href': href,
            })
        return invest_dict

    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     method,
                                                     timeout=timeout,
                                                     encoding='utf-8')
    # print(url, file=log_file)

    cookie = kwargs.get('cookie', None)
    if not cookie:
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info("No cookie in worker")
        return False

    # Fall back to a direct fetch when the wrapped download came back empty.
    if content == '':
        get_logger(batch_id, today_str, '/opt/service/log/').info("no content")
        # NOTE(review): cookies={1: cookie} uses the integer 1 as the cookie
        # NAME — almost certainly wrong; confirm the intended cookie name.
        content = requests.get(url, cookies={1: cookie}).text
        if content:
            # print("got content", file=log_file)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('got content')
        if not content and url.endswith("tab=touzi&box=touzi"):
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info("void invest page")
            return True

    invest_pat = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&p={p}&tab=touzi&box=touzi"
    main_pat = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab=base"
    search_pat = "http://www.qichacha.com/search?key={name}&index=0&p={p}"
    parser = QiParser()
    tree = lxml.html.fromstring(
        content.replace('<em>', '').replace('</em>', ''))

    # if kwargs and kwargs.get("debug"):
    # print('start parsing url')

    for label, reg in process._regs.iteritems():
        m = reg.match(url)

        if not m:
            continue

        if label == 'search':  # parse a search-results page
            comp_name = urllib.unquote(m.group(1))
            dic = {'search_name': comp_name, 'names': []}
            urls = []
            if tree.cssselect('.table-search-list') and tree.cssselect(
                    '.tp2_tit a'):
                items = tree.cssselect('.table-search-list')
                for idx, i in enumerate(items):
                    if not i.xpath('.//*[@class=\"tp2_tit clear\"]/a/text()'):
                        continue
                    item = {}
                    item['name'] = i.xpath(
                        './/*[@class=\"tp2_tit clear\"]/a/text()')[0]
                    # print(item['name'], file=log_file)
                    item['href'] = i.xpath(
                        './/*[@class=\"tp2_tit clear\"]/a/@href')[0]
                    item['status'] = i.xpath(
                        './/*[@class=\"tp5 text-center\"]/a/span/text()')[0]
                    item['key_num'] = item['href'].split('firm_')[1].split(
                        '.shtml')[0]
                    # print(item['key_num'], file=log_file)
                    if idx == 0 and comp_name == item[
                            'name']:  # exact match on the first result: enqueue only it
                        # get_logger(batch_id, today_str, '/opt/service/log/').info('appending', item['name'])
                        urls.append(
                            main_pat.format(key_num=item['key_num'],
                                            name=item['name']))
                        urls.append(
                            invest_pat.format(key_num=item['key_num'],
                                              name=item['name'],
                                              p='1'))
                        break
                    elif idx < 3:  # no exact match: enqueue the top three results
                        urls.append(
                            main_pat.format(key_num=item['key_num'],
                                            name=item['name']))
                        urls.append(
                            invest_pat.format(key_num=item['key_num'],
                                              name=item['name'],
                                              p='1'))
                        dic['names'].append(item['name'])
            if not urls:
                return True
            manager.put_urls_enqueue(batch_id, urls)
            if not dic['names']:
                return True
            else:  # inexact match: store search_name -> top-3 names as an alias map
                data = json.dumps(dic, encoding='utf-8', ensure_ascii=False)
                return process._cache.post(url, data)

        elif label == 'detail':  # parse a company detail page
            comp_name = urllib.unquote(m.group(2))
            # print(comp_name, 'detail', file=log_file)
            all_info = parser.parse_detail(tree)
            all_info['name'] = comp_name
            all_info['source'] = url
            all_info['access_time'] = datetime.utcnow().isoformat()
            all_info = parser.parser_patch(tree, all_info)
            all_info = reformat(all_info)
            data = json.dumps(all_info, encoding='utf-8', ensure_ascii=False)
            get_logger(batch_id, today_str, '/opt/service/log/').info(data)
            if not any([i[1] for i in all_info['info']]):
                return False
            return process._cache.post(url, data)

        else:  # parse an investment page
            comp_name = urllib.unquote(m.group(2))
            key_num = m.group(1)
            page = int(m.group(3))
            pages = tree.xpath(".//a[@id=\"ajaxpage\"]/text()")
            # A '>' link means there is a next page — enqueue it.
            if '>' in pages:
                urls = [
                    invest_pat.format(key_num=key_num,
                                      name=comp_name,
                                      p=str(page + 1))
                ]
                manager.put_urls_enqueue(batch_id, urls)
            invest_dict = parse_company_investment(tree)
            # print(invest_dict, file=log_file)
            if not invest_dict['sub_companies']:
                return True
            invest_dict['name'] = comp_name
            invest_dict['source'] = url
            invest_dict['access_time'] = datetime.utcnow().isoformat()
            data = json.dumps(invest_dict,
                              encoding='utf-8',
                              ensure_ascii=False)
            get_logger(batch_id, today_str, '/opt/service/log/').info(data)
            return process._cache.post(url, data)
示例#24
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl one cngold.org metal-price URL: the index page enqueues the
    first three price-detail links; a detail page is parsed into a price
    record and POSTed to the period cache.

    Returns True after enqueuing, False when no content was downloaded, the
    cache-post result for a parsed detail page.
    """
    # One-time per-worker setup, stored as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        # NOTE(review): head/tail are never used afterwards (and split('-')
        # raises ValueError for batch ids without exactly one dash) — confirm.
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # NOTE(review): the 'main' pattern is matched below but no branch handles
    # it, so list pages fall through and the function returns None — confirm.
    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'main':
                re.compile(
                    r'http://jiage.cngold.org/jinshubi/list_3640_(\d+).html'),
                'info':
                re.compile(
                    r'http://jiage.cngold.org/c/(\d+-\d+-\d+)/c(\d+).html'),
                'index':
                re.compile(r'http://jiage.cngold.org/jinshubi/index.html')
            })

    method, gap, js, timeout, data = parameter.split(':')
    # Shrink the politeness gap by time already spent on other batches.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    # Page 1 of the list is the same document as the index page.
    if url == 'http://jiage.cngold.org/jinshubi/list_3640_1.html':
        url = 'http://jiage.cngold.org/jinshubi/index.html'
    # if kwargs and kwargs.get("debug"):
    #     get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout)
    # print(content)
    if content == '':
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info(url + ' no content')
        return False

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            # print("not match")
            continue
        page = etree.HTML(content)

        if label == 'index':
            # Enqueue the first three price-detail links from the index.
            prices = page.xpath(".//ul[@class='list_baojia']/li/a/@href")
            # get_logger(batch_id, today_str, '/opt/service/log/').info(str(prices))
            manager.put_urls_enqueue(batch_id, prices[:3])

            return True

        elif label == 'info':
            # Parse the detail table into a record keyed in Chinese
            # (product name / price / unit / change / date).
            dic = {}
            datestr = m.group(1)
            table = page.xpath(".//table//td/text()")
            table = [t.strip() for t in table]
            dic[u'产品名称'] = table[0]
            dic[u'产品价格'] = table[1]
            dic[u'价格单位'] = table[2]
            dic[u'涨跌'] = table[3]
            dic[u'日期'] = datestr
            dic[u'source'] = url
            dic[u'access_time'] = datetime.utcnow().isoformat()
            data = json.dumps(dic, ensure_ascii=False)
            # get_logger(batch_id, today_str, '/opt/service/log/').info(data)
            return process._cache.post(url, data)
示例#25
0
# -*- coding: utf-8 -*-
import scrapy
import sys

from downloader.downloader_wrapper import DownloadWrapper
# Python 2-only hack: re-expose setdefaultencoding (deleted by site.py) so the
# process-wide default codec can be forced to utf-8.  Not portable to Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Batch identifier used by the download/cache services for this crawl run.
BATCH_ID = 'dongfang-201606test'
# Entry URL for the eastmoney notice listing.
url = 'http://data.eastmoney.com/Notice'
# Address of the proxy/download service.
SERVER = 'http://192.168.1.179:8000/'
m = DownloadWrapper(SERVER)


#content = m.downloader_wrapper('http://data.eastmoney.com/Notice/Noticelist.aspx',BATCH_ID,0,encoding='gb2312',refresh=True)
#print content
class MyMiddleWare(object):
    """Scrapy downloader middleware that routes fetches through DownloadWrapper."""

    def process_request(self, request, spider):
        """Fetch request.url via the proxy service.

        Returns an HtmlResponse built from the downloaded body, or None
        (falling through to Scrapy's default downloader) when the fetch
        yields no content.
        """
        target = request.url
        wrapper = DownloadWrapper(SERVER)
        body = wrapper.downloader_wrapper(target, BATCH_ID, 3, encoding='gb2312')
        if not body:
            return None
        return scrapy.http.response.html.HtmlResponse(target,
                                                      encoding='utf-8',
                                                      body=body)


'''
m=Cache(BATCH_ID)
print m.post('test','content3')
示例#26
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl chemical price pages on chem.100ppi.com / www.100ppi.com.

    'main' (list) URLs: collect product detail links plus the next list
    page and enqueue them.  'prd' (detail) URLs: scrape one quotation into
    a dict and post it as JSON to the period cache (logged as "saved to S3").

    Returns True when a list page was processed (or pagination ended),
    False on empty download, otherwise the cache-post result.
    """
    # Lazily-created singletons cached as attributes on the function itself,
    # so repeated calls in the same worker reuse them.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'main':
                re.compile(
                    r'http://chem.100ppi.com/price/plist-(\d+)(-{1,3})(\d+).html'
                ),
                'prd':
                re.compile(r'http://www.100ppi.com/price/detail-(\d+).html')
            })

    def safe_state(statement):
        # xpath() returns a list; yield its first hit or '' when empty.
        return statement[0] if statement else ''

    # parameter format: "method:gap:js:timeout:data" (colon-separated).
    method, gap, js, timeout, data = parameter.split(':')
    # Deduct time already spent on other batches; never use a negative gap.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    # print(url)
    # if kwargs and kwargs.get("debug"):
    #     get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout)
    # print(content)
    if content == '':
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info(url + ' no content')
        return False

    # content.encoding='gb18030'
    # if kwargs and kwargs.get("debug"):
    # get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url')

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        page = etree.HTML(content)

        if label == 'main':
            # List page: harvest the product links in the first column.
            prd_links = page.xpath('//table/tr/td[1]/div/a/@href')
            if not prd_links:
                # Empty column means we ran past the last page.
                get_logger(batch_id, today_str,
                           '/opt/service/log/').info('end of pages')
                return True

            # Build the next list-page URL by bumping the trailing page number.
            next_pat = re.compile(r'plist-(\d+)(-{1,3})(\d+).html')
            current = next_pat.search(url)
            current = str(int(current.group(3)) + 1)
            next_page = url[:url.rfind('-') + 1] + current + '.html'

            prd_links.append(urlparse.urljoin(SITE, next_page))
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('||'.join(prd_links) +
                                                 ' added to queue')
            manager.put_urls_enqueue(batch_id, prd_links)
            return True

        else:
            # Detail page: scrape one price quotation.  The absolute
            # /html/body/div[8]/... paths are tied to the site's layout.
            data = {}
            data['name'] = page.xpath(
                "/html/body/div[8]/div[1]/span[2]/text()")[0]
            # print(data['name'], 'prd page')
            data['source'] = url
            # data['prd_header'] = page.xpath("//div[@class=\"mb20\"]/table/tr/th/text()")
            # data['prd_infos'] = page.xpath("//div[@class=\"mb20\"]/table/tr/td/text()")
            prd_header = page.xpath(
                "/html/body/div[8]/div[2]/div[1]/div[1]/h3/text()")[0]
            # Quotation type sits between full-width parentheses in the header.
            idx_left, idx_right = prd_header.find(u'('), prd_header.find(u')')
            data[u'报价类型'] = prd_header[idx_left + 1:idx_right]
            data[u'报价机构'] = page.xpath(
                "/html/body/div[8]/div[2]/div[2]/div[2]/table/tr[1]/td/h3/text()"
            )[0].strip()
            # Fixed 3x2 table of quotation fields; safe_state guards missing cells.
            data[u'商品报价'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[1]/td[1]/text()"))
            data[u'发布时间'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[1]/td[2]/text()"))
            data[u'出产地'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[2]/td[1]/text()"))
            data[u'有效期'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[2]/td[2]/text()"))
            data[u'仓储地'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[3]/td[1]/text()"))
            data[u'包装说明'] = safe_state(
                page.xpath("//div[@class=\"mb20\"]/table/tr[3]/td[2]/text()"))
            data[u'生产厂家'] = safe_state(
                page.xpath(
                    "/html/body/div[8]/div[2]/div[1]/div[2]/div/div[2]/text()")
            )

            # Secondary detail table: pair header cells with content cells.
            info = {}
            table_header = page.xpath(
                "//table[@class=\"mb20 st2-table tac\"]/tr/th/text()")
            table_content = page.xpath(
                "//table[@class=\"mb20 st2-table tac\"]/tr/td/text()")
            for header, cont in zip(table_header, table_content):
                info[header] = cont
            data[u'详细信息'] = info

            # Contact block: fixed row positions in the "connect" table.
            contact = {}
            contact[u'联系人'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[2]/td[2]/text()"))
            contact[u'电话'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[3]/td[2]/text()"))
            contact[u'传真'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[4]/td[2]/text()"))
            contact[u'邮件'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[5]/td[2]/text()"))
            contact[u'手机'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[6]/td[2]/text()"))
            contact[u'地址'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[7]/td[2]/text()"))
            contact[u'网址'] = safe_state(
                page.xpath(
                    "//div[@class=\"connect\"]/table/tr[8]/td[2]/text()"))
            data[u'联系方式'] = contact

            # print(json.dumps(data, encoding='utf-8', ensure_ascii=False))
            # Python 2 json.dumps: 'encoding' names the codec of str inputs.
            dics = json.dumps(data, encoding='utf-8', ensure_ascii=False)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(dics + ' saved to S3')
            return process._cache.post(url, dics)
示例#27
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl yt1998.com daily medicine price listings.

    'list_view' URLs: download the JSON listing, timestamp each entry,
    enqueue the remaining pages when on page index 0, and post the
    payload to the period cache.  NOTE(review): a 'detail_view' regex is
    registered but has no handler branch, so such URLs (and unmatched
    ones) implicitly return None.
    """
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str,
               '/opt/service/log/').info('process {}'.format(url))
    # Lazily-created singletons cached as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'list_view':
                re.compile(
                    'http://www.yt1998.com/price/nowDayPriceQ\!getPriceList.do\?pageIndex=(\d+)&pageSize=(\d+)'
                ),
                'detail_view':
                re.compile(
                    'http://www.yt1998.com/ytw/second/priceInMarket/getPriceHistory.jsp\?ycnam=(.*)&guige=(.*)&chandi=(.*)&market=(.*)'
                )
            })

    # http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex=0&pageSize=20
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # parameter format: "method:gap:js:timeout:data" (colon-separated).
    method, gap, js, timeout, data = parameter.split(':')
    # Deduct time already spent on other batches; never use a negative gap.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('label : {}'.format(label))
        if label == 'list_view':
            get_logger(batch_id, today_str, '/opt/service/log/').info(label)
            content = process._downloader.downloader_wrapper(url,
                                                             batch_id,
                                                             gap,
                                                             timeout=timeout,
                                                             encoding='utf-8',
                                                             refresh=True)
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('download ok')
            list_item = json.loads(content)
            # Stamp every price entry with the crawl time (UTC ISO-8601).
            for detail_item in list_item[u'data']:
                detail_item[u'access_time'] = datetime.utcnow().isoformat()

            total_num = int(list_item[u'total'])
            pageIndex = int(m.group(1))   # page index from the URL
            pageSize = int(m.group(2))    # page size from the URL
            # Only the first page (index 0) fans out the remaining pages,
            # so re-crawled later pages don't re-enqueue everything.
            if pageIndex == 0:
                for index in range(1, total_num // pageSize + 1):
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info('index:')
                    get_logger(batch_id, today_str,
                               '/opt/service/log/').info(index)
                    list_pattern = 'http://www.yt1998.com/price/nowDayPriceQ!getPriceList.do?pageIndex={}&pageSize={}'
                    list_url = list_pattern.format(index, pageSize)
                    manager.put_urls_enqueue(batch_id, [list_url])

            return process._cache.post(url,
                                       json.dumps(list_item,
                                                  ensure_ascii=False),
                                       refresh=True)
示例#28
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args,
            **kwargs):
    """Crawl Shanghai Gold Exchange (sge.com.cn) daily market reviews.

    'main' (list) URLs: enqueue each article link found in the #zl_list
    menu.  'info' (article) URLs: locate the Pt9995 row in the page's
    price table and post it, plus date/source/access_time, to the cache.

    Returns True for handled list pages and "nothing to save" cases,
    False on empty download, otherwise the cache-post result.
    """
    # Lazily-created singletons cached as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'main':
                re.compile(
                    r'http://www.sge.com.cn/xqzx/mrxq/index_(\d+).shtml'),
                'info':
                re.compile(r'http://www.sge.com.cn/xqzx/mrxq/(\d+).shtml')
            })

    # parameter format: "method:gap:js:timeout:data" (colon-separated).
    method, gap, js, timeout, data = parameter.split(':')
    # Deduct time already spent on other batches; never use a negative gap.
    gap = float(max(0, float(gap) - other_batch_process_time))
    timeout = int(timeout)
    today_str = datetime.now().strftime('%Y%m%d')
    # print(url)
    # if kwargs and kwargs.get("debug"):
    #     get_logger(batch_id, today_str, '/opt/service/log/').info('start download')
    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout)
    # print(content)
    if content == '':
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info(url + ' no content')
        return False

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            # print("not match")
            continue
        page = etree.HTML(content)

        if label == 'main':
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info('in list page')
            urls = page.xpath(".//ul[@id='zl_list']/li/a/@href")
            # Hrefs are site-relative; resolve against the site root.
            urls = [urlparse.urljoin(SITE, list_url) for list_url in urls]
            get_logger(batch_id, today_str,
                       '/opt/service/log/').info(str(urls))
            # get_logger(batch_id, today_str, '/opt/service/log/').info('||'.join(prd_links) + ' added to queue')
            manager.put_urls_enqueue(batch_id, urls)

            return True

        elif label == 'info':
            dic = {}
            # Heading looks like "<date>\xa0...": keep the part before the nbsp.
            date = page.xpath(".//h5[@class='con_h5']/text()")[0].split(
                u'\xa0')[0]
            # First table row holds the column headers.
            header = page.xpath(
                ".//div[@id='page_con']/table/tbody/tr[1]/td//text()")
            # First cell of every row = product names, blanks stripped out.
            infos = page.xpath(
                ".//div[@id='page_con']/table/tbody/tr/td[1]//text()")
            infos = [info.strip() for info in infos if info.strip()]

            # Find the Pt9995 row; +1 converts the 0-based position into
            # a 1-based XPath row index (header occupies row 1).
            idx = -1
            for index, prod in enumerate(list(infos)):
                if u"Pt9995" in prod:
                    idx = str(index + 1)
                    break
            if idx == -1:
                # No platinum row on this page; nothing to save.
                return True
            pt_infos = page.xpath(
                ".//div[@id='page_con']/table/tbody/tr[{}]/td//text()".format(
                    idx))

            if not pt_infos:
                get_logger(
                    batch_id, today_str,
                    '/opt/service/log/').info("No pt info on this page " + url)
                return True
            for col, value in zip(header, pt_infos):
                dic[col] = value.strip()
            dic[u'日期'] = date
            dic[u'source'] = url
            dic[u'access_time'] = datetime.utcnow().isoformat()
            data = json.dumps(dic, ensure_ascii=False)
            get_logger(batch_id, today_str, '/opt/service/log/').info(data)
            return process._cache.post(url, data)
示例#29
0
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Crawl the CN-DBpedia HTTP API.

    'entity' URLs: for every returned entity, enqueue its AVP,
    information, and tag API URLs.  All other matched labels (avp /
    info / tags): post {entity: parsed-json} to the S3-backed cache.

    Returns True after enqueueing, False on empty download, otherwise
    the cache-post result; None for URLs matching no registered regex.
    """
    # Lazily-created singletons cached as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader',
                DownloadWrapper('s3', headers, REGION_NAME))
    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CacheS3(head + '-json-' + tail))

    if not hasattr(process, '_regs'):
        setattr(
            process, '_regs', {
                'entity':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entity\?mention=(.+)')),
                'avp':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entityAVP\?entity=(.+)')),
                'info':
                re.compile(
                    urlparse.urljoin(
                        SITE, 'cndbpedia/api/entityInformation\?entity=(.+)')),
                'tags':
                re.compile(
                    urlparse.urljoin(SITE,
                                     'cndbpedia/api/entityTag\?entity=(.+)')),
            })

    # parameter format: "method:gap:js:timeout:data" (colon-separated).
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    today_str = datetime.now().strftime('%Y%m%d')

    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start download')

    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout,
                                                     encoding='utf-8')

    if content == '':
        return False

    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str,
                   '/opt/service/log/').info('start parsing url')

    for label, reg in process._regs.iteritems():
        m = reg.match(url)
        if not m:
            continue

        # The query-string capture is percent-encoded; decode it.
        entity = urllib.unquote(m.group(1))
        if label == 'entity':
            # Mention lookup: fan out the three per-entity API endpoints.
            urls = []
            avpair_api = urlparse.urljoin(SITE,
                                          'cndbpedia/api/entityAVP?entity={}')
            info_api = urlparse.urljoin(
                SITE, 'cndbpedia/api/entityInformation?entity={}')
            tags_api = urlparse.urljoin(SITE,
                                        'cndbpedia/api/entityTag?entity={}')
            # NOTE: 'js' (from parameter) is shadowed by the parsed response here.
            js = json.loads(content)
            for ent in js[u'entity']:
                # Re-encode each entity name for safe use in a URL.
                if isinstance(ent, unicode):
                    ent = ent.encode('utf-8')
                ent = urllib.quote(ent)
                urls.append(avpair_api.format(ent))
                urls.append(info_api.format(ent))
                urls.append(tags_api.format(ent))

            manager.put_urls_enqueue(batch_id, urls)

            return True
        else:
            # avp / info / tags: store the raw API payload keyed by entity.
            data = json.dumps({entity: json.loads(content)})

            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info(
                    'start post {} json'.format(label))

            return process._cache.post(url, data)
示例#30
0
def process(url, batch_id, parameter, manager, other_batch_process_time, *args, **kwargs):
    """Crawl the SFDA imported-drug database (app1.sfda.gov.cn, tableId=36).

    Home-page URL: POST the search form page by page, extract drug ids
    from each result page, and enqueue the per-drug detail URLs until a
    page yields no ids.  Detail URL: parse the fixed-layout attribute
    table into a flat dict and post it as JSON to the period cache.

    Returns True after paging the home page, False on an empty detail
    download, otherwise the cache-post result; None for unmatched URLs.
    """
    today_str = datetime.now().strftime('%Y%m%d')
    get_logger(batch_id, today_str, '/opt/service/log/').info(url)
    home_page = 'http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=36&tableName=TABLE36&title=%BD%F8%BF%DA%D2%A9%C6%B7&bcId=124356651564146415214424405468'
    # Lazily-created singletons cached as attributes on the function itself.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        headers = {'Host': domain_name}
        setattr(process, '_downloader', DownloadWrapper(None, headers))

    if not hasattr(process, '_reg'):
        setattr(process, '_reg', {
            'detail': re.compile('http://app1.sfda.gov.cn/datasearch/face3/content.jsp\?tableId=36&tableName=TABLE36&Id=(\d+)'),
        })

    if not hasattr(process, '_cache'):
        head, tail = batch_id.split('-')
        setattr(process, '_cache', CachePeriod(batch_id, CACHE_SERVER))

    # parameter format: "method:gap:js:timeout:data" (colon-separated).
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    # Deduct time already spent on other batches; never sleep a negative gap.
    gap = max(gap - other_batch_process_time, 0)

    #if kwargs and kwargs.get("debug"):
    get_logger(batch_id, today_str, '/opt/service/log/').info('start download')

    # Search-form POST body.  (The original literal repeated 'State': '1'
    # seven times; duplicate dict keys collapse, so one entry is enough.)
    data = {
        'tableId': '36',
        'tableName': 'TABLE36',
        'bcId': '124356651564146415214424405468',
        'State': '1',
        'viewtitleName': 'COLUMN361',
        'viewsubTitleName': 'COLUMN354,COLUMN355,COLUMN356,COLUMN823',
        'curstart': '2',  # placeholder; overwritten per page below
    }

    if url == home_page:
        #if kwargs and kwargs.get("debug"):
        page = 1
        while 1:
            time.sleep(gap)
            get_logger(batch_id, today_str, '/opt/service/log/').info('start parsing url at page {}'.format(page))
            data['curstart'] = page
            content = process._downloader.downloader_wrapper('http://app1.sfda.gov.cn/datasearch/face3/search.jsp',
                batch_id,
                gap,
                method='post',
                timeout=timeout,
                refresh=True,
                data=data,
                encoding='utf-8'
            )
            #get_logger(batch_id, today_str, '/opt/service/log/').info(content)
            # Drug ids appear in "进口药品&Id=<digits>" links on the result page.
            ids = re.findall(u'进口药品&Id=(\d+)', content)
            get_logger(batch_id, today_str, '/opt/service/log/').info(ids)
            if not ids:
                # An id-free page marks the end of the result set.
                get_logger(batch_id, today_str, '/opt/service/log/').info('End at {} pages'.format(page))
                break
            get_logger(batch_id, today_str, '/opt/service/log/').info('ids : {}'.format(ids))
            url_pattern = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=36&tableName=TABLE36&Id={}'
            urls = [url_pattern.format(drug_id) for drug_id in ids]

            manager.put_urls_enqueue(batch_id, urls)
            page += 1
            get_logger(batch_id, today_str, '/opt/service/log/').info('going to page{}'.format(page))
        return True

    elif process._reg['detail'].match(url):
        content = process._downloader.downloader_wrapper(
            url,
            batch_id,
            gap,
            timeout=timeout,
            refresh=True
            )
        if content == '':
            return False

        # Fixed-layout attribute table: row positions are hard-coded to the
        # site's detail page.  Each xpath yields a (possibly empty) text list.
        dom = lxml.html.fromstring(content)
        table = dom.xpath('//tr')
        item = {
            'license_number':                   table[1].xpath('./td')[1].xpath('./text()'),           # [u'注册证号']
            'old_license_number':               table[2].xpath('./td')[1].xpath('./text()'),           # [u'原注册证号']
            'packaging_license_number':         table[4].xpath('./td')[1].xpath('./text()'),           # [u'分包装批准文号']
            'company_chs':                      table[5].xpath('./td')[1].xpath('./text()'),           # [u'公司名称(中文)']
            'company_eng':                      table[6].xpath('./td')[1].xpath('./text()'),           # [u'公司名称(英文)']
            'product_name_chs':                 table[11].xpath('./td')[1].xpath('./text()'),          # [u'产品名称(中文)']
            'product_name_eng':                 table[12].xpath('./td')[1].xpath('./text()'),          # [u'产品名称(英文)']
            'commodity_name_chs':               table[13].xpath('./td')[1].xpath('./text()'),          # [u'商品名(中文)']
            'commodity_name_eng':               table[14].xpath('./td')[1].xpath('./text()'),          # [u'商品名(英文)']
            'drug_form':                        table[15].xpath('./td')[1].xpath('./text()'),          # [u'剂型(中文)']
            'specification':                    table[16].xpath('./td')[1].xpath('./text()'),          # [u'规格(中文)']
            'dosage':                           table[17].xpath('./td')[1].xpath('./text()'),          # [u'包装规格(中文)']
            'manufacturer_chs':                 table[18].xpath('./td')[1].xpath('./text()'),          # [u'生产厂商(中文)']
            'manufacturer_eng':                 table[19].xpath('./td')[1].xpath('./text()'),          # [u'生产厂商(英文)']
            'manuf_address_chs':                table[20].xpath('./td')[1].xpath('./text()'),          # [u'厂商地址(中文)']
            'manuf_address_eng':                table[21].xpath('./td')[1].xpath('./text()'),          # [u'厂商地址(英文)']
            'manuf_country_chs':                table[22].xpath('./td')[1].xpath('./text()'),          # [u'厂商国家/地区(中文)']
            'manuf_country_eng':                table[23].xpath('./td')[1].xpath('./text()'),          # [u'厂商国家/地区(英文)']
            'packaging_company_name':           table[26].xpath('./td')[1].xpath('./text()'),          # [u'分包装企业名称']
            'packaging_company_address':        table[27].xpath('./td')[1].xpath('./text()'),          # [u'分包装企业地址']
            'category':                         table[31].xpath('./td')[1].xpath('./text()'),          # [u'产品类别']
            'standard_code':                    table[32].xpath('./td')[1].xpath('./text()'),          # [u'药品本位码']
            'source':                              [url], #设为list格式与之前字段统一,在下面的循环里一并取出
        }

        # Unwrap each one-element text list to a scalar; missing cells -> None.
        for k, v in item.iteritems():
            if len(v) > 0:
                item[k] = v[0]
            else:
                item[k] = None

        return process._cache.post(url, json.dumps(item, ensure_ascii=False))