예제 #1
0
파일: crawler.py 프로젝트: xiocode/xio
def offer_to_sell():
    """Crawl the sell-offer listings of every selected category and save them.

    For each category row returned by the ``pid__ne='0'`` query, listing
    pages are walked by following the first ``next_pattern`` match until a
    page has none. Every offer URL matched by ``sell_url_pattern`` on a
    listing page is fetched, parsed with the module-level regexes and stored
    as a ``tb_sell_info`` row.

    A failure on a single offer is printed, the offer URL is appended to
    ``sell_error.txt`` and crawling continues with the next offer.
    """
    results = tb_category_info.select().where(pid__ne='0').execute()
    for result in results:
        url_path = SELL_BASE_URL + result.cid + '/' + result.url
        while url_path:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement, so this stays valid on either interpreter.
            print(url_path)
            listing_html = session.get(url_path).text

            # Resolve the next page first, but scrape the current page
            # regardless: the last page of a category has no "next" link and
            # its offers must still be collected.
            next_links = next_pattern.findall(listing_html)
            url_path = next_links[0] if next_links else None

            for url in sell_url_pattern.findall(listing_html):
                try:
                    print(url)
                    sell_id, product_name = sell_id_pattern.findall(url)[0]
                    detail_html = session.get(url).text
                    company_id = company_id_pattern.findall(detail_html)[0]
                    product_description = sell_description_pattern.findall(detail_html)[0].strip()
                    sell_info = {
                        'cid': result.cid,
                        'sell_id': sell_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': strip_tags(product_description).strip(),
                    }
                    print(sell_info)
                    tb_sell_info(**sell_info).save()
                except Exception:
                    # Deliberate best effort: one broken offer page must not
                    # stop the crawl, so log it and remember the URL.
                    print(traceback.format_exc())
                    with open('sell_error.txt', 'a') as error_file:
                        error_file.write(url + '\n')
                    print('出错')
예제 #2
0
파일: crawler.py 프로젝트: xiocode/xio
def products_info():
    """Crawl the product listings of every selected category and save them.

    For each category row returned by the ``pid__ne='0'`` query, listing
    pages are walked by following the first ``next_pattern`` match until a
    page has none. Every product URL matched by ``product_url_pattern`` on a
    listing page is fetched, parsed with the module-level regexes and stored
    as a ``tb_product_info`` row.

    A failure on a single product is printed, the product URL is appended to
    ``product_error.txt`` and crawling continues with the next product.
    """
    results = tb_category_info.select().where(pid__ne='0').execute()
    for result in results:
        url_path = PRODUCTS_BASE_URL + result.cid + '/' + result.url
        while url_path:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement, so this stays valid on either interpreter.
            print(url_path)
            listing_html = session.get(url_path).text

            # Resolve the next page first, but scrape the current page
            # regardless: the last page of a category has no "next" link and
            # its products must still be collected.
            next_links = next_pattern.findall(listing_html)
            url_path = next_links[0] if next_links else None

            for url in product_url_pattern.findall(listing_html):
                try:
                    print(url)
                    product_id, product_name = product_id_pattern.findall(url)[0]
                    detail_html = session.get(url).text
                    company_id = company_id_pattern.findall(detail_html)[0]
                    product_description = product_description_pattern.findall(detail_html)[0].strip()
                    product_info = {
                        'cid': result.cid,
                        'product_id': product_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': clean_html(product_description),
                    }
                    print(product_info)
                    tb_product_info(**product_info).save()
                except Exception:
                    # Deliberate best effort: one broken product page must not
                    # stop the crawl, so log it and remember the URL.
                    print(traceback.format_exc())
                    with open('product_error.txt', 'a') as error_file:
                        error_file.write(url + '\n')
                    print('出错')
예제 #3
0
파일: crawler.py 프로젝트: xiocode/xio
def company_crawler():
    """Crawl the company listings of every selected category and save them.

    For each category row returned by the ``pid__ne='0'`` query, listing
    pages are walked by following the first ``next_pattern`` match until a
    page has none. Every company URL matched by ``company_url_pattern`` is
    fetched; the first ``company_info_pattern`` match supplies the name,
    description, profile block and contact fields, which are stored as a
    ``tb_company_info`` row.

    A failure on a single company is printed and crawling continues. When
    the failure is an ``IndexError`` (an expected match or capture group is
    missing), the company URL is also appended to ``error.txt``.
    """
    # (keyword, column) pairs, tested in this order against each profile label;
    # the first keyword contained in the label decides the column.
    profile_fields = (
        ('Business', 'business_type'),
        ('Industry', 'industry_focus'),
        ('Services', 'services_products'),
        ('Year', 'year_established'),
        ('Employees', 'employees'),
        ('Annual', 'annual_revenue'),
        ('Geographic', 'geographic_markets'),
        ('Certificates', 'certificates'),
        ('Brand', 'brand_name'),
    )
    # Columns filled from capture groups 3..11 of company_info_pattern.
    contact_fields = (
        'contact_person',
        'company_address',
        'city',
        'province',
        'country',
        'zip',
        'phone_number',
        'fax_number',
        'homepage',
    )

    results = tb_category_info.select().where(pid__ne='0').execute()
    for result in results:
        url_path = COMPANY_BASE_URL + result.cid + '/' + result.url
        while url_path:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement, so this stays valid on either interpreter.
            print(url_path)
            listing_html = session.get(url_path).text
            next_links = next_pattern.findall(listing_html)
            url_path = next_links[0] if next_links else None

            for url in company_url_pattern.findall(listing_html):
                try:
                    print(url)
                    company_info = {
                        'cid': result.cid,
                        'company_id': company_id_pattern.findall(url)[0],
                    }
                    detail_html = session.get(url).text
                    # Only the first match on the page is used; a page without
                    # a match raises IndexError and is handled below.
                    info = company_info_pattern.findall(detail_html)[0]
                    company_info['company_name'] = info[0]
                    company_info['company_description'] = info[1].strip()

                    for match in company_profile_pattern.findall(info[2]):
                        for keyword, field in profile_fields:
                            if keyword in match[0]:
                                company_info[field] = match[1]
                                break
                        else:
                            # Unknown profile label: show it so the table
                            # above can be extended.
                            print(match)

                    # Index explicitly (rather than zip) so that a match with
                    # too few groups still raises instead of saving a
                    # silently incomplete row.
                    for offset, field in enumerate(contact_fields):
                        company_info[field] = info[3 + offset]

                    print(company_info)
                    tb_company_info(**company_info).save()
                except Exception as e:
                    print(traceback.format_exc())
                    # Test the exception type instead of searching e.message:
                    # that attribute is deprecated, and for exceptions built
                    # from a non-string argument the substring test itself
                    # raised inside this handler and aborted the crawl.
                    if isinstance(e, IndexError):
                        with open('error.txt', 'a') as error_file:
                            error_file.write(url + '\n')
                    print('出错')