Exemplo n.º 1
0
class UrlManager(object):
    """Builds the 1688.com URLs to crawl and reads back already-crawled ones."""

    def __init__(self):
        # DataManager supplies the locally stored member ids / shop urls.
        self.data_manager = DataManager()

    # Build the company yellow-pages / shop contact URLs.
    def tel_url(self):
        """Return (contact_page_urls, merchants_page_urls, contactinfo_page_urls).

        Reads member ids and shop base urls from local storage and joins them
        with the fixed 1688.com page paths.
        """
        memberId_list, shop_url = self.data_manager.read_local()
        url_list = ['https://corp.1688.com/page/index.htm?memberId=' + str(memberId)
                    + '&fromSite=company_site&tab=companyWeb_contact'
                    for memberId in memberId_list]
        shopurl1_list = [shopurl + '/page/merchants.htm' for shopurl in shop_url]
        # Fixed: the original had a doubled '?' ('.htm??smToken=...'), which makes
        # the first query parameter key '?smToken' instead of 'smToken'.
        # NOTE(review): smToken/smSign look like hard-coded session tokens — they
        # will expire; confirm they are refreshed elsewhere.
        shopurl2_list = [shopurl + '/page/contactinfo.htm?smToken=d6f92a6aadd34fa3aef88809a6d9f7d1&smSign=ADUiGA9MZ4pScu4JQD9FWg%3D%3D'
                         for shopurl in shop_url]
        return url_list, shopurl1_list, shopurl2_list

    # Read back the urls crawled in previous runs so they are not fetched twice.
    def crawred_url(self):
        """Return the list of already-crawled urls recorded in the local CSV.

        Returns an empty list when the record file does not exist yet.  Only
        file-open errors are swallowed; a malformed CSV now surfaces instead of
        being silently reported as "no record file".
        """
        crawred_url = []
        try:
            with open('D:\\data\\1688factory_tel.csv') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    crawred_url.append(row['url'])
        except (IOError, OSError):
            # First run: no record file exists yet.
            print('未发现电话记录保存文件')
        return crawred_url
Exemplo n.º 2
0
def run():
    # Entry point: (originally) crawl all 1688.com factory listing pages, parse
    # them and save the results locally; the crawl loop is currently commented
    # out, so only the locally stored member ids are read back and printed.
    # Page fetcher.  NOTE(review): 'Spiser' looks like a typo for 'Spider' —
    # confirm it matches the class name defined elsewhere in the project.
    factory_spider = Spiser()
    # JSON response parser.
    json_parser = dataParser.DataParser()
    data_manager = DataManager()
    # # Total number of pages: 250
    # total_page = 251
    # # Fetch the data page by page
    # for i in range(total_page):
    #     print i
    #     pagedata = factory_spider.get_pageData(i)
    #     if pagedata is not None:
    #         factory_list = json_parser.json_parser(pagedata)
    #         data_manager.save_local(factory_list)
    #     time.sleep(1.5)
    # Read back and print what was saved locally.
    memberId_list = data_manager.read_local()
    for i in memberId_list:
        print i