Example #1
def main():
    connection.close()  # close the inherited DB connection so each pool worker opens its own
    print_log('[asins_spider.py] Get ASINs from stores... ')
    pool = multiprocessing.Pool(processes=2)  # fewer processes than criticals_spider
    pool.map(get_asins_from_page, get_stores())
    pool.close()
    connection.close()
Example #2
 def save_email(self):  # save the user's email address
     email = self.Content
     print_log('[raven.py] email 的样子:{}'.format(email))
     self.user.email = email
     self.user.save()
     print_log('[raven.py] email 已经储存。')
     return self.reply(
         '您的邮箱地址 {} 已经储存。请确保正确,如欲更换,只需重新回复一次正确邮箱。'.format(email))
Example #3
def get_critical_from_asin(asin):
    '''
    asin is an Asin object, not a string.
    '''
    def get_few_review(soup):  # usually only one such block
        # base_url already points to the critical-review query page, so any
        # review found under it is a critical review, each carrying a
        # data-hook="review" attribute
        reviews = soup.find_all('div',
                                attrs={'data-hook':
                                       'review'})  # returns [] when nothing is found; note it is find_all, with an underscore
        return len(reviews)  # return the number of critical reviews directly; may be 0

    url = urljoin(domains[asin.country], BASE_URL.format(asin.value))
    response, proxy = utils.get_response(url)
    if type(response) != int:
        soup = BeautifulSoup(response, 'lxml')

        # only present when there are critical reviews
        critical_review_list = soup.find(id='cm_cr-review_list')
        if critical_review_list:  # there are critical reviews
            try:
                raw_nums = critical_review_list.find('span',
                                                     class_='a-size-base')
                nums = raw_nums.text.replace(',', '').replace('.', '')
                nums = re.findall(r'\d+', nums)  # raw string avoids an invalid escape sequence
            except AttributeError:
                nums = []  # keep nums defined so max(nums) below raises ValueError
                with open('none.txt', 'a', encoding='utf-8') as f:
                    f.write(str(soup))  # soup must be converted to str before writing
                print_log('[criticals_spider.py] none.txt captured.')
            try:
                num = max(nums, key=int)  # compare numerically; plain max() would compare the strings lexicographically
                print(asin, 'get {} review.'.format(num))
                save_critical(num, asin)
            except ValueError:  # there are reviews, but no critical ones
                print(asin, 'get 0 critical review.')
                save_critical(0, asin)
        elif 'Correios.DoNotSend' in str(soup):
            proxy.fail += 1
            proxy.set_rate()
            proxy.set_stamp()
            proxy.save()
            print(asin, 'is busted.')  # plain print on purpose, not print_log
        else:  # no reviews of any kind on this page
            print(asin, 'get no review at all.')
            save_critical(0, asin)
    else:
        print('ERR CODE:', response, asin)
        if response == 404:  # this asin is no longer valid
            asin.valid = False
            asin.save()
        elif response == 503:
            proxy.fail += 1
            proxy.set_rate()
            proxy.set_stamp()
            proxy.save()
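The count extraction above strips thousands separators, pulls every digit run out of the span text, and keeps the largest one. A minimal standalone sketch of that step, assuming the span text looks roughly like "1-10 of 1,234 reviews" (the exact Amazon wording varies and is only an assumption here):

import re

def extract_review_count(raw_text):
    # raw_text is a hypothetical sample of the span's text
    cleaned = raw_text.replace(',', '').replace('.', '')
    nums = re.findall(r'\d+', cleaned)
    return max(nums, key=int) if nums else '0'

print(extract_review_count('1-10 of 1,234 reviews'))  # -> '1234'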
Example #4
def send_alerts():
    '''
    Iterate over each user's ASINs, merge them into a comparison report, and send it by email.
    '''
    for user in User.objects.filter(subscribe=True):
        content_text, content_html = make_report(user)
        if content_html:  # non-empty report
            if user.email:
                print_log('[criticals_spider.py] Sending alerts to {}...'.format(user.email))
                send_email(user.email, content_text, content_html, subject="矩阵数据提醒您:亚马逊店铺有新的差评,请及时处理")
                Email(address=user.email, content_html=content_html).save()
            else:
                print_log('[criticals_spider.py] {} hasn\'t set an email address.'.format(user))
Example #5
def run_spiders():
    '''
    Start all spiders.
    1. Crawl store ASINs every 48 hours.
    2. Crawl critical reviews every hour. Once there are too many ASINs, this
       will move to a dedicated server that crawls continuously.
    '''
    while True:
        asins_spider.main()  # automatically handles only new stores and stores not updated for a long time
        Asin.objects.all().update(flag=False)
        # set_all_costs()  # re-evaluate once per user
        criticals_spider.main()
        interval = set_interval()
        print_log('[core.py] Wait for {} hour ... '.format(interval))
        time.sleep(interval * 3600 * random.uniform(0.9, 1.2))  # uncertainty
Example #6
def main():
    connection.close()
    print_log('[criticals_spider.py] Get criticals from ASINs...')
    # Tested: with 10 processes the server can barely keep up; anything larger
    # has not been tried, except that 30 processes crashes it outright.
    pool = multiprocessing.Pool(processes=3)
    while Asin.objects.filter(
            flag=False
    ):  # as long as it's not done, start another pass over the remaining ASINs
        pool.map(get_critical_from_asin, get_asins())
    pool.close()
    # print_log('[criticals_spider.py] Entering alert.send_alerts()...')
    alert.send_alerts()  # run once after each full crawl of the data
    print_log('[criticals_spider.py] All done.')
    connection.close()
Example #7
def get_asins_from_page(url):
    '''
    0. How it works: two yields can live in the same generator; once the first
       yield is exhausted, iteration continues with the second. Concretely, the
       page's ASINs are yielded first, and once they are exhausted the link to
       the next page is yielded; the outer function sees that it is a link and
       calls the inner function again, until no next-page link can be found
       (see the sketch after this example).

    1. Stores are crawled on every run. An ASIN is saved if it has not been
       saved before; already saved ASINs are ignored.
    2. If a previously saved ASIN is not found in this crawl, nothing is done.
    3. In criticals_spider a 302 or 404 may occur (untested).
    4. On a 404, the asin is simply set to valid=False and never processed again.
    5. A 302 may redirect to a variant that is still in stock (untested).
    '''
    response, proxy = utils.get_response(url)
    country = utils.get_country_from_url(url)
    if type(response) != int and 'Correios.DoNotSend' not in response:
        soup = BeautifulSoup(response, 'lxml')
        asin_strs = []
        lis = soup.find_all('li')
        for li in lis:
            if li.has_attr('data-asin'):
                asin_str = li['data-asin']
                if asin_str not in asin_strs:
                    asin_strs.append(asin_str)  # duplicates are rare, but check anyway to be safe; may be removed later
        for asin_str in asin_strs:
            if not Asin.objects.filter(value=asin_str, country=country):
                asin = Asin(value=asin_str,
                            country=utils.get_country_from_url(url),
                            store=utils.get_store_by_url(url))
                asin.save()  # this asin is an Asin object, not the same thing as the asin_str string above
                print_log('[asins_spider.py]', asin, 'is newly added.')
            else:
                print('[asins_spider.py]',
                      asin_str + '-' + utils.get_country_from_url(url),
                      'exists already.')
        try:
            time.sleep(5)  # play it safe; this actually runs at a very low frequency
            next_page = urljoin(url, soup.find(id='pagnNextLink')['href'])
            get_asins_from_page(next_page)  # recurse into the next page
        except TypeError:  # last page: find() returned None
            print_log('[asins_spider.py]', url, 'is done.')
    else:
        print('[asins_spider.py]', 'Busted or other error!')
        proxy.fail += 1
        proxy.set_rate()
        proxy.set_stamp()
        proxy.save()
        get_asins_from_page(url)  # crawl again with a different proxy
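The docstring above describes a two-yield generator (yield the page's ASINs first, then the next-page link), while the snippet itself uses plain recursion. A minimal sketch of the described pattern, assuming a BeautifulSoup page object; walk_page is a hypothetical name, not a function from this repository:

def walk_page(soup):
    # first yield every ASIN found on the page ...
    for li in soup.find_all('li'):
        if li.has_attr('data-asin'):
            yield li['data-asin']
    # ... then yield the next-page link (if any), so the caller can fetch
    # that page and call walk_page again until no pagnNextLink is left
    next_link = soup.find(id='pagnNextLink')
    if next_link:
        yield next_link['href']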
Example #8
 def save_store(self):  # save the store URL
     url = self.Content
     print_log('[raven.py] URL 的样子:{}'.format(url))
     me = utils.get_me(url)
     print_log('[raven.py] me 的样子:{}'.format(me))
     if not Store.objects.filter(user=self.user, url=url):  # duplicate submissions are not saved
         store = Store(user=self.user, url=url, me=me,
                       last_update=0)  # store.last_update is 0 at this point
         store.save()
         print_log('[raven.py] 店铺已经保存。')
         return self.reply(
             '您提交的店铺已经保存。回复「店铺」即可查看已经保存的全部店铺。回复「邮箱」可以进一步查看邮箱设置。')
Example #9
def is_store_url(url):
    if url.startswith('https'):
        if 'marketplaceID' in url:
            if 'merchant' in url:
                print_log('[raven.py] 捕获到了店铺。')
                return True
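A short usage sketch; both URLs below are made-up illustrations of what the checks look for (an https link carrying marketplaceID and merchant query parameters), not real store links:

print(is_store_url('https://www.amazon.com/s?merchant=A1XXXXXXXXXX&marketplaceID=ATVPDKIKX0DER'))  # True
print(is_store_url('http://example.com/shop'))  # None: falls through without returning True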
Example #10
 def save_msg(self):
     print_log(str(self.dict))
     return self.reply('您提交的图片我们已收到。我们会尽快进行人工审核。')