Example #1
 def get_collection(self):
     name = config.USERNAME
     sql = "select * from collection where(name='%s') order by id DESC limit 5" % (
         name)
     mysqlClient = MysqlClient()
     find_res = mysqlClient.find_all(sql)
     return find_res
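A note on this pattern: interpolating config.USERNAME into the SQL string with % leaves the query open to injection if the value ever contains a quote. A minimal parameterized sketch, assuming MysqlClient wraps a DB-API driver such as pymysql (the connection parameters here are hypothetical):

import pymysql

def get_collection_safe(name):
    # hypothetical connection parameters; adjust to the real config
    conn = pymysql.connect(host='localhost', user='root',
                           password='secret', database='dict_app')
    try:
        with conn.cursor() as cursor:
            # the driver binds and escapes the value, so quotes in name are safe
            cursor.execute(
                "select * from collection where name = %s "
                "order by id DESC limit 5",
                (name,))
            return cursor.fetchall()
    finally:
        conn.close()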
Example #2
 def __init__(self, master=None):
     self.root = master  # keep a reference to the root window
     self.root.geometry('%dx%d' % (300, 180))  # set window size
     self.username = StringVar()
     self.password = StringVar()
     self.createPage()
     self.mysqlClient = MysqlClient()
Example #3
class Getter():
    def __init__(self):
        self.Mysql = MysqlClient()
        self.crawler = Crawler()
    
    def is_over_threshold(self):
        """
        Determine whether the proxy pool limit has been reached
        """
        return self.Mysql.count() >= POOL_UPPER_THRESHOLD
    
    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies from one crawler source
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    if not self.Mysql.exists(proxy):
                        print(proxy)
                        self.Mysql.add(proxy)
Example #4
 def get_history(self):
     name = config.USERNAME
     sql = "select * from content where(name='%s') order by id DESC limit 10" % (
         name)
     mysqlClient = MysqlClient()
     find_res = mysqlClient.find_all(sql)
     return find_res
Example #5
class RegPage(object):
    def __init__(self, master=None):
        self.root = master  # keep a reference to the root window
        self.root.geometry('%dx%d' % (300, 200))  # set window size
        self.username = StringVar()
        self.password = StringVar()
        self.repassword = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()

    def createPage(self):
        self.page = Frame(self.root)  # create the Frame
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(row=1,
                                                          column=1,
                                                          stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(row=2,
                                                                    column=1,
                                                                    stick=E)
        Label(self.page, text='确认密码: ').grid(row=3, stick=W, pady=10)
        Entry(self.page, textvariable=self.repassword, show='*').grid(row=3,
                                                                      column=1,
                                                                      stick=E)
        # Button(self.page, text='退出', command=self.page.quit).grid(row=3, column=1, stick=E)
        Button(self.page, text='注册', command=self.register).grid(row=4,
                                                                 column=1,
                                                                 stick=E)

    def register(self):
        name = self.username.get()
        password = self.password.get()
        repassword = self.repassword.get()

        if name.strip() == '' or password.strip() == '' or repassword.strip() == '':
            showinfo(title='注册失败', message='账户名或者密码不能为空')
            return

        if password == repassword:
            sql = "select * from user where(name='%s')" % (name)
            find_res = self.mysqlClient.find_one(sql)
            if find_res:
                showinfo(title='错误', message='该用户已存在')
            else:
                sql = "insert into user(name,password) values ('%s','%s')" % (
                    name, password)
                add_res = self.mysqlClient.save(sql)
                if add_res:
                    showinfo(title='注册成功', message='注册成功')
                    self.page.destroy()

                    LoginPage.LoginPage(self.root)
                else:
                    showinfo(title='注册失败', message='注册失败')
        else:
            showinfo(title='错误', message='两次输入的密码不一致')
Example #6
 def __init__(self, master=None):
     Frame.__init__(self, master)
     self.root = master  # keep a reference to the root window
     self.word = StringVar()
     self.mean = StringVar()
     self.createPage()
     self.mysqlClient = MysqlClient()
     self.spider = spider.Spider()
Example #7
class CookiesGenerator(object):
    def __init__(self, website='default'):
        self.website = website
        self.mysql_client = MysqlClient(website)

    def new_cookies(self, username, password):
        """
        Generate new cookies; subclasses must override this
        :param username: account name
        :param password: account password
        :return:
        """
        raise NotImplementedError

    def process_cookies(self, cookies):
        """
        Convert cookies into a plain dict
        :param cookies:
        :return:
        """
        cookies_dict = {}  # renamed from `dict` to avoid shadowing the builtin
        for k, v in cookies.items():
            cookies_dict[k] = v

        print(cookies_dict)
        return cookies_dict

    def run(self):
        """
        Fetch all accounts, then simulate a login with each in turn
        :return:
        """
        account_list = self.mysql_client.get_all()
        # print(account_list)

        for account in account_list:
            if account['valid'] == 0:
                print('正在生成Cookies...', '账号', account['username'], '密码',
                      account['password'])
                result = self.new_cookies(account['username'],
                                          account['password'])
                # cookies fetched successfully
                if result.get('status') == 1:
                    cookies = self.process_cookies(result.get('content'))
                    print('成功获取到Cookies', cookies)
                    self.mysql_client.update_cookies_by_username(
                        account['username'], json.dumps(cookies))
                # wrong password: remove the account
                elif result.get('status') == 2:
                    print(result.get('content'))
                    if self.mysql_client.delete_account(account['username']):
                        print('成功删除账号')
                else:
                    print(result.get('content'))

        print('所有账号都已经成功获取Cookies')
Example #8
 def add_detail_infos(self):
     conn = MysqlClient()
     results = conn.random()
     for result in results:
         response = requests.get(result[2])
         if response.status_code == 200:
             with open('1.jpg', 'wb') as f:
                 print('正在下载封面图片%s' % result[2])
                 f.write(response.content)
             self.fill_infos(result)
Example #9
class CountFrame(Frame):  # subclass of Frame
    def __init__(self, master=None):
        Frame.__init__(self, master)
        self.root = master  # keep a reference to the root window
        self.mysqlClient = MysqlClient()
        self.message = StringVar()
        # self.createPage()

    def createPage(self):
        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.message, width=40).grid(row=2, stick=W)

        Button(self, text='发布', command=self.set_message).grid(row=10,
                                                               column=2,
                                                               stick=E,
                                                               pady=10)

    def set_message(self):
        name = config.USERNAME
        message = self.message.get()
        publishDateStr = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        publishDate = int(time.time())
        sql = "insert into content(name,message,publishDateStr,publishDate) values ('%s','%s','%s','%s')" % (
            name, message, publishDateStr, publishDate)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='发布成功')
        else:
            showinfo(title='失败', message='发布失败')

        return save_res
Example #10
class Tester(object):
    def __init__(self):
        self.mysql = MysqlClient()

    async def test_single_proxy(self, proxy):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.mysql.max(proxy)
                        print('代理可用', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.mysql.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        print('开始测试')
        try:
            proxies = self.mysql.all()
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i+BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('测试发生错误', e.args)
        self.mysql.close()
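The run() loop above drives each batch through loop.run_until_complete (the original also ended the loop with a stray asyncio.open_connection() call, which is a coroutine expecting a host and port and did nothing there). On Python 3.7+ the same batching reads more simply with asyncio.run and asyncio.gather; a sketch assuming the MysqlClient, BATCH_TEST_SIZE, and test_single_proxy defined in this example:

import asyncio

class Tester(object):
    def __init__(self):
        self.mysql = MysqlClient()

    async def test_all(self):
        proxies = self.mysql.all()
        for i in range(0, len(proxies), BATCH_TEST_SIZE):
            batch = proxies[i:i + BATCH_TEST_SIZE]
            # return_exceptions=True keeps one bad proxy from aborting the batch
            await asyncio.gather(
                *(self.test_single_proxy(p) for p in batch),
                return_exceptions=True)
            await asyncio.sleep(5)  # throttle between batches

    def run(self):
        print('开始测试')
        asyncio.run(self.test_all())  # Python 3.7+
        self.mysql.close()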
Example #11
class Tester(object):
    def __init__(self):
        self.Mysql = MysqlClient()
    
    async def test_single_proxy(self, proxy):
        """
        Test a single proxy
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                #print(proxy)
                #print(proxy[0],proxy[1],proxy[2])
                
                real_proxy = "http://{0}:{1}".format(proxy[0], proxy[1])
                print(real_proxy)
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.Mysql.max_(proxy)
                        print('代理可用', proxy)
                    else:
                        self.Mysql.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.Mysql.decrease(proxy)
                print('代理请求失败', proxy)
    
    def run(self):
        """
        Main test loop
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.Mysql.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = list(self.Mysql.batch(start, stop))
                #print(test_proxies,type(test_proxies))
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Example #12
    def add_detail_infos(self):
        conn = MysqlClient()
        results = conn.randombigmoney()

        nums = 0
        for result in results:
            response = requests.get(result[2])
            if response.status_code == 200:
                with open('1.jpg', 'wb') as f:
                    print('下载封面图片%s' % result[2])
                    f.write(response.content)

            self.fill_infos(result)
            nums += 1
            print('*' * 50)
            print('*' * 50)
            print('添加最新口子%s' % nums)
            print('*' * 50)
            print('*' * 50)
Example #13
class Getter():
    def __init__(self):
        self.mysql = MysqlClient()
        self.spider = Spider()

    def is_over_max(self):
        return self.mysql.count() >= MAX_POOL_COUNT

    def run(self):
        print('爬虫程序开始执行')
        if not self.is_over_max():
            for callback_label in range(self.spider.__SpiderFuncCount__):
                callback = self.spider.__SpiderFunc__[callback_label]
                proxies = self.spider.get_proxies(callback)
                for proxy in proxies:
                    self.mysql.add(proxy)
        self.mysql.close()
Example #14
 def run(self):
     print('代理池开始运行')
     mysql = MysqlClient()
     if TESTER_ENABLED:
         tester_process = Process(target=self.schedule_tester)        
         tester_process.start()
     if GETTER_ENABLED:
         getter_process = Process(target=self.schedule_getter)
         getter_process.start()      
     if API_ENABLED:
         api_process = Process(target=self.schedule_api)
         api_process.start()
Example #15
class ValidTester(object):
    def __init__(self, website='default'):
        self.website = website
        self.mysql_client = MysqlClient(website)

    def test(self, username, cookies):
        raise NotImplementedError

    def run(self):
        account_list = self.mysql_client.get_all()
        for account in account_list:
            self.test(account['username'], account['cookies'])
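test() is left abstract, so each site supplies its own validity check. A hypothetical subclass sketch (TEST_LOGIN_URL and the delete_cookies() helper are illustrative assumptions, not part of the original; the cookies are assumed to be stored as JSON, matching the json.dumps in Example #7):

import json
import requests

class SiteValidTester(ValidTester):
    def test(self, username, cookies):
        try:
            response = requests.get(TEST_LOGIN_URL,  # hypothetical login-gated URL
                                    cookies=json.loads(cookies),
                                    timeout=10,
                                    allow_redirects=False)
            if response.status_code == 200:
                print('Cookies still valid for', username)
            else:
                # expired: drop the stored cookies so they get regenerated
                self.mysql_client.delete_cookies(username)  # hypothetical method
        except requests.RequestException:
            print('Test request failed for', username)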
Example #16
 def run(self):
     print('Agent pool starts running')
     logger.log('INFOR', 'Scheduler starts running...')
     mysql = MysqlClient()
     if TESTER_ENABLED:
         tester_process = Process(target=self.schedule_tester)
         tester_process.start()
     if GETTER_ENABLED:
         getter_process = Process(target=self.schedule_getter)
         getter_process.start()
     if API_ENABLED:
         api_process = Process(target=self.schedule_api)
         api_process.start()
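run() starts the worker processes and returns without keeping references to them. A sketch that collects and joins the processes so the parent blocks until the workers exit, under the same TESTER_ENABLED/GETTER_ENABLED/API_ENABLED flags and schedule_* methods:

from multiprocessing import Process

def run(self):
    print('Agent pool starts running')
    processes = []
    if TESTER_ENABLED:
        processes.append(Process(target=self.schedule_tester))
    if GETTER_ENABLED:
        processes.append(Process(target=self.schedule_getter))
    if API_ENABLED:
        processes.append(Process(target=self.schedule_api))
    for p in processes:
        p.start()
    for p in processes:
        p.join()  # wait for each worker to finish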
Example #17
class LoginPage(object):
    def __init__(self, master=None):
        self.root = master  # keep a reference to the root window
        self.root.geometry('%dx%d' % (300, 180))  # set window size
        self.username = StringVar()
        self.password = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()

    def createPage(self):
        self.page = Frame(self.root)  # create the Frame
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(row=1,
                                                          column=1,
                                                          stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(row=2,
                                                                    column=1,
                                                                    stick=E)
        Button(self.page, text='登陆', command=self.loginCheck).grid(row=3,
                                                                   stick=W,
                                                                   pady=10)
        # Button(self.page, text='退出', command=self.page.quit).grid(row=3, column=1, stick=E)
        Button(self.page, text='注册', command=self.register).grid(row=3,
                                                                 column=1,
                                                                 stick=E)

    def loginCheck(self):
        name = self.username.get()
        secret = self.password.get()
        sql = "select * from user where(name='%s' and password='******')" % (
            name, secret)
        find_res = self.mysqlClient.find_one(sql)
        if find_res:
            self.page.destroy()
            MainPage(self.root)
            config.USERNAME = find_res[1]
        else:
            showinfo(title='错误', message='账号或密码错误!')

    def register(self):
        self.page.destroy()
        RegPage(self.root)
Example #18
 def __init__(self):
     self.Mysql = MysqlClient()
Example #19
 def save_to_mysql(self, query, title, url):
     m = MysqlClient()
     m.add(query, title, url)
Example #20
 def save_to_mysql(self, query, title, url):
     m = MysqlClient()
     m.add(query, title, url)
Example #21
 def __init__(self):
     self.download = Download()
     self.db = MysqlClient()
     self.redisClient = RedisClient()
Example #22
class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        #self.get_qu()
        #self.get_zhen()
        # self.push_url_to_redis()
        self.get_position()

    def get_qu(self):
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href'
                )
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()'
                )
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')
                    qu_id = qu_id[2]
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_zhen(self):
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href'
                )
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()'
                )
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')
                    zhen_id = zhen_id[2]
                    sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id,zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_position(self):
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except:
            return None

        if json_obj:
            flag = True
            pageToken = 1

            # handle pagination
            while flag:
                detail_url_list = []
                url = json_obj['url']
                pre_page = re.search(r'/\?page=(.*?)&', url).group(1)
                if int(pageToken) > 10:
                    break
                url = url.replace(
                    'page=' + pre_page + '&sort=2&ka=page-' + pre_page,
                    'page=' + str(pageToken) + '&sort=2&ka=page-' +
                    str(pageToken))
                cityId = json_obj['cityId']
                zhiweiId = json_obj['zhiweiId']
                print(url)
                html = self.download.get_html(url)

                if html is not None and html.status_code == 200:
                    html = HTML(html.text)

                    # if posted today, fetch the detail page; skip records already in the database (for now)
                    li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
                    for li in li_xpath:
                        content = etree.tostring(li)
                        content = HT.unescape(content.decode())
                        content = HTML(content)
                        li_time = content.xpath(
                            'string(//div[@class="info-publis"]/p)')
                        href_url = content.xpath(
                            'string(//div[@class="info-primary"]//h3/a/@href)')
                        try:
                            last_str = li_time.split('发布于')[1]
                            minute = last_str.split(':')[1]
                            # posted today?
                            if minute:
                                # already in the database?
                                try:
                                    cid = re.match(r'^/job_detail/(.*?)\.html',
                                                   href_url).group(1)
                                    sql = "select * from positions where cid='%s'" % (
                                        cid)
                                    find_one_res = self.db.find_one(sql)
                                    if find_one_res is None:
                                        # insert the cid first to avoid duplicate crawls
                                        sql = "insert into positions(cid) values ('%s')" % (
                                            cid)
                                        self.db.save(sql)
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    elif find_one_res[2] is None:
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    else:
                                        print('数据库存在该记录:' + str(cid))
                                except:
                                    print('查询数据库出错:' + str(cid))
                        except:
                            print('该URL发布日期小于当天:' + config.HOST_URL + href_url)

                    results = self.get_detail(detail_url_list, cityId,
                                              zhiweiId)

                    # decide whether to advance to the next page
                    try:
                        last_li = html.xpath(
                            'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)'
                        )
                        last_str = last_li.split('发布于')[1]
                        minute = last_str.split(':')[1]
                        if minute:
                            pageToken = str(int(pageToken) + 1)
                    except:
                        flag = False

                else:
                    print('该url无数据')

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        for url in detail_url_list:
            print('下载该详情页:' + url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)

                try:
                    cid = re.match(
                        r'^https://www.zhipin.com/job_detail/(.*?)\.html',
                        url).group(1)
                except:
                    print('获取cid失败')
                    continue

                title = html.xpath('string(//h1)')
                try:
                    publishDateStr = html.xpath(
                        'string(//span[@class="time"])').split('发布于')[1]
                    publishDate = int(
                        time.mktime(
                            time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
                except:
                    publishDateStr = None
                    publishDate = None

                try:
                    info = html.xpath(
                        'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                    )
                    info = info.split(':')
                    city = info[1][:-2]
                    jingyan = info[2][:-2]
                    xueli = info[3]
                except:
                    city = None
                    jingyan = None
                    xueli = None
                price = html.xpath(
                    'string(//div[@class="info-primary"]//span[@class="badge"])'
                )
                posterName = html.xpath('string(//h2)')
                posterId = None
                posterUrl = html.xpath(
                    'string(//div[@class="detail-figure"]/img/@src)')
                content = html.xpath(
                    'string(//div[@class="job-sec"]/div[@class="text"])'
                ).strip()

                try:
                    company_text = html.xpath(
                        'string(//a[@ka="job-cominfo"]/@href)')
                    companyID = re.match(r'/gongsi/(.*?)\.html',
                                         company_text).group(1)
                except:
                    companyID = None
                createDate = int(time.time())

                # check that it was posted today
                temp_time = time.localtime(int(time.time()))
                now_DateStr = time.strftime("%Y-%m-%d", temp_time)
                lt = time.strptime(now_DateStr, "%Y-%m-%d")
                now_timestamp = int(time.mktime(lt))
                if publishDate is None or publishDate < now_timestamp or publishDate >= (
                        now_timestamp + 86400):
                    print('特例.该url不是当天发布:' + str(url))
                    continue

                res_obj = {
                    'cid': cid,
                    'title': title,
                    'url': url,
                    'publishDateStr': publishDateStr,
                    'publishDate': publishDate,
                    'city': city,
                    'jingyan': jingyan,
                    'xueli': xueli,
                    'price': price,
                    'posterName': posterName,
                    'posterId': posterId,
                    'posterUrl': posterUrl,
                    'content': content,
                    'companyID': companyID,
                    'createDate': createDate,
                    'cityId': cityId,
                    'zhiweiId': zhiweiId
                }
                print(res_obj)
                sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \
                      " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                      % (cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)\
                      + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \
                      %(title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)
                self.db.save(sql)
            else:
                print('请求详情页失败:' + str(url))

    def push_url_to_redis(self):
        # zhiwei_list = []
        # zhiwei_sql = 'select * from zhiwei'
        # zhiwei_results = self.db.find_all(zhiwei_sql)
        # for zhiwei in zhiwei_results:
        #     zhiwei_list.append(zhiwei[2])
        #
        # zhen_sql = 'select * from zhen'
        # zhen_results = self.db.find_all(zhen_sql)
        #
        # for res in zhen_results:
        #     pid = res[1]
        #     zhen_id = res[2]
        #     for zhiwei_id in zhiwei_list:
        #         url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1')
        #         self.redisClient.push('employment',url)

        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])

        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)

        for res in shi_results:
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid,
                                              zhiwei_id=zhiwei_id,
                                              pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
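The upsert strings above splice every value into the SQL text (and originally lacked a space before ON DUPLICATE KEY UPDATE, producing a syntax error), which also breaks as soon as a title contains a quote. A parameterized sketch of the same insert-or-update shape, assuming a DB-API cursor such as pymysql's (column list shortened for illustration):

def upsert_position(cursor, cid, title, url):
    # the driver binds and escapes each value; VALUES(col) reuses the
    # inserted value in the UPDATE branch
    cursor.execute(
        "insert into positions (cid, title, url) values (%s, %s, %s) "
        "on duplicate key update title = VALUES(title), url = VALUES(url)",
        (cid, title, url))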
Example #23
class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        # self.redisClient = RedisClient()

    def run(self):
        bestseller = get_bestseller.Bestseller()
        bestseller.start()
        # for i in range(1,11):
        #     self.get_kw('apple',str(i))

    def get_kw(self, kw, page):
        url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={kw}&page={page}'.format(
            kw=kw, page=page)
        print(url)
        response = self.download.get_html(url)
        if response is not None:
            html = HTML(response.text)
            # titles = html.xpath('//div[@class="a-row a-spacing-small"]//a/h2/text()')
            urls = html.xpath('//div[@class="a-row a-spacing-small"]//a/@href')
            for url in urls:
                if url[:3] == '/gp':
                    url = 'https://www.amazon.com' + url
                detail_response = self.download.get_html(url)
                try:
                    url = re.search('<link rel="canonical" href="(.*?)"',
                                    detail_response.text).group(1)
                except:
                    pass  # keep the original url if no canonical link is found

                detail_html = HTML(detail_response.text)
                product_id = hashlib.md5(url.encode()).hexdigest()
                title = detail_html.xpath('string(//h1[@id="title"])').strip()
                price = detail_html.xpath(
                    'string(//span[@id="priceblock_ourprice"])').replace(
                        ',', '').replace('$', '')
                if price == '':
                    price = 0
                color = detail_html.xpath(
                    'string(//div[@id="variation_color_name"]//span)').strip()
                size = detail_html.xpath(
                    'string(//div[@id="variation_size_name"]//span)').strip()
                commentCount = detail_html.xpath(
                    'string(//span[@id="acrCustomerReviewText"])').split(
                        ' ')[0].replace(',', '')
                if commentCount == '':
                    commentCount = 0
                commentRating = detail_html.xpath(
                    'string(//a[@class="a-popover-trigger a-declarative"]/i/span)'
                ).split(' ')[0]
                if commentRating == '':
                    commentRating = 0
                crawled_timestamp = int(time.time())
                crawled_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                crawled_date = time.strftime("%Y-%m-%d", time.localtime())
                keywordtype = kw

                # ASIN (item number)
                try:
                    asin = re.search(
                        '.*?productDetails_detailBullets_sections1.*?ASIN.*?<td class="a-size-base">(.*?)</td>',
                        detail_response.text, re.S).group(1).strip()
                except:
                    asin = None
                # category rank; initialize both so they are defined even when
                # only one pattern matches
                rank1 = None
                rank2 = None
                try:
                    category_res1 = re.search(
                        '.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?(<span>.*?</span>)',
                        detail_response.text, re.S)
                    category_res2 = re.search(
                        '.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?<span>.*?</span>.*?(<span>.*?</span>).*?</span>',
                        detail_response.text, re.S)
                    if category_res1:
                        # rank_search = re.search('.*?#(.*?)in.*?', category_res1.group(1))
                        # if rank_search:
                        #     rank1 = rank_search.group(1)
                        # else:
                        #     rank1 = None
                        # print(rank1)
                        html = HTML(category_res1.group(1))
                        list_res = html.xpath('//text()')
                        rank1 = ''.join(list_res)
                    if category_res2:
                        html = HTML(category_res2.group(1))
                        list_res = html.xpath('//text()')
                        rank2 = ''.join(list_res)
                except:
                    rank1 = None
                    rank2 = None

                # save image info to the database
                try:
                    imageUrls = []
                    img_res = re.search(
                        r"var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                        detail_response.text, re.S)
                    img_obj = json.loads(img_res.group(1))
                    key_one = list(img_obj['colorImages'].keys())[0]
                    for data in img_obj['colorImages'][key_one]:
                        imageUrls.append(data['large'])

                    for img in imageUrls:
                        img_id = hashlib.md5(img.encode()).hexdigest()
                        img_url = img
                        sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                              % (product_id,img_id,img_url,crawled_timestamp,crawled_time) \
                              + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                        print(sql)
                        self.db.save(sql)
                except:
                    pass

                # save follow-sale (other sellers) info
                have_follow_sale = '0'
                follow_sale_num = 0
                follow_sale_str = detail_html.xpath(
                    'string(//div[@id="olp_feature_div"]/div/span)')
                if follow_sale_str != '':
                    have_follow_sale = '1'
                    follow_sale_num = re.search(r'\((\d+)\)',
                                                follow_sale_str).group(1)

                follow_sale_url = detail_html.xpath(
                    'string(//div[@id="olp_feature_div"]/div/span/a/@href)')
                if follow_sale_url[0:4] != 'http':
                    follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                follow_response = self.get_follow_sale(follow_sale_url,
                                                       follow_sale_num)
                for item in follow_response:
                    follow_sale_id = item['follow_sale_id']
                    price = item['price']
                    seller = item['seller']
                    type = item['type']
                    sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                          % (product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) \
                          + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                    print(sql)
                    self.db.save(sql)

                # save product info to the database
                obj = {
                    'product_id': product_id,
                    'title': title,
                    'url': url,
                    'price': price,
                    'color': color,
                    'size': size,
                    'commentCount': commentCount,
                    'commentRating': commentRating,
                    # 'imageUrls': imageUrls,
                    'crawled_timestamp': crawled_timestamp,
                    'crawled_time': crawled_time,
                    'have_follow_sale': have_follow_sale,
                    'follow_sale_num': follow_sale_num,
                }
                print(obj)
                sql = "insert into keyword_res(product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date,keywordtype) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')"\
                      % (product_id, title, url, price, color, size, commentCount, commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2, crawled_timestamp, crawled_time,crawled_date,keywordtype)\
                      + "ON DUPLICATE KEY UPDATE title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s'"%(title,url,price,commentCount,crawled_timestamp,crawled_time,crawled_date)
                print(sql)
                self.db.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        if follow_sale_num == 0:
            return []
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1

        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            url = url.format(startIndex=startIndex)
            print(url)
            follow_response = self.download.get_html(url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)

            html_list = follow_html.xpath(
                '//div[@class="a-row a-spacing-mini olpOffer"]')
            for html in html_list:
                html = etree.tostring(html).decode()
                html = HTML(html)
                price = html.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)'
                ).strip().replace('$', '')
                seller = html.xpath('string(//h3/span)').strip()
                FBA = html.xpath('string(//div[@class="olpBadge"])')
                type = 'FBM'
                if FBA != '':
                    type = 'FBA'
                follow_sale_id = hashlib.md5(
                    (seller + price + type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': type
                }
                print(obj)
                item_list.append(obj)
        return item_list
Example #24
 def __init__(self, master=None):
     Frame.__init__(self, master)
     self.root = master  # keep a reference to the root window
     self.mysqlClient = MysqlClient()
Example #25
class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()

    def run(self):
        self.get_books()

    def get_books(self):
        kw = input('请输入要查找的书籍(例如:python编程):')
        host_url = 'http://search.dangdang.com/?key={kw}&act=input&page_index={page}'

        # clear existing rows
        sql = 'delete from books'
        self.db.save(sql)


        for i in range(1, 10):
            print('当前页:'+str(i))
            start_url = host_url.format(kw=kw, page=i)
            print(start_url)
            response = self.download.get_html(start_url)
            response.encoding = 'gbk'
            # print(response.text)
            html = HTML(response.text)


            item_xpath_list = html.xpath('//div[@id="search_nature_rg"]/ul/li')

            for item in item_xpath_list:
                url = item.xpath('string(.//a[@name="itemlist-title"]/@href)')
                bookId = re.search(r'http://product.dangdang.com/(\d+).html', url)
                if bookId:
                    bookId = bookId.group(1)
                else:
                    bookId = ''
                title = item.xpath('string(.//a[@name="itemlist-title"]/@title)').strip()
                now_price = item.xpath('string(.//span[@class="search_now_price"]/text())').replace('¥','')
                old_price = item.xpath('string(.//span[@class="search_pre_price"]/text())').replace('¥','')
                discount = item.xpath('string(.//span[@class="search_discount"]/text())').replace('(','').replace(')','').replace('折','').strip()
                commentCount = item.xpath('string(.//a[@class="search_comment_num"]/text())').replace('条评论','')

                author = item.xpath('string(.//p[@class="search_book_author"]/span[1]/a/@title)')
                publishDateStr = item.xpath('string(.//p[@class="search_book_author"]/span[2]/text())').replace('/','').strip()
                publishing = item.xpath('string(.//p[@class="search_book_author"]/span[3]/a/text())')

                # print(url)
                # print(title)
                # print(now_price)
                # print(old_price)
                # print(discount)
                # print(commentCount)
                # print(author)
                # print(publishDateStr)
                # print(publishing)

                sql = "insert into books(bookId,url,title,now_price,old_price,discount,commentCount,publishDateStr,author,publishing)" \
                      " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                      % (bookId,url,title,now_price,old_price,discount,commentCount,publishDateStr,author,publishing) \
                      + "ON DUPLICATE KEY UPDATE title='%s'" % (title)
                print(sql)
                self.db.save(sql)
Example #26
class InputFrame(Frame):  # subclass of Frame
    def __init__(self, master=None):
        Frame.__init__(self, master)
        self.root = master  # keep a reference to the root window
        self.word = StringVar()
        self.mean = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()
        self.spider = spider.Spider()

    def createPage(self,
                   query_res={
                       'fanyi': '',
                       'phonetic': '',
                       'translation': ''
                   }):
        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.word, width=40).grid(row=2, stick=W)

        Label(self, text='结果如下: ').grid(row=4, stick=W, pady=10)
        Label(self,
              text=query_res['fanyi'],
              height=2,
              width=40,
              justify='left').grid(row=5, stick=W, pady=10)
        Label(self,
              text=query_res['phonetic'],
              height=2,
              width=40,
              justify='left').grid(row=6, stick=W, pady=10)
        Label(self,
              text=query_res['translation'],
              height=2,
              width=40,
              justify='left').grid(row=7, stick=W, pady=10)
        # Entry(self, textvariable=self.mean).grid(row=3, column=1, stick=E)
        # Text(self, height=5, width=50).grid(row=4, column=1, stick=W)

        Button(self, text='查询', command=self.query_word).grid(row=10,
                                                              column=1,
                                                              stick=E,
                                                              pady=10)
        Button(self, text='收藏', command=self.collect_word).grid(row=10,
                                                                column=2,
                                                                stick=E,
                                                                pady=10)

    def query_word(self):
        # record the query in history
        word = self.word.get()

        eng = '1'
        # the input is Chinese
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        query_res = self.spider.get_dic(word, eng=eng)
        self.createPage(query_res)

        # add to query history
        name = config.USERNAME
        mean = json.dumps(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into history(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        self.mysqlClient.save(sql)

    def collect_word(self):
        word = self.word.get()

        eng = '1'
        # the input is Chinese
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        query_res = self.spider.get_dic(word, eng=eng)
        self.createPage(query_res)

        # add to my collection
        name = config.USERNAME
        mean = json.dumps(query_res)
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into collection(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='收藏成功')
        else:
            showinfo(title='失败', message='收藏失败!')
Example #27
def get_conn():
    g_Mysql = MysqlClient()
    return g_Mysql
Example #28
 def __init__(self):
     self.mysql = MysqlClient()
     self.spider = Spider()
Example #29
class InputFrame(Frame):  # subclass of Frame
    def __init__(self, master=None):
        Frame.__init__(self, master)
        self.root = master  # keep a reference to the root window
        self.word = StringVar()
        self.mean = StringVar()
        self.createPage()
        self.mysqlClient = MysqlClient()
        self.spider = spider.Spider()

    def createPage(self,
                   query_res={
                       'fanyi': '',
                       'phonetic': '',
                       'translation': ''
                   }):

        # self.fm2 = Frame(self.root)
        # Button(self, text='Left').pack(side=LEFT)
        # Button(self, text='This is the Center button').pack(side=LEFT)
        # Button(self, text='Right').pack(side=LEFT)
        # self.fm2.pack(side=LEFT, padx=10)

        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.word, width=40).grid(row=2, stick=W)

        Label(self, text='结果如下: ').grid(row=4, stick=W, pady=10)
        Label(self,
              text='翻译:' + query_res['fanyi'],
              height=2,
              width=40,
              justify='left').grid(row=5, stick=W, pady=10)
        Label(self,
              text='发音:' + query_res['phonetic'],
              height=2,
              width=40,
              justify='left').grid(row=6, stick=W, pady=10)
        Label(self,
              text='其他:' + query_res['translation'],
              height=2,
              width=40,
              justify='left').grid(row=7, stick=W, pady=10)
        # Entry(self, textvariable=self.mean).grid(row=3, column=1, stick=E)
        # Text(self, height=5, width=50).grid(row=4, column=1, stick=W)

        Button(self, text='查询', command=self.query_word).grid(row=10,
                                                              column=1,
                                                              stick=E,
                                                              pady=10)
        Button(self, text='收藏', command=self.collect_word).grid(row=10,
                                                                column=2,
                                                                stick=E,
                                                                pady=10)

    def query_word(self):
        word = self.word.get()

        # the input is English
        eng = '1'
        # the input is Chinese
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'

        # check the local database first; fall back to the 360 translation API if missing
        sql1 = "select * from words where en_word like '%s'" % (word)
        find_res1 = self.mysqlClient.find_one(sql1)
        sql2 = "select * from words where cn_word like '%s'" % (word)
        find_res2 = self.mysqlClient.find_one(sql2)
        if find_res1:
            query_res = {
                'fanyi': find_res1[2],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        elif find_res2:
            query_res = {
                'fanyi': find_res2[1],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        else:
            query_res = self.spider.get_dic(word, eng=eng)
            if query_res:
                self.createPage(query_res)
            else:
                showinfo(title='查询失败', message='查询失败,请检查您的网络')
                return  # query_res is empty here; the history insert below would fail

        # add to query history
        name = config.USERNAME
        mean = json.dumps(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into history(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        self.mysqlClient.save(sql)

    def collect_word(self):
        word = self.word.get()

        # the input is English
        eng = '1'
        # the input is Chinese
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'

        # check the local database first; fall back to the 360 translation API if missing
        sql1 = "select * from words where en_word like '%s'" % (word)
        find_res1 = self.mysqlClient.find_one(sql1)
        sql2 = "select * from words where cn_word like '%s'" % (word)
        find_res2 = self.mysqlClient.find_one(sql2)
        if find_res1:
            query_res = {
                'fanyi': find_res1[2],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        elif find_res2:
            query_res = {
                'fanyi': find_res2[1],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        else:
            query_res = self.spider.get_dic(word, eng=eng)
            if query_res:
                self.createPage(query_res)
            else:
                showinfo(title='查询失败', message='查询失败,请检查您的网络')
                return  # query_res is empty here; the collection insert below would fail

        # add to my collection
        name = config.USERNAME
        mean = str(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into collection(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='收藏成功')
        else:
            showinfo(title='失败', message='收藏失败!')
Example #30
from mitmproxy import ctx
import json
from db import MysqlClient
from config import *
import re
import time
import smtplib
from email.mime.text import MIMEText

conn = MysqlClient()


def response(flow):
    url = 'https://api.huafer.cc/api/v1/schizo'
    if flow.request.url.startswith(url):
        text = flow.response.text
        result = json.loads(text)
        if result.get('obj') and result.get('obj').get('items'):
            items = result.get('obj').get('items')
            second = 0
            for x in items:
                second += 1
                if second <= 3:
                    if x.get('item') and x.get('counts') and x.get('user'):
                        info1 = x.get('item')
                        info2 = x.get('counts')
                        info3 = x.get('user')
                        sellers = {}
                        stuff_data = {}
                        sellers['STUFFID'] = info1.get('goodsId')  # item id
                        sellers['ADDRESS'] = None  # seller address
Example #31
 def __init__(self, website='default'):
     self.website = website
     self.mysql_client = MysqlClient(website)