Example #1
    def threadjude(self, batch_size=500):
        """
        Judge the proxies in waitjudge concurrently, batch_size threads at a time.
        """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)

        text = self.waitjudge
        num = len(text)
        # Work through the wait list in blocks of at most batch_size threads.
        for block in range(num // batch_size + 1):
            blockthreads = []
            for index in range(block * batch_size,
                               min(num, batch_size * (block + 1))):
                work = threading.Thread(target=self.judgeurl,
                                        args=(
                                            text[index],
                                            index,
                                            0,
                                        ))
                blockthreads.append(work)
            for work in blockthreads:
                work.start()
            for work in blockthreads:
                work.join()
            # Persist usable proxies and drop the dead ones after each block.
            self.dbcanuseproxy()
            self.cleancannotuse()
        self.waitjudge = []
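
The start-all/join-all loop above caps concurrency at batch_size live threads per block. A minimal, self-contained sketch of the same batching pattern; run_in_batches and its (target, args) task tuples are illustrative names, not part of the original code:

import threading

def run_in_batches(tasks, batch_size=500):
    # tasks: list of (callable, args) pairs; at most batch_size threads live at once.
    for start in range(0, len(tasks), batch_size):
        batch = [threading.Thread(target=fn, args=args)
                 for fn, args in tasks[start:start + batch_size]]
        for thread in batch:
            thread.start()
        for thread in batch:
            thread.join()

run_in_batches([(print, ('checking proxy %d' % ii,)) for ii in range(5)], batch_size=2)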
Example #2
    def load_collect(self, page):
        """
        load collect
        """
        version = begin_time()
        if not os.path.exists('%scookie_collect' % data_dir):
            print('TB cookie not exist!!!')
            return
        with codecs.open('%scookie_collect' % data_dir, 'r',
                         encoding='utf-8') as f:
            cookie = f.readline()
        changeCookie(cookie[:-1])  # drop the trailing newline from the cookie line
        changeHtmlTimeout(30)
        for block in range(page // 10 + 1):
            begin = block * 10
            end = min(page, (block + 1) * 10)
            threadings = []
            for index in range(begin, end):
                work = threading.Thread(target=self.load_collect_once,
                                        args=(index, ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()

        # Flatten the per-thread results in page order into one list of lines.
        collect = [self.collect[k] for k in sorted(self.collect.keys())]
        collect = sum(collect, [])
        with codecs.open('%scollect_wyy' % data_dir, 'w',
                         encoding='utf-8') as f:
            f.write("\n".join(collect))
        end_time(version)
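
sum(collect, []) flattens a list of lists, but it builds a new list on every addition, so it is quadratic in the total number of lines; itertools.chain.from_iterable does the same flattening in linear time. A small sketch for comparison:

from itertools import chain

pages = [['a', 'b'], ['c'], ['d', 'e']]
flat = list(chain.from_iterable(pages))  # ['a', 'b', 'c', 'd', 'e']
assert flat == sum(pages, [])            # identical result, linear vs. quadratic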
Example #3
    def get_movie_lists(self):
        ''' crawl the Douban movie & TV list pages, tab queries, and rank pages '''

        version = begin_time()
        # Phase 1: one thread per (type, tag, sort, offset) list-page request.
        movie_get = []
        for kk in range(0, 1100, 100):
            for jj in self.sort_list:
                for ii in self.tag_movie:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('movie', ii, jj, kk,)))
                for ii in self.tag_tv:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('tv', ii, jj, kk,)))
        shuffle_batch_run_thread(movie_get, 500, True)
        # Retry any list pages that failed on the first pass.
        again_list = [threading.Thread(target=self.get_movie_lists_once, args=(
            ii[0], ii[1], ii[2], ii[3],)) for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 500, True)
        self.again_list = []
        echo(1, len(self.movie_id2name.keys()))

        changeHtmlTimeout(40)
        # Phase 2: one thread per (sort, tag, genre, year-range, offset) tab query.
        movie_get = []
        tag_categories = self.tag_categories
        for mm in range(0, 10000, 1000):
            for tags in tag_categories[0][1:]:
                for genres in tag_categories[1][1:]:
                    for ii, jj in self.yearMap.values():
                        year_range = '{},{}'.format(ii, jj)
                        for sorts in self.tabs:
                            movie_get.append(threading.Thread(
                                target=self.get_movie_list_from_tabs, args=(sorts, tags, genres, year_range, mm,)))
        echo(2, 'Thread Num:', len(movie_get))
        shuffle_batch_run_thread(movie_get, 900, True)
        # Retry failures: 5-tuples come from the tab crawl, 4-tuples from the list crawl.
        again_list = [threading.Thread(target=self.get_movie_list_from_tabs, args=(
            ii[0], ii[1], ii[2], ii[3], ii[4],)) if len(ii) == 5 else threading.Thread(target=self.get_movie_lists_once, args=(
                ii[0], ii[1], ii[2], ii[3],)) for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 900, True)
        time.sleep(120)
        changeJsonTimeout(10)
        # Phase 3: rank pages; top250 spans three offsets (0, 100, 200).
        for ii in self.rank_list:
            self.get_movie_rank(ii, 0)
            if ii == 'top250':
                self.get_movie_rank(ii, 100)
                self.get_movie_rank(ii, 200)

        movie_list = self.movie_id2name.keys()
        output_path = '{}douban_movie_id'.format(data_dir)
        with open(output_path + '.txt', 'w') as f:
            f.write('\n'.join([str(ii) for ii in movie_list]))
        dump_bigger(self.movie_id2name, output_path + '.pkl')

        movie_num = len(movie_list)
        echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            movie_num, output_path, end_time(version, 0)))
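
shuffle_batch_run_thread comes from the surrounding repository and its body is not shown here; judging by how it is called, it likely shuffles the thread list and then runs it in start/join batches. A plausible sketch under that assumption (the signature and behavior are inferred, not confirmed):

import random
import threading

def shuffle_batch_run_thread(threads, batch_size, is_shuffle=False):
    # Assumed behavior: optionally shuffle, then start/join in fixed-size batches.
    if is_shuffle:
        random.shuffle(threads)
    for start in range(0, len(threads), batch_size):
        batch = threads[start:start + batch_size]
        for thread in batch:
            thread.start()
        for thread in batch:
            thread.join()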
Example #4
    def thread_judge(self, batch_size: int = 500):
        """ judge the proxies in waitjudge concurrently """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)

        # One shared pool; each block submits at most batch_size jobs.
        proxy_exec = ThreadPoolExecutor(max_workers=batch_size // 2)
        text = self.waitjudge
        num = len(text)
        for block in range(num // batch_size + 1):
            proxy_th = [
                proxy_exec.submit(self.judge_url, jj, ii, 0)
                for ii, jj in enumerate(text[block * batch_size:batch_size *
                                             (block + 1)])
            ]
            list(as_completed(proxy_th))  # block until the whole batch finishes
            self.db_can_use_proxy()
            self.clean_cannot_use()
        self.waitjudge = []
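
Compared with Example #1, this version hands batching to concurrent.futures. A standalone sketch of the submit/as_completed idiom it relies on; check_proxy is a hypothetical stand-in for judge_url:

from concurrent.futures import ThreadPoolExecutor, as_completed

def check_proxy(proxy):
    # Hypothetical stand-in for judge_url: pretend every proxy works.
    return proxy, True

proxies = ['1.2.3.4:8080', '5.6.7.8:3128']
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(check_proxy, pp) for pp in proxies]
    for future in as_completed(futures):  # yields each future as it finishes
        proxy, usable = future.result()
        print(proxy, 'usable' if usable else 'dead')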
Example #5
    def update_view(self):
        changeHtmlTimeout(10)
        wait_map = {}
        self.select_all()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        # Queue a row for update only when its view count or article id changed.
        for index in self.zhihu_views.keys():
            if self.zhihu_views[index] == self.exist_data[index][
                    3] and self.zhihu_id[index] == self.exist_data[index][6]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][3] = self.zhihu_views[index]
            wait_map[index][6] = self.zhihu_id[index]
        for index in self.csdn_views.keys():
            if self.csdn_views[index] == self.exist_data[index][
                    4] and self.csdn_id[index] == self.exist_data[index][7]:
                continue
            if index not in wait_map:
                wait_map[index] = self.exist_data[index]
            wait_map[index][4] = self.csdn_views[index]
            wait_map[index][7] = self.csdn_id[index]
        for index in self.jianshu_views.keys():
            if self.jianshu_views[index] == self.exist_data[index][
                    5] and self.jianshu_id[index] == self.exist_data[index][8]:
                continue
            if index not in wait_map:
                wait_map[index] = self.exist_data[index]
            wait_map[index][5] = self.jianshu_views[index]
            wait_map[index][8] = self.jianshu_id[index]
        update_list = [tuple(index) for index in wait_map.values()]
        if not len(update_list):
            return
        results = self.Db.update_db(self.update_sql % str(update_list)[1:-1])
        if results:
            print('Update ' + str(len(update_list)) + ' Success!')
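
Building the UPDATE statement with self.update_sql % str(update_list)[1:-1] splices values into SQL by string formatting, which breaks on quoted text and invites injection. A hedged sketch of the parameterized alternative with a DB-API cursor; the table and columns here are illustrative, not the original schema:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE views (id INTEGER PRIMARY KEY, zhihu_view INTEGER)')
conn.executemany('INSERT INTO views VALUES (?, ?)', [(1, 0), (2, 0)])

update_list = [(10, 1), (20, 2)]  # (new view count, row id)
conn.executemany('UPDATE views SET zhihu_view = ? WHERE id = ?', update_list)
conn.commit()
print(conn.execute('SELECT * FROM views').fetchall())  # [(1, 10), (2, 20)]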
Example #6
    def match_goods(self):
        ''' match goods lines to their goods ids and write goods_one '''

        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': '',
            'Content-Type': get_content_type(),
            'Accept': get_accept('xhr')
        }

        version = begin_time()
        changeHtmlTimeout(30)
        block_size = 10
        if not os.path.exists('%sgoods' % data_dir):
            print('goods file not exist!!!')
            return
        with codecs.open('%sgoods' % data_dir, 'r', encoding='utf-8') as f:
            wait_goods = f.readlines()
        # Pull the first URL from each line; lines containing '【' are skipped.
        goods_url = [
            re.findall('http.* ', index)[0].strip().replace('https', 'http')
            if 'http' in index and '【' not in index else False
            for index in wait_goods
        ]

        if not os.path.exists('%scollect_wyy' % data_dir):
            print('collect file not exist!!!')
            return
        with codecs.open('%scollect_wyy' % data_dir, 'r',
                         encoding='utf-8') as f:
            collect = f.readlines()
        # Map each collect line's second '||' field (title) to its first field.
        self.title2map = {
            index.split("||")[1]: index.split("||")[0]
            for index in collect
        }

        threadings = []
        for index, url in enumerate(goods_url):
            if url is False:
                continue
            work = threading.Thread(target=self.get_goods_id_first,
                                    args=(
                                        url,
                                        index,
                                    ))
            threadings.append(work)
        url_len = len(threadings)
        for index in range((url_len - 1) // block_size + 1):
            begin_id = index * block_size
            end_id = min(url_len, (index + 1) * block_size)
            threadings_block = threadings[begin_id:end_id]

            for work in threadings_block:
                work.start()
            for work in threadings_block:
                work.join()

            time.sleep(random.randint(0, 9))  # random pause between blocks

        # Prefix each line with its matched goods id when one was found.
        write_body = [
            ' '.join([self.goods_map[index], body]) if index in self.goods_map
            else (' '.join([self.url2goods[goods_url[index]], body])
                  if goods_url[index] in self.url2goods else body)
            for index, body in enumerate(wait_goods)
        ]
        with codecs.open('%sgoods_one' % data_dir, 'w', encoding='utf-8') as f:
            f.write(''.join(write_body))
        end_time(version)
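
re.findall('http.* ', index)[0] raises IndexError on a line that contains 'http' but has no space after the URL, and replace('https', 'http') also rewrites any later 'https' inside the URL. A defensive sketch that keeps the same first-URL-or-False contract:

import re

def extract_url(line):
    # Match an http(s) URL up to the next whitespace; no trailing space required.
    match = re.search(r'https?://\S+', line)
    if match is None or '【' in line:
        return False
    return match.group(0).replace('https', 'http', 1)

print(extract_url('item one https://example.com/a buy now'))  # http://example.com/a
print(extract_url('no link here'))                            # False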
Example #7
    def match_goods(self):
        ''' match goods lines to their goods ids and write goods_one '''

        self.headers = {
            'pragma': 'no-cache',
            'X-Requested-With': 'XMLHttpRequest',
            'cache-control': 'no-cache',
            'Cookie': '',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': '',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
        }

        version = begin_time()
        changeHtmlTimeout(30)
        block_size = 10
        if not os.path.exists('%sgoods' % data_dir):
            print('goods file not exist!!!')
            return
        with codecs.open('%sgoods' % data_dir, 'r', encoding='utf-8') as f:
            wait_goods = f.readlines()
        goods_url = [
            re.findall('http.* ', index)[0].strip().replace('https', 'http')
            if 'http' in index and '【' not in index else False
            for index in wait_goods
        ]

        if not os.path.exists('%scollect_wyy' % data_dir):
            print('collect file not exist!!!')
            return
        with codecs.open('%scollect_wyy' % data_dir, 'r',
                         encoding='utf-8') as f:
            collect = f.readlines()
        self.title2map = {
            index.split("||")[1]: index.split("||")[0]
            for index in collect
        }

        threadings = []
        for index, url in enumerate(goods_url):
            if url is False:
                continue
            work = threading.Thread(target=self.get_goods_id_first,
                                    args=(
                                        url,
                                        index,
                                    ))
            threadings.append(work)
        url_len = len(threadings)
        for index in range((url_len - 1) // block_size + 1):
            begin_id = index * block_size
            end_id = min(url_len, (index + 1) * block_size)
            threadings_block = threadings[begin_id:end_id]

            for work in threadings_block:
                work.start()
            for work in threadings_block:
                work.join()

            time.sleep(random.randint(0, 9))

        write_body = [
            ' '.join([self.goods_map[index], body]) if index in self.goods_map
            else (' '.join([self.url2goods[goods_url[index]], body])
                  if goods_url[index] in self.url2goods else body)
            for index, body in enumerate(wait_goods)
        ]
        with codecs.open('%sgoods_one' % data_dir, 'w', encoding='utf-8') as f:
            f.write(''.join(write_body))
        end_time(version)
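
The headers above imitate an XHR from desktop Chrome. A minimal sketch of sending them with the third-party requests library; the URL is a placeholder, and the original code presumably routes its traffic through its own fetch helpers:

import requests

headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
response = requests.get('http://example.com/api', headers=headers, timeout=30)
print(response.status_code)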