Example No. 1
    def goubanjia(self):
        """
        :-1: HTML tags are mixed with invalid data.
        :100: Most importantly, the port is written in the 'class' attribute rather than in the text.
        The website is difficult to crawl, but the proxies are very good.
        goubanjia proxy http://www.goubanjia.com
        """

        version = begin_time()
        host = "http://www.goubanjia.com"
        html = self.proxy_req(host, 0)

        if not html:
            return []
        trs = html.find_all("tr", class_=["warning", "success"])
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[2].find_all("a")[0].text + "://"
            # the visible IP is split across div/span children that carry
            # no class attribute; class_=False matches exactly those tags
            iplist = tds[0].find_all(["div", "span"], class_=False)
            for index in iplist:
                ip += index.text
            # the real port hides in the second class of the "port" tag:
            # letters map to digits (A=0 ... I=8), and the resulting number
            # divided by 8 is the actual port
            encode = tds[0].find_all(["div", "span", "p"],
                                     class_="port")[0]["class"][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord("A")), encode, 0)
            self.waitjudge.append(ip + ":" + str(uncode // 8))
        self.thread_judge()
        end_time(version, 2)
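For intuition, here is a minimal standalone sketch of the class-attribute port decoding used above; the encoded string 'GEA' is made up for illustration:

import functools

def decode_port(encoded):
    # each letter maps to a digit (A=0, B=1, ..., I=8); the digits form a
    # base-10 number, and the real port is that number divided by 8
    value = functools.reduce(
        lambda acc, ch: acc * 10 + (ord(ch) - ord('A')), encoded, 0)
    return value // 8

print(decode_port('GEA'))  # digits 6, 4, 0 -> 640 -> 640 // 8 = 80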
Example No. 2
    def load_collect(self, page):
        """
        load collect
        """
        version = begin_time()
        if not os.path.exists('%scookie_collect' % data_dir):
            print('TB cookie does not exist!!!')
            return
        with codecs.open('%scookie_collect' % data_dir, 'r',
                         encoding='utf-8') as f:
            cookie = f.readline()
        changeCookie(cookie[:-1])
        changeHtmlTimeout(30)
        for block in range(page // 10 + 1):
            begin = block * 10
            end = min(page, (block + 1) * 10)
            threadings = []
            for index in range(begin, end):
                work = threading.Thread(target=self.load_collect_once,
                                        args=(index, ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()

        collect = [self.collect[k] for k in sorted(self.collect.keys())]
        collect = sum(collect, [])
        with codecs.open('%scollect_wyy' % data_dir, 'w',
                         encoding='utf-8') as f:
            f.write("\n".join(collect))
        end_time(version)
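The block loop above caps concurrency at ten threads per batch. A minimal self-contained sketch of the same pattern, with a hypothetical placeholder worker:

import threading

def run_in_batches(total, batch_size, worker):
    # start at most batch_size threads at a time and wait for each
    # batch to finish before launching the next
    for begin in range(0, total, batch_size):
        end = min(total, begin + batch_size)
        threads = [threading.Thread(target=worker, args=(ii,))
                   for ii in range(begin, end)]
        for work in threads:
            work.start()
        for work in threads:
            work.join()

run_in_batches(25, 10, lambda ii: print('task', ii))  # hypothetical worker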
Example No. 3
    def get_detail(self):
        """
        get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
        """

        version = begin_time()
        threadings = []
        with codecs.open('bjh_href_poison.txt', 'r', encoding='utf-8') as f:
            href_list = f.readlines()
        for index, url in enumerate(href_list):
            work = threading.Thread(
                target=self.detail_once, args=(index, url,))
            threadings.append(work)

        for work in threadings:
            # time.sleep(.5)
            work.start()
        for work in threadings:
            work.join()
        word_list = [self.word_list[k] for k in sorted(self.word_list.keys())]
        with codecs.open('bjh_detail_poison', 'w', encoding='utf-8') as f:
            f.write("\n".join(word_list))
        self.failuredmap = {}
        with codecs.open('bjh.log', 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.fail))
        self.fail = []
        end_time(version)
Example No. 4
    def get_song_detail_thread(self):
        """
        spawn threads to fetch song details
        """

        version = begin_time()
        for classify in self.classifylist:
            ids = self.get_list_ids(classify)
            threadings = []
            for oneid in ids:
                work = threading.Thread(target=self.get_song_detail,
                                        args=(oneid[1], ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.clean_data()
            self.test_song(classify, ids)
            self.songlist = []
            self.songmap = {}
            self.finishlist = []
            self.successtime = 0
            print(classify + ' Over!')
        end_time(version)
Example No. 5
    def goubanjia(self):
        """
        :-1: HTML tags are mixed with invalid data.
        :100: Most importantly, the port is written in the 'class' attribute rather than in the text.
        The website is difficult to crawl, but the proxies are very good.
        goubanjia proxy http://www.goubanjia.com
        """

        version = begin_time()
        host = 'http://www.goubanjia.com'
        html = self.proxy_req(host, 0)

        if not html:
            return []
        trs = html.find_all('tr', class_=['warning', 'success'])
        for tr in trs:
            tds = tr.find_all('td')
            ip = tds[2].find_all('a')[0].text + '://'
            # the visible IP is split across div/span children that carry
            # no class attribute; class_=False matches exactly those tags
            iplist = tds[0].find_all(['div', 'span'], class_=False)
            for index in iplist:
                ip += index.text
            # the real port hides in the second class of the 'port' tag:
            # letters map to digits (A=0 ... I=8), and the resulting number
            # divided by 8 is the actual port
            encode = tds[0].find_all(['div', 'span', 'p'],
                                     class_='port')[0]['class'][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord('A')), encode, 0)
            self.waitjudge.append(ip + ':' + str(uncode // 8))
        self.threadjude()
        end_time(version)
Example No. 6
    def get_summarization(self):
        """
        get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
        """

        version = begin_time()
        threadings = []
        for index in range(75):
            work = threading.Thread(
                target=self.summarization_once, args=(index,))
            threadings.append(work)

        for work in threadings:
            # time.sleep(.5)
            work.start()
        for work in threadings:
            work.join()
        # self.text_map = self.total_map[0]

        # for index in list(range(1, len(self.total_map))):
        #     for ids in self.total_map[index]:
        #         if ids in self.text_map:
        #             self.text_map[ids] += self.total_map[index][ids]
        #         else:
        #             self.text_map[ids] = self.total_map[index][ids]
        # print(sum(self.text_map))
        word = [self.word[k] for k in sorted(self.word.keys())]
        with codecs.open('test', 'w', encoding='utf-8') as f:
            f.write("\n".join(word))
        end_time(version)
Example No. 7
    def get_summarization(self):
        """
        get summarization from https://www.google.com.hk/search?q=%E6%AF%92%E7%8B%97%E8%82%89&newwindow=1&safe=strict&tbm=nws&ei=FK1KXJ3EJbWx0PEPytmq2AI&start=0&sa=N&ved=0ahUKEwidnv-7p4jgAhW1GDQIHcqsCis4ChDy0wMIRw&biw=1627&bih=427&dpr=2
        """

        version = begin_time()
        threadings = []
        for index in range(25):
            work = threading.Thread(
                target=self.summarization_once, args=(index,))
            threadings.append(work)

        for work in threadings:
            time.sleep(1)
            work.start()
        for work in threadings:
            work.join()

        summarizations = [self.summarizations[k]
                          for k in sorted(self.summarizations.keys())]
        self.summarizations = sum(summarizations, [])

        hrefs = [self.hrefs[k] for k in sorted(self.hrefs.keys())]
        self.hrefs = sum(hrefs, [])
        with codecs.open('google_steal.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.summarizations))
        with codecs.open('google_steal_href.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.hrefs))
        end_time(version)
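The sorted-keys plus sum(..., []) idiom above restores request order from results filled concurrently by the threads. A minimal sketch of just that gathering step, with hypothetical per-thread output:

results = {2: ['c'], 0: ['a', 'b'], 1: []}  # index -> list, filled by workers
ordered = [results[k] for k in sorted(results.keys())]
flat = sum(ordered, [])  # flatten the list of lists, preserving index order
print(flat)  # ['a', 'b', 'c']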
Example No. 8
 def search_goods(self):
     version = begin_time()
     if not os.path.exists('%swait' % data_dir):
         print('wait file does not exist!!!')
         return
     with codecs.open('%swait' % data_dir, 'r', encoding='utf-8') as f:
         wait = f.readlines()
     threadings = []
     for index, goods_name in enumerate(wait):
         work = threading.Thread(target=self.search_goods_once,
                                 args=(
                                     goods_name[:-1],
                                     index,
                                 ))
         threadings.append(work)
     for work in threadings:
         work.start()
         time.sleep(random.randint(5, 9))
     for work in threadings:
         work.join()
     goods_name = [
         self.goods_name[k] for k in sorted(self.goods_name.keys())
     ]
     with codecs.open('%swait_goods' % data_dir, 'w',
                      encoding='utf-8') as f:
         f.write('\n'.join(goods_name))
     end_time(version)
Example No. 9
    def load_goods(self):
        """
        load goods
        """
        version = begin_time()
        if not os.path.exists('%scookie' % data_dir):
            print('Youdao Note cookie does not exist!!!')
            return
        with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
            cookie = f.readline()
        changeCookie(cookie[:-1])

        threadings = []
        for index, tid in enumerate(self.request_list):
            work = threading.Thread(target=self.load_goods_once,
                                    args=(
                                        index,
                                        tid,
                                    ))
            threadings.append(work)

        for work in threadings:
            work.start()
        for work in threadings:
            work.join()

        goods = [self.goods[k] for k in sorted(self.goods.keys())]
        goods = sum(goods, [])
        with codecs.open('%sgoods' % data_dir, 'w', encoding='utf-8') as f:
            f.write("\n".join(goods))
        end_time(version)
Example No. 10
    def participles_word(self):
        """
        word segmentation with pkuseg
        """
        version = begin_time()

        for file in self.filelists:
            pkuseg.test(file, file[:-4] + '_pkuseg.txt',
                        model_name='../Model_retrieval/pkuseg', nthread=20)
        end_time(version)
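pkuseg.test batch-segments a whole file with nthread workers; the library also exposes an in-memory API for single strings. A minimal sketch with the default model (no custom model_name):

import pkuseg

seg = pkuseg.pkuseg()  # default model; the example above loads a custom one
print(seg.cut('我爱北京天安门'))  # -> ['我', '爱', '北京', '天安门']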
Example No. 11
 def pre_data_list(self, do_pre):
     version = begin_time()
     if do_pre == 1:
         self.load_all(0)
         self.load_all(1)
     elif do_pre == 2:
         self.load_all_pickle(0)
         self.load_all_pickle(1)
     else:
         self.load_basic(1)
     end_time(version)
Example No. 12
 def press_threading(self, url, qps, types):
     """
      press the URL at a constant QPS
     """
     version = begin_time()
     threadings = []
     for index in range(qps):
         work = threading.Thread(
             target=self.basic_press, args=(url, 0, types))
         threadings.append(work)
     for work in threadings:
         work.start()
     for work in threadings:
         work.join()
     end_time(version)
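One thread is spawned per unit of target QPS, so the pacing has to come from each worker's own loop (here, basic_press). A minimal self-contained sketch with a hypothetical worker that issues roughly one request per second:

import threading
import time

def press(url, qps, worker):
    # one long-lived worker thread per unit of target QPS
    threads = [threading.Thread(target=worker, args=(url,)) for _ in range(qps)]
    for work in threads:
        work.start()
    for work in threads:
        work.join()

def one_per_second(url):
    for _ in range(3):  # hypothetical worker: three paced requests
        print('GET', url)
        time.sleep(1)

press('http://example.com', 2, one_per_second)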
Example No. 13
    def have_places(self):
        """
        poll the course system until places open up
        """
        version = begin_time()
        have_places = False

        while not have_places:
            if self.have_places_once():
                send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
                send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
                send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
                have_places = True
            time.sleep(random.randint(10, 20))
        end_time(version)
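The loop above is a generic poll-and-sleep pattern. Extracted into a helper (a sketch; the check function and sleep bounds are parameters), poll_until(self.have_places_once) would reproduce the waiting part, leaving the e-mail notification to the caller:

import random
import time

def poll_until(check, min_wait=10, max_wait=20):
    # call check() repeatedly, sleeping a random interval between
    # attempts, until it reports success
    while not check():
        time.sleep(random.randint(min_wait, max_wait))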
Example No. 14
def load_index():
    ''' load index '''
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    if not text:  # also guards against a None response
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)

    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_another]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_again]
    shuffle_batch_run_thread(threading_list, 100)
    # deduplicate the movie list
    movie_list = set(movie_list)
    # export the crawled movie list
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))
    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            url_num, movie_num, out_path, end_time(version, 0)))
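shuffle_batch_run_thread is a repo-local helper whose body is not shown here. Judging only by its name and call sites, it plausibly shuffles the thread list and then runs it in fixed-size batches; a guessed sketch under that assumption (the third flag's meaning is unknown and is kept only for signature compatibility):

import random
import threading

def shuffle_batch_run_thread(threads, batch_size, is_await=False):
    # assumed behavior: randomize order, then start/join in batches
    random.shuffle(threads)
    for begin in range(0, len(threads), batch_size):
        batch = threads[begin:begin + batch_size]
        for work in batch:
            work.start()
        for work in batch:
            work.join()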
Example No. 15
    def kuaidaili(self, page: int):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread, args=(index,))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)
Example No. 16
    def test_db(self, types: int):
        """ test proxy in db can use """

        version = begin_time()
        typestr = ""
        if types == 2:
            typestr = "(0,1,2,3)"
        elif types == 1:
            typestr = "(1,3)"
        else:
            typestr = "(0,2)"
        results = self.Db.select_db(self.select_all % typestr)
        if results:
            for index in results:
                self.waitjudge.append(index[0])
            self.thread_judge()
        self.init_proxy()
        end_time(version, 2)
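The if/elif chain above is just a lookup from types to a SQL IN-clause fragment; an equivalent dict-based sketch:

# 2 -> all proxy types, 1 -> https-capable, anything else -> http-only (as above)
TYPE_TO_CLAUSE = {2: '(0,1,2,3)', 1: '(1,3)'}
typestr = TYPE_TO_CLAUSE.get(types, '(0,2)')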
Example No. 17
    def sixsixip(self, area: int, page: int):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo("2|debug", "{} {}".format(index, pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)
Example No. 18
    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo(2, str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)
Example No. 19
    def get_movie_lists(self):
        ''' get movie list '''

        version = begin_time()
        movie_get = []
        for kk in range(0, 1100, 100):
            for jj in self.sort_list:
                for ii in self.tag_movie:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('movie', ii, jj, kk,)))
                for ii in self.tag_tv:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('tv', ii, jj, kk,)))
        shuffle_batch_run_thread(movie_get, 500, True)
        again_list = [threading.Thread(target=self.get_movie_lists_once, args=(
            ii[0], ii[1], ii[2], ii[3],)) for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 500, True)
        self.again_list = []
        echo(1, len(self.movie_id2name.keys()))

        changeHtmlTimeout(40)
        movie_get = []
        tag_categories = self.tag_categories
        for mm in range(0, 10000, 1000):
            for tags in tag_categories[0][1:]:
                for genres in tag_categories[1][1:]:
                    for ii, jj in self.yearMap.values():
                        year_range = '{},{}'.format(ii, jj)
                        for sorts in self.tabs:
                            movie_get.append(threading.Thread(
                                target=self.get_movie_list_from_tabs, args=(sorts, tags, genres, year_range, mm,)))
        echo(2, 'Thread Num:', len(movie_get))
        shuffle_batch_run_thread(movie_get, 900, True)
        again_list = [threading.Thread(target=self.get_movie_list_from_tabs, args=(
            ii[0], ii[1], ii[2], ii[3], ii[4],)) if len(ii) == 5 else threading.Thread(target=self.get_movie_lists_once, args=(
                ii[0], ii[1], ii[2], ii[3],)) for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 900, True)
        time.sleep(120)
        changeJsonTimeout(10)
        for ii in self.rank_list:
            self.get_movie_rank(ii, 0)
            if ii == 'top250':
                self.get_movie_rank(ii, 100)
                self.get_movie_rank(ii, 200)

        movie_list = self.movie_id2name.keys()
        output_path = '{}douban_movie_id'.format(data_dir)
        with open(output_path + '.txt', 'w') as f:
            f.write('\n'.join([str(ii) for ii in movie_list]))
        dump_bigger(self.movie_id2name, output_path + '.pkl')

        movie_num = len(movie_list)
        echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            movie_num, output_path, end_time(version, 0)))
Example No. 20
    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        none of its proxies are usable any more
        """

        version = begin_time()
        url_list = ["", "free/gngn/index.shtml", "free/gwgn/index.shtml"]
        host = "http://www.data5u.com/"
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all("ul", class_="l2")
            for index in table:
                tds = index.find_all("li")
                scheme = tds[3].text  # the protocol lives in the 4th cell
                self.waitjudge.append("{}://{}:{}".format(scheme, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)
Example No. 21
    def xici_proxy(self, page: int):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy site I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            echo("0|warning", "Please input num!")
            return []

        version = begin_time()
        url = "http://www.xicidaili.com/nn/%d"
        for index in range(1, page + 1):
            html = basic_req(url % index, 0)
            rows = html.find_all("tr")
            for row in rows[1:]:  # skip the header row
                tds = row.find_all("td")
                scheme = tds[5].text.lower()
                self.waitjudge.append("{}://{}:{}".format(scheme, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)
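The row parsing above can be exercised offline. A minimal sketch on a hypothetical table snippet with the same column layout (column 1 = IP, column 2 = port, column 5 = scheme):

from bs4 import BeautifulSoup

HTML = '''<table><tr><th>header</th></tr>
<tr><td></td><td>1.2.3.4</td><td>8080</td><td></td><td></td><td>HTTP</td></tr></table>'''
rows = BeautifulSoup(HTML, 'html.parser').find_all('tr')
for row in rows[1:]:  # skip the header row
    tds = row.find_all('td')
    print('{}://{}:{}'.format(tds[5].text.lower(), tds[1].text, tds[2].text))
# -> http://1.2.3.4:8080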
Example No. 22
    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        none of its proxies are usable any more
        """

        version = begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                scheme = tds[3].text  # the protocol lives in the 4th cell
                self.waitjudge.append(scheme + '://' + tds[0].text + ':' +
                                      tds[1].text)
        self.threadjude()
        end_time(version)
Example No. 23
    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy site I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            echo(0, "Please input num!")
            return []

        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % index, 0)
            rows = html.find_all('tr')
            for row in rows[1:]:  # skip the header row
                tds = row.find_all('td')
                scheme = tds[5].text.lower()
                self.waitjudge.append(scheme + '://' + tds[1].text + ':' +
                                      tds[2].text)
        self.threadjude()
        end_time(version)
Example No. 24
    def get_href(self):
        """
        get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
        """

        version = begin_time()
        threadings = []
        for index in range(71):
            work = threading.Thread(
                target=self.href_once, args=(index,))
            threadings.append(work)

        for work in threadings:
            # time.sleep(.5)
            work.start()
        for work in threadings:
            work.join()
        href_map = [self.href_map[k] for k in sorted(self.href_map.keys())]
        self.href_map = sum(href_map, [])
        with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(self.href_map))
        end_time(version)
Example No. 25
    def testdb(self, types):
        '''
        test whether the proxies stored in the DB are usable
        '''

        version = begin_time()
        typestr = ''
        if types == 2:
            typestr = '(0,1,2,3)'
        elif types == 1:
            typestr = '(1,3)'
        else:
            typestr = '(0,2)'
        results = self.Db.select_db(self.select_all % typestr)
        if results:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        self.initproxy()
        end_time(version)
Example No. 26
    def build_md(self, load_img=False):
        """
        build markdown files
        """
        version = begin_time()

        threadings = []
        for index, tid in enumerate(self.request_list):
            work = threading.Thread(target=self.build_md_once,
                                    args=(
                                        index,
                                        tid,
                                    ))
            threadings.append(work)

        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        if not load_img:
            return
        img_map = {k: self.img_map[k] for k in sorted(self.img_map.keys())}
        img_threadings = []
        for index in img_map.keys():
            for img_id, img_url in enumerate(img_map[index]):
                work = threading.Thread(target=self.load_img,
                                        args=(
                                            index,
                                            img_id,
                                            img_url,
                                        ))
                img_threadings.append(work)
        for work in img_threadings:
            work.start()
        for work in img_threadings:
            work.join()

        end_time(version)
Example No. 27
 def load_proxies_test(self):
     """ load mode & test proxies """
     version = begin_time()
     self.load_proxies_list()
     proxies_len = len(self.waitjudge)
     self.thread_judge()
     canuse_len = len(self.canuse_proxies)
     echo(
         "1|info",
         "\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {}\n".
         format(proxies_len, canuse_len, end_time(version)),
     )
     with open("{}canuse_proxies.txt".format(data_dir), "w") as f:
         f.write("\n".join(self.canuse_proxies))
Example No. 28
    def get_classify(self):
        """
        get classify from /discover/playlist
        """

        version = begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        html = proxy_req(host, 0)

        if not html:
            print('Empty')
            if can_retry(host):
                self.get_classify()
            return []

        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if can_retry(host):
                self.get_classify()
            else:
                print(html)  # out of retries: dump the page for debugging
        for index in alist:
            self.classifylist[index.text] = index['href']
        end_time(version)
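The retry-via-recursion pattern above (guarded by can_retry) can also be written with an explicit budget; a minimal sketch with a hypothetical fetch function:

def fetch_with_retry(url, fetch, max_retry=3):
    # bounded retry: return the first non-empty response, else ''
    for _ in range(max_retry):
        text = fetch(url)
        if text:
            return text
    return ''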
Example No. 29
    def get_summarization(self):
        """
        get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
        """

        version = begin_time()
        threadings = []
        for index in range(30):
            work = threading.Thread(
                target=self.summarization_once, args=(index,))
            threadings.append(work)

        for work in threadings:
            # time.sleep(.5)
            work.start()
        for work in threadings:
            work.join()

        summarizations = [self.summarizations[k]
                          for k in sorted(self.summarizations.keys())]
        self.summarizations = sum(summarizations, [])
        with codecs.open('news_posion.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.summarizations))
        end_time(version)
Example No. 30
    def get_movie_lists(self):
        ''' get movie list '''

        version = begin_time()
        movie_get = []
        for ii in self.tag:
            for jj in self.sort_list:
                movie_get.append(threading.Thread(
                    target=self.get_movie_lists_once, args=('movie', ii, jj, 0,)))
        for ww in movie_get:
            ww.start()
        for ww in movie_get:
            ww.join()
        movie_list = set(sum(self.movie_id_dict.values(), []))
        output_path = '{}douban_movie_id.txt'.format(data_dir)
        with open(output_path, 'w') as f:
            f.write('\n'.join([str(ii) for ii in movie_list]))
        movie_num = len(movie_list)
        echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            movie_num, output_path, end_time(version, 0)))