Пример #1
0
 def __init__(self):
     """
     Set up collaborators, in-memory buffers and the SQL templates.

     Builds the ``Db`` and ``GetFreeProxy`` helpers, resets every working
     container, calls ``get_classify()`` and finally stores the SQL
     statement templates (filled in later with ``%`` formatting).
     """
     # Collaborators: database helper first, then the proxy helper.
     self.Db = Db()
     self.classifylist = {}
     self.proxyclass = GetFreeProxy()

     # Working buffers; each name gets its own fresh container.
     self.playlists, self.songlist, self.finishlist = [], [], []
     self.failuredmap, self.songmap = {}, {}

     self.get_classify()

     # Queries against playlist_queue.
     self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
     self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
     self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
     self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''

     # Queries against playlist_detail.
     self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
     self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
     self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
Пример #2
0
class Press_test():
    """
    Fire bursts of concurrent requests at a URL within a short time.
    """
    def __init__(self):
        self.proxyclass = GetFreeProxy()

    def basic_press(self, url, host, times, types):
        """
        Request ``url`` once without sending any data, retrying on failure.

        ``types == 1`` sends the request through ``self.proxyclass``; any
        other value calls ``get_html`` directly. While the response compares
        equal to ``False`` and the attempt counter (starting at ``times``)
        is below 5, the request is issued again. Always returns ``None``.
        """
        attempt = times
        while True:
            if types == 1:
                html = self.proxyclass.get_request_proxy(url, host, 0)
            else:
                html = get_html(url, {}, host)

            if html == False and attempt < 5:
                attempt += 1
                continue
            return

    def press_threading(self, url, host, qps, types):
        """
        Start ``qps`` threads that each run ``basic_press`` and wait for all.

        The whole burst is bracketed by ``begin_time()`` / ``end_time()``.
        """
        begin_time()
        workers = [
            threading.Thread(target=self.basic_press,
                             args=(url, host, 0, types))
            for _ in range(qps)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        end_time()

    def one_press_attack(self, url, host, qps, types, total):
        """
        Run ``total`` consecutive bursts of ``qps`` requests, then print 'Over'.
        """
        for _ in range(total):
            self.press_threading(url, host, qps, types)
        print('Over')
Пример #3
0
 def __init__(self):
     """Attach a freshly built ``GetFreeProxy`` instance as ``self.proxyclass``."""
     proxy_helper = GetFreeProxy()
     self.proxyclass = proxy_helper
Пример #4
0
 def update_proxy(self):
     """Rebind the module-level ``proxy_req`` to that of a new ``GetFreeProxy``."""
     global proxy_req
     refreshed = GetFreeProxy()
     proxy_req = refreshed.proxy_req
Пример #5
0
import shutil
import sys
import threading
import time
from configparser import ConfigParser
from typing import List

import numpy as np

sys.path.append(os.getcwd())
from proxy.getproxy import GetFreeProxy
from util.util import (basic_req, begin_time, can_retry, changeHeaders, echo,
                       end_time, headers, mkdir, read_file, send_email,
                       time_stamp, time_str)

# Request callable taken from a GetFreeProxy instance built at import time.
proxy_req = GetFreeProxy().proxy_req

# Number of seconds in one day.
one_day = 86400

# Data locations, all derived from the same base directory.
data_dir = 'bilibili/data/'
history_data_dir = data_dir + 'history_data/'
history_dir = data_dir + 'history/'
comment_dir = data_dir + 'comment/'
assign_path = 'bilibili/assign_up.ini'
"""
  * bilibili @http
  * www.bilibili.com/video/av{av_id}
  * www.bilibili.com/ranking/all/155/{0/1}/{day}
  * space.bilibili.com/ajax/member/getSubmitVideos?mid={mid}&page=1&pagesize=50
  * api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=%d&type=1&oid=%d&sort=0
    api.bilibili.com/x/report/click/now?jsonp=jsonp
    api.bilibili.com/x/report/click/web/h5
    api.bilibili.com/x/report/web/heartbeat
Пример #6
0
# -*- coding: utf-8 -*-
# @Author: gunjianpan
# @Date:   2019-02-28 09:47:07
# @Last Modified by:   gunjianpan
# @Last Modified time: 2019-03-28 00:16:05

import codecs
import re

from proxy.getproxy import GetFreeProxy
from utils.utils import begin_time, end_time, can_retry

# Bound method of a GetFreeProxy instance created once at import time;
# exposed at module level so it can be called like a plain function.
get_request_proxy = GetFreeProxy().get_request_proxy
"""
  * zimuzu @http
  * zmz005.com/o5itP3
"""


class southPark(object):
    """
    Load South Park download URLs from zimuzu.
    """
    def load_url(self):
        """
        Fetch the zimuzu page that lists the download URLs.

        NOTE(review): the body appears truncated in this view; ``detail``
        and ``total`` are assigned but not used in the visible lines.
        """

        url = 'http://zmz005.com/o5itP3'
        # Fetch the page through the module-level proxy request helper.
        detail = get_request_proxy(url, 0)
        total = []
Пример #7
0
class Get_playlist_song():
    """
    Crawl playlists and their songs from music.163.com.

    1. get playlist id from classify;
    2. get song from play list;

    NOTE(review): every SQL statement is assembled with ``%`` string
    formatting from scraped values (classify names, ids). The ``Db`` helper's
    API is not visible here, so the queries are left string-built; switch to
    parameterized queries if ``Db`` supports them.
    """
    def __init__(self):
        """
        Build the helpers, reset the working buffers, load the classify list
        and store the SQL statement templates.
        """
        self.Db = Db()
        self.classifylist = {}
        self.proxyclass = GetFreeProxy()
        self.playlists = []
        self.failuredmap = {}
        self.songmap = {}
        self.songlist = []
        self.finishlist = []
        self.get_classify()
        self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
        self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
        self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
        self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
        self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
        self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
        self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''

    def get_classify(self):
        """
        Fill ``self.classifylist`` (link text -> href) from /discover/playlist.

        Retries through ``can_retry`` when the page is empty or contains no
        matching links. Returns ``[]`` when the page could not be fetched,
        otherwise ``None``.
        """

        begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        # host[8:21] is the bare domain 'music.163.com'.
        html = self.proxyclass.get_request_proxy(host, host[8:21], 0)

        if not html:
            print('Empty')
            self.proxyclass.cleancannotuse()
            if self.can_retry(host):
                self.get_classify()
            return []

        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if self.can_retry(host):
                self.get_classify()
            print(html)
        for index in alist:
            self.classifylist[index.text] = index['href']
        self.proxyclass.cleancannotuse()
        end_time()

    def get_playlist_id(self, classify, offset):
        """
        Append the playlist ids found on one listing page to ``self.playlists``.

        ``classify`` must be a key of ``self.classifylist``; ``offset`` is the
        paging offset put into the URL. Returns ``[]`` when the page could not
        be fetched, otherwise ``None``.
        """

        host = 'https://music.163.com'
        # The "all styles" link has no query string yet, so it needs '?'.
        allclassify = classify == '全部风格'
        url = host + self.classifylist[classify] + (
            '?' if allclassify else
            '&') + 'order=hot&limit=35&offset=' + str(offset)
        html = get_html(url, {}, host[8:])

        if not html:
            if self.can_retry(url):
                self.get_playlist_id(classify, offset)
            else:
                self.proxyclass.log_write(url)
            return []
        alist = html.find_all('a', class_='icon-play')
        if not len(alist):
            if self.can_retry(url):
                self.get_playlist_id(classify, offset)
            else:
                self.proxyclass.log_write(url)
        for index in alist:
            self.playlists.append(index['data-res-id'])

    def can_retry(self, url):
        """
        Tell whether ``url`` may be requested again.

        Tracks a per-url counter in ``self.failuredmap``. Returns ``True``
        for the first three calls; on the next call the failure is printed
        and logged, the counter is reset and ``False`` is returned.
        """

        if url not in self.failuredmap:
            self.failuredmap[url] = 0
            return True
        elif self.failuredmap[url] < 2:
            self.failuredmap[url] += 1
            return True
        else:
            print("Failured " + url)
            self.proxyclass.log_write(url)
            self.failuredmap[url] = 0
            return False

    def get_playlist_id_thread(self):
        """
        For every classify, fetch 41 listing pages in parallel threads and
        queue the playlist ids that were collected.
        """

        begin_time()
        if not len(self.classifylist):
            self.get_classify()

        for index in self.classifylist:
            # One thread per page; each page holds 35 playlists.
            threadings = [
                threading.Thread(target=self.get_playlist_id,
                                 args=(index, offset * 35))
                for offset in range(41)
            ]
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.proxyclass.cleancannotuse()
            print(len(self.playlists))
            self.test_queue(index)
            self.playlists = []
            print(index + " Over")
        end_time()

    def test_queue(self, classify):
        """
        Insert the ids in ``self.playlists`` that are not yet in
        playlist_queue for ``classify``.

        Returns ``[]`` when there is nothing to check or the lookup yields
        no rows, otherwise ``None``.
        """
        if not self.playlists:
            # An empty tuple would render as the invalid SQL `in ()`.
            return []
        if len(self.playlists) == 1:
            # A one-element tuple would render as `(x,)`, which is invalid SQL.
            waitlist = '(' + str(self.playlists[0]) + ')'
        else:
            waitlist = tuple(self.playlists)
        results = self.Db.select_db(self.select_one %
                                    (str(waitlist), classify))
        if not results:
            return []
        hadexist = [index[0] for index in results]
        insertlist = [(index, classify) for index in self.playlists
                      if index not in hadexist]
        print('Insert ' + str(len(insertlist)) + ' ' + classify)
        self.insert_queue(insertlist)

    def insert_queue(self, ids):
        """
        Batch-insert ``(playlist_id, classify)`` tuples into playlist_queue.

        Returns ``[]`` for an empty ``ids``, otherwise ``None``.
        """

        if not len(ids):
            return []
        # str(list)[1:-1] drops the brackets, leaving "(a, b), (c, d)".
        results = self.Db.insert_db(self.insert_sql % str(ids)[1:-1])
        if results:
            print('Insert ' + ids[0][1] + ' ' + str(len(ids)) +
                  ' Success!')

    def get_list_ids(self, classify):
        """
        Return ``[id, playlist_id]`` pairs of unfinished playlists of
        ``classify`` read from the database (empty list when none).
        """
        results = self.Db.select_db(self.select_ids % classify)
        if not results:
            return []
        return [[index[0], index[1]] for index in results]

    def get_song_detail_thread(self):
        """
        For every classify, fetch the songs of all unfinished playlists in
        parallel threads, aggregate them and write them to the database.
        """

        begin_time()
        for classify in self.classifylist:
            ids = self.get_list_ids(classify)
            threadings = [
                threading.Thread(target=self.get_song_detail,
                                 args=(oneid[1], ))
                for oneid in ids
            ]
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.clean_data()
            self.test_song(classify, ids)
            # Reset the per-classify buffers before the next round.
            self.songlist = []
            self.songmap = {}
            self.finishlist = []
            self.successtime = 0
            print(classify + ' Over!')
        end_time()

    def clean_data(self):
        """
        Aggregate ``self.songlist`` into ``self.songmap``.

        Each map value is ``[occurrences, summed play count, song name]``.
        """
        for songid, songname, playcount in self.songlist:
            if songid not in self.songmap:
                self.songmap[songid] = [1, playcount, songname]
            else:
                orgin = self.songmap[songid]
                self.songmap[songid] = [
                    orgin[0] + 1, orgin[1] + playcount, songname
                ]

    def get_song_detail(self, id):
        """
        Fetch one playlist and append its tracks to ``self.songlist``.

        On success the playlist id is recorded in ``self.finishlist``.
        Returns ``[]`` on a failed request or when retries are exhausted,
        otherwise ``None``.
        """

        host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
        # host[7:20] is the bare domain 'music.163.com'.
        detail = self.proxyclass.get_request_proxy(host, host[7:20], 1)
        if detail == 0:
            if self.can_retry(host):
                self.get_song_detail(id)
            else:
                self.proxyclass.log_write(host)
            return []
        result = detail['result']
        tracks = result['tracks']

        if len(tracks) <= 1:
            if self.can_retry(host):
                self.get_song_detail(id)
            else:
                self.proxyclass.log_write(host)
                return []
        else:
            playcount = result['playCount']
            for track in tracks:
                self.songlist.append([track['id'], track['name'], playcount])
            self.finishlist.append(id)

    def test_song(self, classify, ids):
        """
        Split ``self.songmap`` into songs already stored (updated through
        REPLACE) and new songs (written to the ``song_detail`` file and bulk
        loaded), then trigger the database writes.

        ``ids`` are the ``[id, playlist_id]`` pairs from ``get_list_ids``.
        Returns ``[]`` when there are no songs, otherwise ``None``.
        """
        songs = list(self.songmap)
        if not songs:
            return []
        if len(songs) == 1:
            # str() is required: song ids may be integers, and concatenating
            # one to a string raises TypeError.
            waitlist = '(' + str(songs[0]) + ')'
        else:
            waitlist = tuple(songs)
        results = self.Db.select_db(self.select_song %
                                    (str(waitlist), classify))
        # song_id -> [row id, time, play_time] of the rows already stored.
        resultmap = {
            detail[1]: [detail[0], detail[2], detail[3]]
            for detail in results
        }
        # playlist_id -> queue row id.
        idsmap = {indexid[1]: indexid[0] for indexid in ids}

        replacelist = []
        insertlist = []
        # 'w' starts from an empty file; `with` closes it even on error.
        with codecs.open("song_detail", 'w', encoding='utf-8') as file_d:
            for song, songdetail in self.songmap.items():
                if song in resultmap:
                    dbdetail = resultmap[song]
                    replacelist.append(
                        (dbdetail[0], song, classify, songdetail[2],
                         songdetail[0] + dbdetail[1],
                         songdetail[1] + dbdetail[2]))
                else:
                    # Commas would break the CSV fields; names are capped
                    # at 20 characters.
                    row = [
                        song, songdetail[2].replace(',', ' ')[0:20],
                        classify, songdetail[0], songdetail[1]
                    ]
                    file_d.write(str(row)[1:-1] + '\n')
                    insertlist.append((song, songdetail[2], classify,
                                       songdetail[0], songdetail[1]))
        replacequeue = [(idsmap[playlist], playlist, classify, 1)
                        for playlist in self.finishlist]
        if len(insertlist):
            self.db_song_detail(insertlist, 'Insert', replacequeue)
        if len(replacelist):
            self.db_song_detail(replacelist, 'Update', [])

    def db_song_detail(self, waitlist, types, replacequeue):
        """
        Batch insert/update song detail.

        ``types == 'Update'`` issues a REPLACE with the rows in ``waitlist``;
        any other value runs the LOAD DATA statement. After a successful
        'Insert', ``replacequeue`` is written to mark playlists finished.
        """

        if types == 'Update':
            # Uses `waitlist`; the former name `blocklist` was undefined.
            results = self.Db.update_db(self.replace_song %
                                        str(waitlist)[1:-1])
        else:
            results = self.Db.update_db(self.insert_song)
        if results:
            if len(waitlist):
                print(types + ' song detail for ' + waitlist[0][2] + ' ' +
                      str(len(waitlist)) + ' Success!')
            if types == 'Insert':
                self.replace_queue_db(replacequeue)

    def replace_queue_db(self, replacequeue):
        """
        Mark the playlists in ``replacequeue`` as finished in playlist_queue.

        Does nothing for an empty list, which would otherwise produce a
        statement with no VALUES.
        """

        if not replacequeue:
            return
        results = self.Db.update_db(self.replace_queue %
                                    str(replacequeue)[1:-1])
        if results:
            print('Update queue fininsh for ' + str(len(replacequeue)) +
                  ' item!')
Пример #8
0
 def __init__(self):
     """
     Run the parent initializer, attach a proxy request callable, reset
     the ``del_map`` / ``rank_map`` dictionaries and load the configuration.
     """
     super(BasicBilibili, self).__init__()
     proxy_helper = GetFreeProxy()
     self.proxy_req = proxy_helper.proxy_req
     self.del_map, self.rank_map = {}, {}
     self.load_configure()
Пример #9
0
 def update_proxy_basic(self):
     """Replace ``self.proxy_req`` with that of a newly built ``GetFreeProxy``."""
     refreshed = GetFreeProxy()
     self.proxy_req = refreshed.proxy_req