Example #1
    def load(cls):
        if cls.__load__cache__(PathConfig.RAW_KES_CACHE):
            print("Raw KES data cache has been loaded from %s\n" %
                  PathConfig.RAW_KES_CACHE)
            return cls

        db = Db()
        rows = db.select(["new_eventid", "sub_eventid", "text_dbpedia"])
        random.shuffle(rows)
        row_count = len(rows)

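        # split rows 45% / 50% / 5% into train / test / validation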
        train_rows, test_rows, validation_rows = \
            rows[0: int(row_count * 0.45)], \
            rows[int(row_count * 0.45): int(row_count * 0.95)], \
            rows[int(row_count * 0.95):]

        bar = ProgressBar(
            cls.__self__accumulating__(len(train_rows) - 1) +
            cls.__self__accumulating__(len(test_rows) - 1) +
            cls.__self__accumulating__(len(validation_rows) - 1),
            "Loading raw KES data...")
        progress = 0

        for rows_splitting, set_name in ((train_rows, "train"),
                                         (test_rows, "test"),
                                         (validation_rows, "validation")):
            cls.__fill__(rows_splitting, len(rows_splitting), set_name, bar,
                         progress)

        bar.finish("Raw KES data has been loaded.")
        cls.__cache__(PathConfig.RAW_KES_CACHE)

        return cls
Example #2
class DbTestCase(unittest.TestCase):
    def setUp(self):
        self.db_to_test = Db()
        pass

    def test_new_user(self):
        self.db_to_test.cursor = MagicMock()
        self.db_to_test.new_user("test_user")
        self.db_to_test.cursor.execute.assert_called_with("INSERT INTO user (name,balance,cc_number) VALUES (?,?,?)", ("test_user",0,None))
Example #3
class DbTestCase(unittest.TestCase):
    def setUp(self):
        self.db_to_test = Db()
        pass

    def test_new_user(self):
        self.db_to_test.cursor = MagicMock()
        self.db_to_test.new_user("test_user")
        self.db_to_test.cursor.execute.assert_called_with(
            "INSERT INTO user (name,balance,cc_number) VALUES (?,?,?)",
            ("test_user", 0, None))
Example #4
 def __init__(self):
     self.Db = Db("netease")
     self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
     self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
     self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
     self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
     self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
     self.canuseip = {}
     self.waitjudge = []
     self.cannotuseip = {}
     self.failuredtime = {}
     self.initproxy()
Example #5
    def __init__(self):
        self.Db = Db("china_regions")
        china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
        self.province = list(china.groupby(by=['Province']).count().axes[0])
        self.city = list(china.groupby(by=['City']).count().axes[0])
        self.filelists = [
            'google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail',
            'bjh_detail_poison', 'news_steal.txt', 'news_poison.txt'
        ]
        self.city_province = {}
        self.province_map = {}

        self.pre_data()
        for index, row in china.iterrows():
            self.city_province[row['City']] = row['Province']
Example #6
 def __init__(self):
     self.Db = Db("netease")
     self.classifylist = {}
     self.playlists = []
     self.failuredmap = {}
     self.songmap = {}
     self.songlist = []
     self.finishlist = []
     self.get_classify()
     self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
     self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
     self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
     self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
     self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
     self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
     self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''
Example #7
 def __init__(self):
     self.Db = Db("blog")
     self.local_views = {}
     self.title_map = {}
     self.title2slug = {}
     self.failured_map = {}
     self.zhihu_views = {}
     self.zhihu_id = {}
     self.jianshu_views = {}
     self.jianshu_id = {}
     self.csdn_views = {}
     self.csdn_id = {}
     self.exist_data = {}
     self.getTitleMap()
     self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
     self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
     self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''
Example #8
def db(ctrl, queue):
    db = Db(DB_URI, mode='rwc', timeout=DB_TIMEOUT)
    notify('DB', 'up')
    stats = {}

    def _commit(cache):
        db.write(cache)

    def _check(path, url):
        if not db.exists(path):
            ctrl.put((LOAD, path, url))
        else:
            ctrl.put((DISCARD, path, url))

    commands = {
        CHECK: _check,
        COMMIT: _commit,
    }

    def _run():
        cmd, *args = queue.get()
        commands[cmd](*args)

    return worker(_run, "db", stats)
Example #9
import os
import sys
import csv
import time

from collections import defaultdict

from utils.db import Db
from config.constants import *

if __name__ == "__main__":
    manifest = os.path.join(EXTRACT_DIR, EXTRACT_MANIFEST)
    db = Db(DB_URI, timeout=DB_TIMEOUT)

    stats = {
        'date': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
        'by-year':
        defaultdict(lambda: dict(count=0, filename=None, file=None)),
    }

    def writer():
        def _writer(year):
            entry = stats['by-year'][year]
            entry['filename'] = filename = str(year) + '.csv'

            filepath = os.path.join(EXTRACT_DIR, filename)
            entry['file'] = file = open(filepath, 'wt')
            output = csv.writer(file)

            def _write(row):
                date, *tail = row
Example #10
class GetFreeProxy:
    """
    proxy pool
    """
    def __init__(self):
        self.Db = Db("netease")
        self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
        self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
        self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
        self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
        self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
        self.canuseip = {}
        self.waitjudge = []
        self.cannotuseip = {}
        self.failuredtime = {}
        self.initproxy()

    def get_request_proxy(self,
                          url: str,
                          types: int,
                          data=None,
                          test_func=None,
                          header=None):
        """
        use proxy to send requests, and record the proxy cann't use
        @types S0XY: X=0.->get;   =1.->post;
                     Y=0.->html;  =1.->json; =2.->basic
                     S=0.->basic ;=1.->ss

        support failured retry && failured auto record
        """

        httptype = url[4] == 's'
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist

        if not len(proxylist):
            if self.Db.db:
                print(
                    'Proxy pool empty!!! Please check the db conn & db dataset!!!'
                )
            proxies = {}
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}

        try:
            result = basic_req(url, types, proxies, data, header)
            if test_func is not None:
                if not test_func(result):
                    if self.check_retry(url):
                        self.get_request_proxy(url, types + 1000 * ss_type,
                                               data, test_func)
                    else:
                        self.failuredtime[url] = 0
                        return
                else:
                    return result
            else:
                return result

        except:
            self.cannotuseip[random.randint(0, MAXN)] = proxies_url

            if proxies_url in proxylist:
                proxylist.remove(proxies_url)

            if not len(self.cannotuseip.keys()) % 10:
                self.cleancannotuse()

            if self.check_retry(url):
                self.get_request_proxy(url, types + 1000 * ss_type, data,
                                       test_func)
            else:
                return

    def check_retry(self, url):
        """
        check whether this url can still be retried
        """
        if url not in self.failuredtime:
            self.failuredtime[url] = 0
            return True
        elif self.failuredtime[url] < 3:
            self.failuredtime[url] += 1
            return True
        else:
            self.log_write(url)
            self.failuredtime[url] = 0
            return False

    def log_write(self, url):
        """
        failure log
        """
        with codecs.open("proxy.log", 'a', encoding='utf-8') as f:
            f.write(time_str() + url + '\n')

    def insertproxy(self, insertlist):
        """
        insert data to db
        """
        results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
        if results:
            print('Insert ' + str(len(insertlist)) + ' items Success!')
        else:
            pass

    def updateproxy(self, updatelist, types):
        """
        update data to db
        """

        results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
        typemap = {0: 'can use ', 1: 'can not use '}
        if results:
            print('Update', typemap[types], str(len(updatelist)),
                  ' items Success!')
        else:
            pass

    def selectproxy(self, targetlist):
        """
        select ip proxy by ids
        """
        if not len(targetlist):
            return []
        elif len(targetlist) == 1:
            waitlist = '(\'' + targetlist[0] + '\')'
        else:
            waitlist = tuple(targetlist)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def dbcanuseproxy(self):
        """
        check whether these proxies already exist in the db
        """

        results = self.selectproxy([ii[0] for ii in self.canuseip.values()])
        ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1])
        print("SS proxies %d" % ss_len)

        insertlist = []
        updatelist = []
        ipmap = {}
        if results != False:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.canuseip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ipmap:
                    if ipmap[ip_now][1]:
                        updatelist.append(
                            (ipmap[ip_now][0], ip_now, http_type, 0))
                else:
                    insertlist.append((ip_now, http_type))
            if len(insertlist):
                self.insertproxy(insertlist)
            if len(updatelist):
                self.updateproxy(updatelist, 0)
        else:
            pass
        self.canuseip = {}

    def cleancannotuse(self):
        """
        mark proxies in the db that can't be used
        """
        results = self.selectproxy(self.cannotuseip.values())
        updatelist = []
        ipmap = {}
        if results:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.cannotuseip.values():
                http_type = ip_now[4] == 's'
                if ip_now in ipmap:
                    updatelist.append((ipmap[ip_now][0], ip_now, http_type,
                                       ipmap[ip_now][1] + 1))

            if len(updatelist):
                self.updateproxy(updatelist, 1)
        else:
            pass
        self.cannotuseip = {}

    def initproxy(self):
        """
        init proxy list
        """

        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if results != 0:

            for index in results:
                if index[1] == 1:
                    self.proxylists.append(index[0])
                elif index[1] == 2:
                    self.proxylist.append(index[0])
                    self.proxylist_ss.append(index[0])
                elif index[1] == 3:
                    self.proxylists.append(index[0])
                    self.proxylists_ss.append(index[0])
                else:
                    self.proxylist.append(index[0])
            print(len(self.proxylist), ' http proxy can use.')
            print(len(self.proxylists), ' https proxy can use.')
            print(len(self.proxylist_ss), ' ss http proxy can use.')
            print(len(self.proxylists_ss), ' ss https proxy can use.')
        else:
            print(
                '>>>Please check db configure!!! The proxy pool cant use!!!>>>'
            )

    def judgeurl(self, urls, index, times):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        criteria: 1. the request must not time out (timeout = 5)
                  2. response.result.tracks.size() != 1
        """

        http_type = urls[4] == 's'
        proxies = {type_map[http_type]: urls}

        test_url = type_map[
            http_type] + '://music.163.com/api/playlist/detail?id=432853362'
        ss_url = 'https://www.google.com/?gws_rd=ssl'
        try:
            # print(test_url, proxies)
            # return
            data = basic_req(test_url, 1, proxies)
            result = data['result']
            tracks = result['tracks']
            if len(tracks) == 56:
                if times < 2:
                    self.judgeurl(urls, index, times + 1)
                else:
                    self.canuseip[index] = [urls, int(http_type)]
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.canuseip[index] = [urls, int(http_type) + 2]
            else:
                self.cannotuseip[index] = urls
        except:
            if index not in self.canuseip:
                self.cannotuseip[index] = urls

    def threadjude(self):
        """
        threading to judge proxy
        """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)

        text = self.waitjudge
        num = len(text)
        for block in range(num // 1000 + 1):
            blockthreads = []
            for index in range(block * 1000, min(num, 1000 * (block + 1))):
                work = threading.Thread(target=self.judgeurl,
                                        args=(
                                            text[index],
                                            index,
                                            0,
                                        ))
                blockthreads.append(work)
            for work in blockthreads:
                work.start()
            for work in blockthreads:
                work.join()
            # return
            self.dbcanuseproxy()
            self.cleancannotuse()

        self.waitjudge = []

    def testdb(self, types):
        '''
        test whether the proxies stored in the db are usable
        '''

        version = begin_time()
        typestr = ''
        if types == 2:
            typestr = '(0,1,2,3)'
        elif types == 1:
            typestr = '(1,3)'
        else:
            typestr = '(0,2)'
        results = self.Db.select_db(self.select_all % typestr)
        if results != 0:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        else:
            pass
        self.initproxy()
        end_time(version)

    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            print("Please input num!")
            return []

        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % (index), 0)
            tem = html.find_all('tr')
            for index in range(1, len(tem)):
                tds = tem[index].find_all('td')
                ip = tds[5].text.lower()
                self.waitjudge.append(ip + '://' + tds[1].text + ':' +
                                      tds[2].text)
        self.threadjude()
        end_time(version)

    def gatherproxy(self, types):
        """
        :100: very nice website
        first of all, you should download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """
        version = begin_time()
        if not os.path.exists('%sgatherproxy' % data_path):
            print('Gather file not exist!!!')
            return
        with codecs.open('%sgatherproxy' % data_path, 'r',
                         encoding='utf-8') as f:
            file_d = [ii.strip() for ii in f.readlines()]
        if not types:
            waitjudge = ['http://' + ii[:-1] for ii in file_d]
        elif types == 1:
            waitjudge = ['https://' + ii[:-1] for ii in file_d]
        else:
            waitjudge1 = ['http://' + ii[:-1] for ii in file_d]
            waitjudge2 = ['https://' + ii[:-1] for ii in file_d]
            waitjudge = [*waitjudge1, *waitjudge2]
        self.waitjudge = waitjudge
        print('load gather over!')
        end_time(version)

    def goubanjia(self):
        """
        :-1: html tags mixed with invalid data
        :100: most importantly, the port is written in the 'class' attribute rather than in the text.
        The website is difficult to scrape, but the proxies are very good
        goubanjia proxy http://www.goubanjia.com
        """

        version = begin_time()
        host = 'http://www.goubanjia.com'
        html = self.get_request_proxy(host, 0)

        if not html:
            return []
        trs = html.find_all('tr', class_=['warning', 'success'])
        for tr in trs:
            tds = tr.find_all('td')
            ip = tds[2].find_all('a')[0].text + '://'
            iplist = tds[0].find_all(['div', 'span', not 'p'],
                                     class_=not 'port')
            for index in iplist:
                ip += index.text
            encode = tds[0].find_all(['div', 'span', 'p'],
                                     class_='port')[0]['class'][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord('A')),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ':' + str(int(uncode / 8)))
        self.threadjude()
        end_time(version)

    def schedulegou(self):
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, 'interval', seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        none of them are usable
        """

        version = begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.get_request_proxy(host + uri, 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                ip = tds[3].text
                self.waitjudge.append(ip + '://' + tds[0].text + ':' +
                                      tds[1].text)
        self.threadjude()
        end_time(version)

    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                print(str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def sixsixthread(self, index, pageindex):
        host = '''http://www.66ip.cn/areaindex_%d/%d.html'''
        html = self.get_request_proxy(host % (index, pageindex), 0)
        if not html:
            return []
        trs = html.find_all('table')[2].find_all('tr')
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all('td')
            self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text)
            self.waitjudge.append('https://' + tds[0].text + ':' + tds[1].text)

    def kuaidaili(self, page):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread,
                                    args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def kuaidailithread(self, index):
        host = '''https://www.kuaidaili.com/free/inha/%d/'''
        html = self.get_request_proxy(host % index, 0)
        if not html:
            return []
        trs = html.find_all('tr')
        for index in range(1, len(trs)):
            tds = trs[index].find_all('td')
            ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text
            self.waitjudge.append(ip)

    def get_cookie(self):
        """
        log in to obtain a session cookie
        PS: although the cookie's expiry time is more than a year away,
            it becomes invalid once the connection is closed,
            so you need to reactivate the cookie with this function.
        """
        headers = {
            'pragma':
            'no-cache',
            'cache-control':
            'no-cache',
            'Host':
            'www.gatherproxy.com',
            'Origin':
            'http://www.gatherproxy.com',
            'Referer':
            'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie':
            '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        login_url = 'http://www.gatherproxy.com/subscribe/login'

        cookie_html = basic_req(login_url, 0, header=headers)
        verify_text = cookie_html.find_all('div', class_='label')[2].span.text
        verify_list = verify_text.replace('= ', '').strip().split()
        num_map = {
            'Zero': 0,
            'One': 1,
            'Two': 2,
            'Three': 3,
            'Four': 4,
            'Five': 5,
            'Six': 6,
            'Seven': 7,
            'Eight': 8,
            'Nine': 9,
            'Ten': 10
        }
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                print('Error', index)
                # return False
        verify_code = 0
        error = True

        operation = verify_list[1]
        if operation == '+' or operation == 'plus' or operation == 'add' or operation == 'multiplied':
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation == '-' or operation == 'minus':
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation == 'X' or operation == 'multiplication':
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            print('Error', operation)
            # return False
        if not os.path.exists('%spassage' % data_path):
            print('gather passage not exist!!!')
            return
        with codecs.open('%spassage' % data_path, 'r', encoding='utf-8') as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {
            'Username': passage[0],
            'Password': passage[1],
            'Captcha': str(verify_code)
        }
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)

    def load_gather(self):
        """
        load gather proxy pool text
        If it fails, you should reactivate the cookie (see get_cookie).
        """
        headers = {
            'pragma':
            'no-cache',
            'cache-control':
            'no-cache',
            'Host':
            'www.gatherproxy.com',
            'Origin':
            'http://www.gatherproxy.com',
            'Referer':
            'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie':
            '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        url = 'http://www.gatherproxy.com/subscribe/infos'
        sid_url_req = requests.get(url, headers=headers, verify=False)
        sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser')
        sid_url = sid_url_html.find_all(
            'div', class_='wrapper')[1].find_all('a')[0]['href']
        if len(sid_url.split('sid=')) < 2:
            print('cookie error')
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split('sid=')[1]
        sid_url = 'http://www.gatherproxy.com' + sid_url

        data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'}
        gatherproxy = requests.post(sid_url,
                                    headers=headers,
                                    data=data,
                                    verify=False)
        with codecs.open(data_path + 'gatherproxy', 'w',
                         encoding='utf-8') as f:
            f.write(gatherproxy.text)
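A typical end-to-end flow for this class, sketched only from the methods defined above (the exact ordering is an assumption, not something the snippet prescribes):

    proxy = GetFreeProxy()
    proxy.load_gather()    # fetch the raw proxy list and save it to the gatherproxy file
    proxy.gatherproxy(2)   # load that file into self.waitjudge with both http:// and https:// prefixes
    proxy.threadjude()     # judge the candidates in threads and persist the results to the db
    proxy.testdb(2)        # re-test the proxies already stored in the db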
Example #11
 def setUp(self):
     self.db_to_test = Db()
     pass
Example #12
 def __init__(self):
     Db.__init__(self)
     self.table = "subs_schedule"
     self.connect()
Example #13
class Get_playlist_song():
    """
    1. get playlist id from classify;
    2. get songs from the playlist;
    use url:
    """
    def __init__(self):
        self.Db = Db()
        self.classifylist = {}
        self.proxyclass = GetFreeProxy()
        self.playlists = []
        self.failuredmap = {}
        self.songmap = {}
        self.songlist = []
        self.finishlist = []
        self.get_classify()
        self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
        self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
        self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
        self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
        self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
        self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
        self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''

    def get_classify(self):
        """
        get classify from /discover/playlist
        """

        begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        html = self.proxyclass.get_request_proxy(host, host[8:21], 0)

        if not html:
            print('Empty')
            self.proxyclass.cleancannotuse()
            if self.can_retry(host):
                self.get_classify()
            return []

        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if self.can_retry(host):
                self.get_classify()
            print(html)
        for index in alist:
            self.classifylist[index.text] = index['href']
        self.proxyclass.cleancannotuse()
        end_time()

    def get_playlist_id(self, classify, offset):
        """
        get playlist id from classify
        """

        host = 'https://music.163.com'
        allclassify = classify == '全部风格'
        url = host + self.classifylist[classify] + (
            '?' if allclassify else
            '&') + 'order=hot&limit=35&offset=' + str(offset)
        # html = self.proxyclass.get_request_proxy(url, host[8:], 0)
        html = get_html(url, {}, host[8:])

        if not html:
            if self.can_retry(url):
                self.get_playlist_id(classify, offset)
            else:
                self.proxyclass.log_write(url)
            return []
        alist = html.find_all('a', class_='icon-play')
        if not len(alist):
            if self.can_retry(url):
                self.get_playlist_id(classify, offset)
            else:
                self.proxyclass.log_write(url)
        for index in alist:
            self.playlists.append(index['data-res-id'])

    def can_retry(self, url):
        """
        judge whether this url can be retried
        """

        if url not in self.failuredmap:
            self.failuredmap[url] = 0
            # print("Retry " + str(self.failuredmap[url]) + ' ' + url)
            return True
        elif self.failuredmap[url] < 2:
            self.failuredmap[url] += 1
            # print("Retry " + str(self.failuredmap[url]) + ' ' + url)
            return True
        else:
            print("Failured " + url)
            self.proxyclass.log_write(url)
            self.failuredmap[url] = 0
            return False

    def get_playlist_id_thread(self):
        """
        get playlist ids using threads
        """

        begin_time()
        if not len(self.classifylist):
            self.get_classify()

        for index in self.classifylist:
            threadings = []
            for offset in range(41):
                work = threading.Thread(target=self.get_playlist_id,
                                        args=(
                                            index,
                                            offset * 35,
                                        ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.proxyclass.cleancannotuse()
            print(len(self.playlists))
            self.test_queue(index)
            self.playlists = []
            print(index + " Over")
        end_time()

    def test_queue(self, classify):
        """
        check whether the playlists are already in playlist_queue
        """
        if len(self.playlists) == 1:
            waitlist = '(' + str(self.playlists[0]) + ')'
        else:
            waitlist = tuple(self.playlists)
        results = self.Db.select_db(self.select_one %
                                    (str(waitlist), classify))
        if not results:
            return []
        hadexist = []
        for index in results:
            hadexist.append(index[0])
        insertlist = []
        for index in self.playlists:
            if index not in hadexist:
                # file_d.write(str([index, classify])[1:-1] + '\n')
                insertlist.append((index, classify))
        print('Insert ' + str(len(insertlist)) + ' ' + classify)
        self.insert_queue(insertlist)

    def insert_queue(self, ids):
        """
        insert data to playlist_queue
        """

        if not len(ids):
            return []
        results = self.Db.insert_db(self.insert_sql % str(ids)[1:-1])
        if results:
            if len(ids):
                print('Insert ' + ids[0][1] + ' ' + str(len(ids)) +
                      ' Success!')
        else:
            pass

    def get_list_ids(self, classify):
        """
        get list ids from db
        """
        results = self.Db.select_db(self.select_ids % classify)
        ids = []
        if results:
            for index in results:
                ids.append([index[0], index[1]])
        return ids

    def get_song_detail_thread(self):
        """
        get song details using threads
        """

        begin_time()
        for classify in self.classifylist:
            ids = self.get_list_ids(classify)
            threadings = []
            for oneid in ids:
                work = threading.Thread(target=self.get_song_detail,
                                        args=(oneid[1], ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.clean_data()
            self.test_song(classify, ids)
            self.songlist = []
            self.songmap = {}
            self.finishlist = []
            self.successtime = 0
            print(classify + ' Over!')
        end_time()

    def clean_data(self):
        """
        aggregate the collected song data
        """
        for song in self.songlist:
            [songid, songname, playcount] = song
            if songid not in self.songmap:
                self.songmap[songid] = [1, playcount, songname]
            else:
                orgin = self.songmap[songid]
                self.songmap[songid] = [
                    orgin[0] + 1, orgin[1] + playcount, songname
                ]

    def get_song_detail(self, id):
        """
        get song details from a playlist
        """

        host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
        json = self.proxyclass.get_request_proxy(host, host[7:20], 1)
        if json == 0:
            if self.can_retry(host):
                self.get_song_detail(id)
            else:
                self.proxyclass.log_write(host)
            return []
        result = json['result']
        tracks = result['tracks']

        if len(tracks) <= 1:
            if self.can_retry(host):
                self.get_song_detail(id)
            else:
                self.proxyclass.log_write(host)
                return []
        else:
            playcount = result['playCount']
            for track in tracks:
                songid = track['id']
                songname = track['name']
                self.songlist.append([songid, songname, playcount])
            self.finishlist.append(id)

    def test_song(self, classify, ids):
        """
        check whether songs are already in the db
        """
        songs = []
        for song in self.songmap:
            songs.append(song)
        if not len(songs):
            return []
        elif len(songs) == 1:
            waitlist = '(' + songs[0] + ')'
        else:
            waitlist = tuple(songs)
        results = self.Db.select_db(self.select_song %
                                    (str(waitlist), classify))
        resultmap = {}
        for detail in results:
            resultmap[detail[1]] = [detail[0], detail[2], detail[3]]

        replacelist = []
        insertlist = []
        replacequeue = []
        file_d = codecs.open("song_detail", 'a', encoding='utf-8')
        file_d.seek(0)
        file_d.truncate()
        idsmap = {}
        for indexid in ids:
            idsmap[indexid[1]] = indexid[0]
        for song in self.songmap:
            songdetail = self.songmap[song]
            if song in resultmap:
                dbdetail = resultmap[song]
                replacelist.append(
                    (dbdetail[0], song, classify, songdetail[2],
                     songdetail[0] + dbdetail[1], songdetail[1] + dbdetail[2]))
            else:
                file_d.write(u'' + str([
                    song, u'' + str(u'' + songdetail[2].replace(',', ' '))
                    [0:20], classify, songdetail[0], songdetail[1]
                ])[1:-1] + '\n')
                insertlist.append((song, songdetail[2], classify,
                                   songdetail[0], songdetail[1]))
        for playlist in self.finishlist:
            replacequeue.append((idsmap[playlist], playlist, classify, 1))
        file_d.close()
        if len(insertlist):
            self.db_song_detail(insertlist, 'Insert', replacequeue)
        if len(replacelist):
            self.db_song_detail(replacelist, 'Update', [])

    def db_song_detail(self, waitlist, types, replacequeue):
        """
        batch insert/update song detail
        """

        if types == 'Update':
            results = self.Db.update_db(self.replace_song %
                                        str(waitlist)[1:-1])
        else:
            results = self.Db.update_db(self.insert_song)
        if results:
            if len(waitlist):
                print(types + ' song detail for ' + waitlist[0][2] + ' ' +
                      str(len(waitlist)) + ' Success!')
            if types == 'Insert':
                self.replace_queue_db(replacequeue)

    def replace_queue_db(self, replacequeue):
        """
        mark finished playlist ids in the db
        """

        results = self.Db.update_db(self.replace_queue %
                                    str(replacequeue)[1:-1])
        if results:
            if len(replacequeue):
                print('Update queue finish for ' + str(len(replacequeue)) +
                      ' item!')
        else:
            pass
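A plausible driver for this class, based only on the methods defined above (the call order is an assumption):

    crawler = Get_playlist_song()
    crawler.get_playlist_id_thread()   # fill playlist_queue for every classify
    crawler.get_song_detail_thread()   # walk the queue and store song details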
Example #14
 def __init__(self):
     cmd.Cmd.__init__(self)
     self.prompt = "> "
     self.db = Db(
     )  # i don't feel great about how this db is created and passed around
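One way to address that comment (a sketch, not part of the original code) is to inject the Db instance instead of constructing it inside __init__, so callers and tests can supply their own:

 def __init__(self, db=None):
     cmd.Cmd.__init__(self)
     self.prompt = "> "
     # hypothetical: accept an externally created Db and fall back to the default
     self.db = db if db is not None else Db()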
Example #15
class find_location(object):
    """
    find location
    """
    def __init__(self):
        self.Db = Db("china_regions")
        china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
        self.province = list(china.groupby(by=['Province']).count().axes[0])
        self.city = list(china.groupby(by=['City']).count().axes[0])
        self.filelists = [
            'google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail',
            'bjh_detail_poison', 'news_steal.txt', 'news_poison.txt'
        ]
        self.city_province = {}
        self.province_map = {}

        self.pre_data()
        for index, row in china.iterrows():
            self.city_province[row['City']] = row['Province']

    def search_location(self):
        word = ''
        count = 0
        for file in self.filelists:
            temp_word_list = codecs.open(file, 'r',
                                         encoding='utf-8').readlines()
            count += len(temp_word_list)
            word += " ".join(temp_word_list)
        # return word
        print(count)
        word_province = {}
        word_city = {}
        word_city_pro = {}
        for index in self.province:
            temp_num = word.count(index)
            if temp_num:
                word_province[index] = temp_num
        for index in self.city:
            temp_num = word.count(index)
            if temp_num:
                word_city[index] = temp_num
        for index in word_city:
            province = self.city_province[index]
            if province in word_city_pro:
                word_city_pro[province] += word_city[index]
            else:
                word_city_pro[province] = word_city[index]
        print(sum(word_province.values()), sum(word_city.values()),
              sum(word_city_pro.values()))
        return word_province, word_city, word_city_pro

    def participles_word(self):
        """
        segment words in the collected files
        """
        version = begin_time()

        for file in self.filelists:
            pkuseg.test(file,
                        file[:-4] + '_pkuseg.txt',
                        model_name='../Model_retrieval/pkuseg',
                        nthread=20)
        end_time(version)

    def pre_data(self):
        """
        load city key-value from mysql
        """
        province = self.Db.select_db(
            'select * from china_regions where level=1')
        self.province_map = {
            int(index[2]): index[3][:3]
            if len(index[3]) == 4 or len(index[3]) == 6 else index[3][:2]
            for index in province
        }

        city = self.Db.select_db('select * from china_regions where level=2')
        city_state = [index for index in city if index[3][-1:] == '州']
        seg = pkuseg.pkuseg()
        city_state = {
            seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else
            seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1])
            for index in city if index[3][-1:] == '州'
        }
        seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg')
        city_state1 = {
            seg.cut(index)[0] if len(seg.cut(index)[0]) > 1 else
            seg.cut(index)[0] + seg.cut(index)[1]: city_state[index]
            for index in city_state
        }
        city_area = {
            index[3][:-2]: int(index[1])
            for index in city if '地区' in index[3]
        }
        city_other = {
            index[3][:-1]: int(index[1])
            for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟'
        }
        self.city_province = {**city_state1, **city_area, **city_other}
        self.city_province = {
            index: self.province_map[self.city_province[index]]
            for index in self.city_province
        }
        county = self.Db.select_db('select * from china_regions where level=3')
        county_area_pre = {index for index in county if index[3][-1] == '区'}
        county_area_two = {
            index[3][:-2]: int(index[1][:2])
            for index in county_area_pre if len(index[3]) > 3 and (
                index[3][-2] == '矿' or index[3][-2] == '林')
        }
        # print('芒' in county_area_two, 'two')
        county_area_state = {
            seg.cut(index[3][:-2])[0]: int(index[1][:2])
            for index in county_area_pre
            if len(index[3]) > 2 and index[3][-2] == '族'
        }
        # print('芒' in county_area_state, 'state')
        county_area_other = {
            index[3][:-1]: int(index[1][:2])
            for index in county_area_pre
            if len(index[3]) > 2 and index[3][-2] != '族'
            and index[3][-2] != '林' and index[3][-2] != '矿'
        }
        # print('芒' in county_area_other, 'other')
        county_county_pre = {index for index in county if index[3][-1] == '县'}
        county_county_two = {
            index[3]: int(index[1][:2])
            for index in county_county_pre if len(index[3]) == 2
        }
        # print('芒' in county_county_two, 'two')
        seg = pkuseg.pkuseg()
        county_county_state = {
            seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else
            seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1][:2])
            for index in county_county_pre
            if len(index[3]) > 2 and index[3][-3:-1] == '自治'
        }
        county_county_state = {
            index[:-2] if '族' in index and len(index) > 3 else index:
            county_county_state[index]
            for index in county_county_state
        }
        # print('芒' in county_county_state, 'state')
        county_county_other = {
            index[3][:-1]: int(index[1][:2])
            for index in county_county_pre
            if index[3][-3:-1] != '自治' and len(index[3]) > 2
        }
        # print('芒' in county_county_other, 'other')
        county_city = {
            index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2])
            for index in county if index[3][-1] == '市'
        }
        # print('芒' in county_city, 'city')
        county_domain = {
            index[3][:4]: int(index[1][:2])
            for index in county if index[3][-1] == '域'
        }
        # print('芒' in county_domain, 'domain')
        county_other = {
            index[3]: int(index[1][:2])
            for index in county if index[3][-1] == '盟' or index[3][-1] == '岛'
        }
        # print('芒' in county_other, 'other')
        county_province = {
            **county_area_two,
            **county_area_state,
            **county_area_other,
            **county_county_two,
            **county_county_state,
            **county_county_other,
            **county_city,
            **county_domain,
            **county_other
        }
        county_province = {
            index: self.province_map[county_province[index]]
            for index in county_province
        }
        self.city_province = {**self.city_province, **county_province}
        print({index for index in self.city_province if len(index) == 1})

    def test_province(self, maps, words):
        word_city = {}
        for index in maps:
            temp_num = words.count(index)
            province = maps[index]
            if temp_num:
                if province in word_city:
                    word_city[province] += temp_num
                else:
                    word_city[province] = temp_num
        print(sum(word_city.values()))
        return word_city
Example #16
import pytz
import pandas as pd
from utils.db import Db, get_prices

# connect to Db
_ = Db(host="localhost", user="******", password="******", db="go_finance")

def load(symbols, start, end, is_adj=True):
    data = dict()

    # load prices
    r = get_prices(symbols=symbols, dt_from=end, period=(end - start).days, is_adj=is_adj)

    for symbol in symbols:
        symbol_data = r['symbol'] == symbol
        data[symbol] = pd.DataFrame({
            'open': r['open'][symbol_data],
            'high': r['high'][symbol_data],
            'low': r['low'][symbol_data],
            'close': r['close'][symbol_data],
            'volume': r['volume'][symbol_data],
        }, index=r['dt'][symbol_data])

    panel = pd.Panel(data)

    panel.major_axis = panel.major_axis.tz_localize(pytz.utc)

    return panel
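Note that pd.Panel was deprecated in pandas 0.20 and removed in pandas 1.0, so the snippet above only runs on old pandas versions. A rough modern equivalent (a sketch reusing the data dict built above) is a single DataFrame with a (symbol, field) column MultiIndex:

    # concatenate the per-symbol frames along the columns, keyed by symbol
    panel_like = pd.concat(data, axis=1)   # columns become a MultiIndex of (symbol, field)
    panel_like.index = panel_like.index.tz_localize(pytz.utc)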
Example #17
 def __init__(self):
     Db.__init__(self)
     self.table = 'credentials'
     self.connect()
Example #18
class GetFreeProxy(object):
    """
    proxy getter
    """
    def __init__(self):
        self.Db = Db()
        self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
        self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
        self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
        self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5'''
        self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
        self.typemap = {1: 'https', 0: 'http'}
        self.canuseip = []
        self.waitjudge = []
        self.proxylist = []
        self.proxylists = []
        self.cannotuseip = []
        self.failuredtime = {}
        self.initproxy()

    def get_request_proxy(self, url, host, types):
        """
        use a proxy to send requests, and record proxies that can't be used
        @types 1: json, 0: html
        supports retry on failure
        """

        if not len(self.proxylist):
            self.initproxy()

        httptype = url[4] == 's'
        index = random.randint(
            0,
            len(self.proxylists if httptype else self.proxylist) - 1)
        if httptype:
            proxies = {'https': self.proxylists[index]}
        else:
            proxies = {'http': self.proxylist[index]}

        try:
            if types:
                json = get_json(url, proxies, host)
                if 'code' in json and json['code'] != 200:
                    ppap = self.retry(url, host, types)
                    if not ppap:
                        return False
                else:
                    return json
            else:
                html = get_html(url, proxies, host)
                if 'code' in html or not html:
                    ppap = self.retry(url, host, types)
                    if not ppap:
                        return False
                else:
                    return html
        except Exception as e:
            self.cannotuseip.append(proxies[self.typemap[httptype]])
            if httptype:
                if index < len(self.proxylists) and proxies[
                        'https'] == self.proxylists[index]:
                    self.proxylists.remove(proxies['https'])
            else:
                if index < len(self.proxylist
                               ) and proxies['http'] == self.proxylist[index]:
                    self.proxylist.remove(proxies['http'])
            ppap = self.retry(url, host, types)
            if not ppap:
                return False

    def retry(self, url, host, types):
        """
        retry once
        """

        if url not in self.failuredtime:
            self.failuredtime[url] = 0
            # print("retry " + str(self.failuredtime[url]))
            self.get_request_proxy(url, host, types)
        elif self.failuredtime[url] < 3:
            self.failuredtime[url] += 1
            # print("retry " + str(self.failuredtime[url]))
            self.get_request_proxy(url, host, types)
        else:
            # print("Request Failured three times!")
            self.log_write(url)
            self.failuredtime[url] = 0
            return False

    def log_write(self, url):
        """
        failure log
        """

        file_d = open("log", 'a')
        file_d.write(
            time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()) + url + '\n')
        file_d.close()

    def insertproxy(self, insertlist):
        """
        insert data to db
        """
        results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
        if results:
            print('Insert ' + str(len(insertlist)) + ' items Success!')
        else:
            pass

    def updateproxy(self, updatelist, types):
        """
        update data to db
        """

        results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
        typemap = {0: 'can use ', 1: 'can not use '}
        if results:
            print('Update ' + typemap[types] + str(len(updatelist)) +
                  ' items Success!')
        else:
            pass

    def selectproxy(self, targetlist):
        """
        select ip proxy by ids
        """
        if not len(targetlist):
            return []
        elif len(targetlist) == 1:
            waitlist = '(\'' + targetlist[0] + '\')'
        else:
            waitlist = tuple(targetlist)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def dbcanuseproxy(self):
        """
        check whether these proxies already exist in the db
        """

        results = self.selectproxy(self.canuseip)

        insertlist = []
        updatelist = []
        ipmap = {}
        if results:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.canuseip:
                http_type = ip_now[4] == 's'
                if ip_now in ipmap:
                    if ipmap[ip_now][1]:

                        updatelist.append(
                            (ipmap[ip_now][0], ip_now, http_type, 0))
                else:
                    insertlist.append((ip_now, http_type))
            if len(insertlist):
                self.insertproxy(insertlist)
            if len(updatelist):
                self.updateproxy(updatelist, 0)
        else:
            pass
        self.canuseip = []

    def cleancannotuse(self):
        """
        mark proxies in the db that can't be used
        """
        results = self.selectproxy(self.cannotuseip)
        updatelist = []
        ipmap = {}
        if results:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.cannotuseip:
                http_type = ip_now[4] == 's'
                if ip_now in ipmap:
                    updatelist.append((ipmap[ip_now][0], ip_now, http_type,
                                       ipmap[ip_now][1] + 1))

            if len(updatelist):
                self.updateproxy(updatelist, 1)
        else:
            pass
        self.cannotuseip = []

    def initproxy(self):
        """
        init proxy list
        """

        results = self.Db.select_db(self.select_list)
        if results != 0:
            self.proxylist = []
            self.proxylists = []
            for index in results:
                if index[1]:
                    self.proxylists.append(index[0])
                else:
                    self.proxylist.append(index[0])
            print(str(len(self.proxylist)) + ' http proxy can use.')
            print(str(len(self.proxylists)) + ' https proxy can use.')
        else:
            pass

    def judgeurl(self, urls, times):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        criteria: 1. the request must not time out (timeout = 5)
                  2. response.result.tracks.size() != 1
        """

        http_type = urls[4] == 's'
        proxies = {self.typemap[http_type]: urls}

        test_url = 'https://music.163.com/discover/playlist/?order=hot&limit=35&offset=0' if http_type else 'http://music.163.com/api/playlist/detail?id=432853362'
        if http_type:
            try:
                html = get_html(test_url, proxies, test_url[8:21])
                alist = html.find_all('a', class_='s-fc1')
                if len(alist) == 73:
                    self.canuseip.append(urls)
                else:
                    self.cannotuseip.append(urls)
            except Exception as e:
                self.cannotuseip.append(urls)
                pass
        else:
            try:
                data = get_json(test_url, proxies, test_url[7:20])
                result = data['result']
                tracks = result['tracks']
                # playlist 432853362 should contain exactly 56 tracks; require the
                # check to succeed repeatedly before trusting the proxy
                if len(tracks) == 56:
                    if times < 2:
                        self.judgeurl(urls, times + 1)
                    else:
                        self.canuseip.append(urls)
                else:
                    self.cannotuseip.append(urls)
            except Exception:
                self.cannotuseip.append(urls)
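
    # note: self.typemap is defined elsewhere in this class (not shown in this
    # excerpt) and is assumed to map the http_type flag to a scheme name, e.g.
    # {False: 'http', True: 'https'}, so a made-up candidate such as
    # 'https://1.2.3.4:8080' yields proxies = {'https': 'https://1.2.3.4:8080'}.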

    def threadjude(self):
        """
        validate the pending proxies in self.waitjudge, 1000 threads per batch
        """

        text = self.waitjudge
        num = len(text)
        for block in range(int(num / 1000) + 1):
            blockthreads = []
            for index in range(block * 1000, min(num, 1000 * (block + 1))):
                work = threading.Thread(target=self.judgeurl,
                                        args=(
                                            text[index],
                                            0,
                                        ))
                blockthreads.append(work)
            for work in blockthreads:
                work.start()
            for work in blockthreads:
                work.join()
            self.dbcanuseproxy()
            self.cleancannotuse()
        self.waitjudge = []

    def testdb(self):
        '''
        re-validate every proxy already stored in the db
        '''

        begin_time()
        results = self.Db.select_db(self.select_all % '(0, 1)')  # both http and https rows
        if results != 0:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        self.initproxy()
        end_time()

    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        the first source I used, but most of its proxies are no longer usable.
        """

        if not str(page).isdigit():
            print("Please input num!")
            return []

        begin_time()
        host = 'http://www.xicidaili.com/nn/'
        for index in range(1, page + 1):
            html = get_html(host + str(index), {}, host[7:-4])
            # html = self.get_request_proxy(host + str(index), host[7:-4], 0)
            tem = html.find_all('tr')
            for row in range(1, len(tem)):
                tds = tem[row].find_all('td')
                protocol = tds[5].text.lower()
                self.waitjudge.append(protocol + '://' + tds[1].text + ':' +
                                      tds[2].text)
        self.threadjude()
        end_time()

    def gatherproxy(self, types):
        """
        :100: very nice website
        first of all you should download proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """

        begin_time()
        with open('proxy/gatherproxy', 'r') as file_d:
            for line in file_d.readlines():
                address = line.strip()
                if types == 0:
                    self.waitjudge.append('http://' + address)
                elif types == 1:
                    self.waitjudge.append('https://' + address)
                else:
                    self.waitjudge.append('http://' + address)
                    self.waitjudge.append('https://' + address)
        self.threadjude()
        end_time()

    def goubanjia(self):
        """
        :-1: html tag mixed with invalid data
        :100:And the most important thing is the port writed in 'class' rather in text.
        The website is difficult to spider, but the proxys are very goog
        goubanjia proxy http://www.goubanjia.com
        """

        begin_time()
        host = 'http://www.goubanjia.com'
        html = self.get_request_proxy(host, host[7:], 0)

        if not html:
            return []
        trs = html.find_all('tr', class_=['warning', 'success'])
        for tr in trs:
            tds = tr.find_all('td')
            ip = tds[2].find_all('a')[0].text + '://'
            # collect the ip fragments from the un-classed div/span children;
            # the decoy tags and the port element carry class attributes and are
            # skipped because class_=False only matches tags without a class
            iplist = tds[0].find_all(['div', 'span', not 'p'],
                                     class_=not 'port')
            for index in iplist:
                ip += index.text
            # the real port is hidden in the second class name of the 'port'
            # element: letters 'A'..'J' encode digits 0..9 and the concatenated
            # number divided by 8 gives the port
            encode = tds[0].find_all(['div', 'span', 'p'],
                                     class_='port')[0]['class'][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord('A')),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ':' + str(int(uncode / 8)))
        self.threadjude()
        end_time()
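
    # Illustrative sketch (not part of the original class): the port decoding
    # above in standalone form — letters 'A'..'J' map to digits 0..9 and the
    # concatenated number divided by 8 is the real port, e.g. 'DBC' -> 312 -> 39.
    def _decode_goubanjia_port_example(self, encoded):
        return functools.reduce(
            lambda x, y: x * 10 + (ord(y) - ord('A')), encoded, 0) // 8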

    def schedulegou(self):
        """
        re-crawl goubanjia every 100 seconds
        """
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, 'interval', seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        hardly any of its proxies are usable
        """

        begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.get_request_proxy(host + uri, host[7:-1], 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                ip = tds[3].text
                self.waitjudge.append(ip + '://' + tds[0].text + ':' +
                                      tds[1].text)
        self.threadjude()
        end_time()

    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                print(str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time()

    def sixsixthread(self, index, pageindex):
        host = '''http://www.66ip.cn/areaindex_%d/%d.html'''
        html = self.get_request_proxy(host % (index, pageindex), host[7:-21],
                                      0)
        if not html:
            return []
        trs = html.find_all('table')[2].find_all('tr')
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all('td')
            self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text)
            self.waitjudge.append('https://' + tds[0].text + ':' + tds[1].text)

    def kuaidaili(self, page):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread,
                                    args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time()

    def kuaidailithread(self, index):
        host = '''https://www.kuaidaili.com/free/inha/%d/'''
        html = self.get_request_proxy(host % index, host[8:25], 0)
        if not html:
            return []
        trs = html.find_all('tr')
        for index in range(1, len(trs)):
            tds = trs[index].find_all('td')
            ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text
            self.waitjudge.append(ip)
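
# A hedged usage sketch for this proxy pool; the class name GetFreeProxy is an
# assumption (the class definition line sits outside this excerpt), so treat
# these lines as illustration only:
# if __name__ == '__main__':
#     pool = GetFreeProxy()
#     pool.gatherproxy(2)  # judge both http and https candidates from proxy/gatherproxy
#     pool.testdb()        # re-validate every proxy already stored in ip_proxy
#     print(len(pool.proxylist), 'http /', len(pool.proxylists), 'https proxies ready')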
Пример #19
0
class TitleViews(object):
    """
    update title views
    """
    def __init__(self):
        self.Db = Db("blog")
        self.local_views = {}
        self.title_map = {}
        self.title2slug = {}
        self.failured_map = {}
        self.zhihu_views = {}
        self.zhihu_id = {}
        self.jianshu_views = {}
        self.jianshu_id = {}
        self.csdn_views = {}
        self.csdn_id = {}
        self.exist_data = {}
        self.zhihu_id_map = {}
        self.csdn_id_map = {}
        self.jianshu_id_map = {}
        self.getTitleMap()
        self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
        self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
        self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''

    def loadLocalView(self):
        """
        load local page views from the exported google file (csv lines: url, views)
        """
        if not os.path.exists("%sgoogle" % data_dir):
            return
        with codecs.open("%sgoogle" % data_dir, 'r', encoding='utf-8') as f:
            test = f.readlines()
        test = test[7:]  # skip the report header lines
        for index in test:
            arr = index.split(',')
            slug = self.matchSlug(arr[0])
            if slug is None or slug not in self.title_map:
                continue
            print(slug + ' ' + str(arr[1]) + ' ' + arr[0])
            if slug in self.local_views:
                self.local_views[slug] += int(arr[1])
            else:
                self.local_views[slug] = int(arr[1])

    def getTitleMap(self):
        """
        build the slug -> title map and the reverse title -> slug map (with and without emoji)
        """
        if os.path.exists('%sslug' % data_dir):
            with codecs.open('%sslug' % data_dir, 'r', encoding='utf-8') as f:
                slug = f.readlines()
        else:
            slug = []
        if os.path.exists('%stitle' % data_dir):
            with codecs.open('%stitle' % data_dir, 'r', encoding='utf-8') as f:
                title = f.readlines()
        else:
            title = []
        self.title_map = {
            tempslug.split('"')[1]: title[num].split('"')[1]
            for num, tempslug in enumerate(slug)
        }
        title2slug = {
            self.title_map[index]: index
            for index in self.title_map.keys()
        }
        noemoji_title = {
            self.filter_emoji(self.title_map[index]).replace('\u200d', ''):
            index
            for index in self.title_map.keys()
        }
        self.title2slug = {**noemoji_title, **title2slug}
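
    # Assumed input format for the slug / title files read above (values are
    # made up): each line carries one double-quoted field, e.g.
    #     slug  file line:  slug: "hello-world"
    #     title file line:  title: "Hello World"
    # so title_map becomes {'hello-world': 'Hello World'} and title2slug maps
    # both the raw and the emoji-stripped title back to the slug.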

    def matchSlug(self, pattern):
        """
        extract the article slug from a url path ending in .html
        """
        arr = re.search(r'\/([^\/]+)\.html', pattern)
        return None if arr is None else arr.group(1)
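
    # e.g. matchSlug('https://blog.example.com/2019/hello-world.html') -> 'hello-world'
    # (a made-up url; any path whose last segment ends in .html works)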

    def getZhihuView(self):
        """
        fetch zhihu read counts from the creator content statistics api
        """
        if os.path.exists('%scookie' % data_dir):
            with codecs.open('%scookie' % data_dir, 'r',
                             encoding='utf-8') as f:
                cookie = f.readline()
        else:
            cookie = ' '
        changeCookie(cookie[:-1])
        url_basic = [
            'https://www.zhihu.com/api/v4/creator/content_statistics/',
            'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
            datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no='
        ]
        url = "".join(url_basic)
        json = self.get_request(url + '1', 1)
        if not json:
            return
        if 'data' not in json:
            if 'code' in json:
                print(json)
            return
        for index in json['data']:
            zhihu_title = index['title']
            zhihu_id = int(index['url_token'])
            zhihu_count = int(index['read_count'])

            if zhihu_title in self.title2slug:
                temp_slug = self.title2slug[zhihu_title]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            elif zhihu_id in self.zhihu_id_map:
                temp_slug = self.zhihu_id_map[zhihu_id]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            else:
                print(index['title'])

        for index in range(json['count'] // 10):
            print('zhihu', index)
            json = self.get_request(url + str(index + 2), 1)
            if not json:
                continue
            for index in json['data']:
                zhihu_title = index['title']
                zhihu_id = int(index['url_token'])
                zhihu_count = int(index['read_count'])

                if zhihu_title in self.title2slug:
                    temp_slug = self.title2slug[zhihu_title]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                elif zhihu_id in self.zhihu_id_map:
                    temp_slug = self.zhihu_id_map[zhihu_id]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                else:
                    print(index['title'])

    def get_request(self, url, types):

        result = basic_req(url, 1)

        if not result:
            if can_retry(url):
                # return the retried result instead of silently dropping it
                return self.get_request(url, types)
            return
        return result

    def get_request_v2(self, url, types, header):

        result = get_request_proxy(url, 0, header=header)

        if not result or not len(result.find_all('div', class_='content')):
            if can_retry(url):
                return self.get_request_v2(url, types, header)
            return
        return result

    def get_request_v3(self, url, types):

        result = basic_req(url, 0)

        if not result or not len(result.find_all('p', class_='content')):
            if can_retry(url):
                return self.get_request_v3(url, types)
            return
        return result
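
    # can_retry is defined outside this excerpt; it is assumed to count failures
    # per url and return False once a retry limit is reached, which is what keeps
    # the recursive retries in the three helpers above bounded.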

    def getJianshuViews(self):
        """
        get jianshu views
        """
        header = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'sec-ch-ua': 'Google Chrome 75',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'cross-site',
            'sec-fetch-user': '******',
            'sec-origin-policy': '0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3736.0 Safari/537.36'
        }

        basic_url = 'https://www.jianshu.com/u/2e0f69e4a4f0'

        for rounds in range(1, 4):
            url = basic_url if rounds == 1 else basic_url + \
                '?order_by=shared_at&page=' + str(rounds)
            print(url)
            html = self.get_request_v2(url, 0, header)
            if html is None:
                print('None')
                return
            for index in html.find_all('li', class_=["", 'have-img']):
                if len(index.find_all('i')) < 3:
                    continue
                title = index.find_all('a', class_='title')[0].text.replace(
                    '`', '')
                jianshu_id = int(index['data-note-id'])
                jianshu_count = int(index.find_all('a')[-2].text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                elif jianshu_id in self.jianshu_id_map:
                    temp_slug = self.jianshu_id_map[jianshu_id]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                else:
                    print(title)

    def getCsdnViews(self):
        """
        get csdn views
        """

        basic_url = "https://blog.csdn.net/iofu728"

        for index in range(1, 3):
            url = basic_url if index == 1 else basic_url + \
                '/article/list/' + str(index) + '?'

            html = self.get_request_v3(url, 0)
            if html is None:
                print('None')
                return
            for div_lists in html.find_all(
                    'div', class_='article-item-box csdn-tracking-statistics'):
                if 'style' in div_lists.attrs:
                    continue
                csdn_id = int(div_lists['data-articleid'])
                title = div_lists.a.contents[2].replace('\n', '').strip().replace('`', '')
                csdn_count = int(
                    div_lists.find_all('span', class_='read-num')[0].span.text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                elif csdn_id in self.csdn_id_map:
                    temp_slug = self.csdn_id_map[csdn_id]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                else:
                    print(title)

    def filter_emoji(self, desstr, restr=''):
        '''
        filter emoji
        '''
        desstr = str(desstr)
        try:
            co = re.compile(u'[\U00010000-\U0010ffff]')  # wide (ucs-4) python build
        except re.error:
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')  # narrow (ucs-2) build
        return co.sub(restr, desstr)
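
    # e.g. filter_emoji('Python 🐍 tips') -> 'Python  tips' (made-up title);
    # getTitleMap uses this so emoji-free titles coming back from the apis still
    # match a slug.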

    def init_db(self):
        self.loadLocalView()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        insert_list = []
        for index in self.title_map.keys():
            insert_list.append(
                (index,
                 self.local_views.get(index, 0),
                 self.zhihu_views.get(index, 0),
                 self.csdn_views.get(index, 0),
                 self.jianshu_views.get(index, 0),
                 self.zhihu_id.get(index, 0),
                 self.csdn_id.get(index, 0),
                 self.jianshu_id.get(index, 0)))
        # return insert_list
        results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
        if results:
            if len(insert_list):
                print('Insert ' + str(len(insert_list)) + ' Success!')

    def select_all(self):
        result = self.Db.select_db(
            "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0"
        )
        if result is False:
            print("SELECT Error!")
        else:
            self.exist_data = {index[1]: list(index) for index in result}
            self.zhihu_id_map = {
                index[6]: index[1]
                for index in result if index[6]
            }
            self.csdn_id_map = {
                index[7]: index[1]
                for index in result if index[7]
            }
            self.jianshu_id_map = {
                index[8]: index[1]
                for index in result if index[8]
            }
            for index in self.exist_data:
                self.exist_data[index][-1] = self.exist_data[index][
                    -1].strftime('%Y-%m-%d %H:%M:%S')

    def update_view(self):
        changeHtmlTimeout(10)
        wait_map = {}
        self.select_all()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        for index in self.zhihu_views.keys():
            if self.zhihu_views[index] == self.exist_data[index][
                    3] and self.zhihu_id[index] == self.exist_data[index][6]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][3] = self.zhihu_views[index]
            wait_map[index][6] = self.zhihu_id[index]
        for index in self.csdn_views.keys():
            if self.csdn_views[index] == self.exist_data[index][
                    4] and self.csdn_id[index] == self.exist_data[index][7]:
                continue
            if index not in wait_map:
                wait_map[index] = self.exist_data[index]
            wait_map[index][4] = self.csdn_views[index]
            wait_map[index][7] = self.csdn_id[index]
        for index in self.jianshu_views.keys():
            if self.jianshu_views[index] == self.exist_data[index][
                    5] and self.jianshu_id[index] == self.exist_data[index][8]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][5] = self.jianshu_views[index]
            wait_map[index][8] = self.jianshu_id[index]
        update_list = [tuple(index) for index in wait_map.values()]
        # return update_list
        if not len(update_list):
            return
        results = self.Db.update_db(self.update_sql % str(update_list)[1:-1])
        if results:
            if len(update_list):
                print('Update ' + str(len(update_list)) + ' Success!')

    def new_day(self):
        day_data = self.Db.select_db(
            "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1"
        )
        if not os.path.exists('../blog/log/basic'):
            print('File not exist!!!')
            return
        with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f:
            existed_spider = int(f.readlines()[1])
        today_date = datetime.datetime.now().strftime('%Y-%m-%d')
        new_day_list = [(today_date, day_data[0][0] + day_data[0][1],
                         existed_spider)]
        results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1])
        if results:
            if len(new_day_list):
                print('New day update ' + str(len(new_day_list)) + ' Success!')
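
# A hedged usage sketch for TitleViews; the run order is inferred from the
# method names above, not documented in this excerpt:
# if __name__ == '__main__':
#     tv = TitleViews()
#     # tv.init_db()    # first run only: seed title_views from every source
#     tv.update_view()  # refresh zhihu / csdn / jianshu read counts
#     tv.new_day()      # append today's row to page_views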
Пример #20
0
\N{ROBOT FACE} Downloading the full #StackOverflow history from the @waybackmachine for {duration} now

At this point I have read {fcount} files
"""


def timedelta_format(td):
    d = td.days
    s = td.seconds
    h, s = divmod(s, HOUR)
    m, s = divmod(s, MINUTE)

    return "{} days, {} hours and {} minutes".format(d, h, m)


db = Db(DB_URI, timeout=DB_TIMEOUT)
while True:
    try:
        orig = datetime(2020, 2, 1, 4, 17)
        delta = datetime.today() - orig
        ts = timedelta_format(delta)

        fcount = db.fcount()

        msg = MSG.format(duration=ts, fcount=fcount)
        print()
        print(msg)
        TWEET_CMD = ['t', 'update', msg]
        subprocess.run(TWEET_CMD)
    except Exception as e:
Пример #21
0
 def setUp(self):
     self.db_to_test = Db()
     pass