Пример #1
0
def diy(body, pg_connect):
    """Resolve the cloud-drive link(s) for one queued item and store them.

    ``body`` is the textual form of a dict; its first key is used as the item
    name and the matching value as the URL of the item's detail page. The
    page is fetched, the link(s) and accompanying text are scraped from the
    ``div.part`` element(s), and ``[name, url, password]`` is handed to
    ``send_pg``. Nothing is stored when the page does not answer with
    HTTP 200. The call always ends with a 0.5 s pause.

    Args:
        body: String that evaluates to a ``{name: page_url}`` dict.
        pg_connect: Forwarded unchanged as the first argument of ``send_pg``.
    """
    cloudpan_url = 'null'
    cloudpan_pass = '******'
    # NOTE(review): eval() executes arbitrary code. If ``body`` can come from
    # an untrusted producer, replace it with ast.literal_eval after confirming
    # the payload is always a plain dict literal.
    msg = eval(body)
    print('{} {}'.format(msg.keys(), msg.values()))
    fkey = list(msg.keys())[0]
    ourl = OpenUrl(msg[fkey])
    code, content = ourl.run()
    if code == 200:
        selecter = etree.HTML(content)
        try:
            # Keep the raw xpath result in its own name so the placeholder
            # in ``cloudpan_url`` is only replaced by a real string; before,
            # a page without links stored an empty list.
            links = selecter.xpath('//div[@class="part"]/a/@href')
            if len(links) == 1:
                cloudpan_url = links[0]
                cloudpan_pass = selecter.xpath(
                    '//div[@class="part"]/text()')[2]
            elif len(links) == 2:
                cloudpan_url = '|'.join(links)
                cloudpan_pass = '******'.join(
                    selecter.xpath('//div[@class="part"]/text()')[2:4])
            elif links:
                # More than two links: only the first one is kept.
                cloudpan_url = links[0]
                # NOTE(review): this takes the first character of the
                # placeholder string; kept as-is, intent unclear.
                cloudpan_pass = cloudpan_pass[0]
        except Exception as exc:
            # Best effort: whatever was resolved before the failure (or the
            # placeholders) is still stored, but the failure is now visible.
            print('[{}] failed to parse download info: {}'.format(fkey, exc))
        send_pg(pg_connect, [fkey, cloudpan_url, cloudpan_pass])
    time.sleep(0.5)
Пример #2
0
 def get_download_url(self):
     """Resolve and store the cloud-drive link(s) for every cached page.

     Iterates over all keys of ``self.__redis_link`` (presumably a Redis
     client; keys are decoded with ``.decode()`` for logging), fetches the
     URL stored under each key and scrapes the link(s) and accompanying
     text from the ``div.part`` element(s). Successful results are passed to
     ``self.send_pg`` as ``[key, url, password]`` followed by a 0.5 s pause;
     every failure is logged and the key is skipped.
     """
     for fkey in self.__redis_link.keys():
         url = self.__redis_link.get(fkey)
         ourl = OpenUrl(url)
         code, content = ourl.run()
         if code != 200:
             logger.error('[%s] can not open the download page..' %
                          fkey.decode())
             continue

         selecter = etree.HTML(content)
         try:
             links = selecter.xpath('//div[@class="part"]/a/@href')
             if len(links) == 1:
                 cloudpan_url = links[0]
                 cloudpan_pass = selecter.xpath(
                     '//div[@class="part"]/text()')[2]
             elif len(links) == 2:
                 cloudpan_url = '|'.join(links)
                 cloudpan_pass = '******'.join(
                     selecter.xpath('//div[@class="part"]/text()')[2:4])
             else:
                 logger.error(
                     '[{}] donot has cloudpan download link...'.format(
                         fkey.decode()))
                 continue
         except Exception:
             # ``Exception`` rather than a bare ``except`` so that
             # KeyboardInterrupt/SystemExit can still stop the crawl.
             logger.error('[{}] miss something..'.format(fkey.decode()))
             continue

         self.send_pg([fkey, cloudpan_url, cloudpan_pass])
         time.sleep(0.5)
 def _get_content(self,url):
     """Fetch ``url``; return its content, or ``None`` unless the status is 200."""
     status, body = OpenUrl(url).run()
     return body if status == 200 else None
Пример #4
0
def get_pages(start_url):
    """Read a page number out of the pagination bar of ``start_url + '.html'``.

    The href of the last link inside ``div.page`` is split on the characters
    ``/ | . -`` and the fourth piece is converted to ``int``.

    Returns:
        The parsed number, or ``None`` (after logging an error) when the page
        does not answer with HTTP 200.
    """
    url = start_url + '.html'
    fetcher = OpenUrl(url)
    code, html = fetcher.run()
    if code != 200:
        logger.error('get [{0}] failed: [{1}]'.format(url, code))
        return None

    tree = etree.HTML(html)
    last_href = tree.xpath('//div[@class="page"]/a/@href')[-1]
    pieces = re.split('[/|.|-]', last_href)
    return int(pieces[3])
Пример #5
0
 def gethtml(self, url):
     '''
        Fetch the page at ``url``.
        Returns the page content on HTTP 200; otherwise prints a failure
        message and returns None.
     '''
     status, page = OpenUrl(url).run()
     if status != 200:
         print('open [{}] failed..'.format(url))
         return None
     return page
Пример #6
0
def get_price():
    """Scrape the price shown on the Linux Foundation certificate page.

    The text of the first ``span.text-red.mr-2.text-sm`` element is taken,
    commas are removed and the value is truncated to an integer.

    Returns:
        The price as ``int``, or ``None`` when the page does not answer with
        HTTP 200 or the price cannot be located/parsed.
    """
    ourl = OpenUrl('https://training.linuxfoundation.cn/certificate/details/1')
    code, html = ourl.run()
    if code != 200:
        return None

    selecter = etree.HTML(html)
    try:
        tmp = str(
            selecter.xpath('//span[@class="text-red mr-2 text-sm"]/text()')
            [0])
        return int(float(tmp.replace(',', '')))
    except Exception:
        # Parsing stays best-effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return None
Пример #7
0
def getMovieInfo(url):
    """Scrape name, publish time and download link of one movie page.

    Args:
        url: Path appended to ``https://www.dytt8.net/``.

    Returns:
        A dict with the keys ``name``, ``public_time`` and ``downlink``, or
        ``None`` when the page does not answer with HTTP 200 or any of the
        three fields cannot be extracted.
    """
    full_url = 'https://www.dytt8.net/' + url
    # 'gb2312' is passed through to OpenUrl; presumably the page encoding.
    ourl = OpenUrl(full_url,'gb2312')
    code,html = ourl.openurl()
    if code != 200:
        return None

    selecter = etree.HTML(html)
    info = {}
    try:
        info['name'] = selecter.xpath("//div[@class='title_all']/h1/font/text()")[0]
        # The text is split on ':' and the part after the first colon is kept.
        info['public_time'] = selecter.xpath("//div[@class='co_content8']/ul/text()")[0].strip().split(':')[1]
        info['downlink'] = selecter.xpath("//tbody/tr/td/a/text()")[0]
    except Exception:
        # A missing element means the page is unusable; unlike the previous
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        return None
    return info
Пример #8
0
def get_useful_url(start_url, redis_conn):
    """Collect every ``/html...`` link from the listing pages into Redis.

    The first listing page is ``start_url + '.html'``; page ``n`` (n > 1) is
    ``start_url + '-n.html'``. Every href on a page that starts with
    ``/html`` is printed and pushed onto the ``simi`` list of ``redis_conn``.
    Pages that do not answer with HTTP 200 are logged and skipped.

    Args:
        start_url: Listing URL without the ``.html`` suffix.
        redis_conn: Object providing ``lpush(name, value)``.
    """
    all_page = get_pages(start_url)
    if all_page is None:
        # get_pages() yields None when the first page cannot be fetched;
        # range(1, None) would otherwise raise TypeError.
        logger.error(
            'can not determine the page count of [{0}]'.format(start_url))
        return

    # NOTE(review): range() stops before ``all_page``, so the page with that
    # number is never visited. Confirm this is intended.
    for page in range(1, all_page):
        if page == 1:
            url = start_url + '.html'
        else:
            url = start_url + '-' + str(page) + '.html'
        ourl = OpenUrl(url)
        code, html = ourl.run()
        if code != 200:
            logger.error('get [{0}] failed: [{1}]'.format(url, code))
            continue
        selecter = etree.HTML(html)
        for urls in selecter.xpath('//a/@href'):
            if urls.startswith('/html'):
                print(urls)
                redis_conn.lpush('simi', urls)
Пример #9
0
def get_img(redis_conn, url):
    """Download every image of one gallery page into the ``image/`` folder.

    The page is fetched, the ``src`` of each ``<img>`` inside the main
    content ``div`` is collected, and each image is saved under its original
    file name. A 0.5 s pause precedes every download. Each download is
    logged as success or failure; one failure does not stop the rest.

    Args:
        redis_conn: Not used by this function; kept for the callers.
        url: Site-relative page path as UTF-8 encoded ``bytes``.
    """
    ourl = OpenUrl('https://se.haodd92.com/' + url.decode('utf-8'))
    code, html = ourl.run()
    if code != 200:
        return

    selecter = etree.HTML(html)
    img_url_list = selecter.xpath(
        '//div[@class="center margintop border clear main"]/img/@src')
    for img_url in img_url_list:
        time.sleep(0.5)
        img_name = img_url.split('/')[-1]
        local = 'image/{}'.format(img_name)
        try:
            r = requests.get(img_url, stream=True)
            # Without this check an HTTP error page (404, 503, ...) would be
            # written to disk and reported as a successful download.
            r.raise_for_status()
            with open(local, 'wb') as f:
                f.write(r.content)
            logger.info('download [{0}] to [{1}] successfully'.format(
                img_url, img_name))
        except Exception as e:
            logger.error('download [{0}] to [{1}] failed: [{2}]'.format(
                img_url, img_name, e))
Пример #10
0
def get_url(ftype, sender):
    """Walk the listing pages of one category and publish every article link.

    The category front page supplies the page count (second space-separated
    token of a pagination ``<span>``). For each listing page from 1 up to,
    but not including, that count, every ``article > header > h2 > a`` link
    is sent through ``sender.send_date`` as the string form of
    ``{title: href + '#prettyPhoto/0/'}``. A 1 s pause follows each page that
    was fetched successfully; pages that fail are skipped without a pause.
    The process exits when the front page cannot be fetched.

    Args:
        ftype: Category path appended to the site root.
        sender: Object providing ``send_date(str)``.
    """
    main_url = 'https://www.hanfan.cc/'
    category_url = main_url + ftype

    front = OpenUrl(category_url)
    code, main_content = front.run()
    if code != 200:
        print("bad url: {}".format(main_url))
        sys.exit()

    front_tree = etree.HTML(main_content)
    pager_text = front_tree.xpath(
        '/html/body/section/div[1]/div/div[2]/ul/li[8]/span/text()')[0]
    pages = int(pager_text.split(' ')[1])

    for page in range(1, pages):
        page_url = category_url + '/page/%s/' % page
        listing = OpenUrl(page_url)
        sub_code, sub_content = listing.run()
        if sub_code != 200:
            continue

        listing_tree = etree.HTML(sub_content)
        for anchor in listing_tree.xpath('//article/header/h2/a'):
            title = anchor.text
            target = anchor.attrib['href'] + '#prettyPhoto/0/'
            sender.send_date(str({title: target}))
        time.sleep(1)
Пример #11
0
    def get_url(self, ftype):
        """Cache the article links of every listing page of one category.

        The category front page supplies the page count (second
        space-separated token of a pagination ``<span>``). For each listing
        page from 1 up to, but not including, that count, every
        ``article > header > h2 > a`` link is stored in ``self.__redis_link``
        as ``title -> href + '#prettyPhoto/0/'`` with ``ex=21600``. A 1 s
        pause follows each page fetched successfully; failing pages are
        logged and skipped. The process exits when the front page cannot be
        fetched.
        """
        front = OpenUrl(self.main_url + ftype)
        code, main_content = front.run()
        if code != 200:
            logger.error("bad url: %s" % self.main_url)
            sys.exit()

        front_tree = etree.HTML(main_content)
        pager_text = front_tree.xpath(
            '/html/body/section/div[1]/div/div[2]/ul/li[8]/span/text()'
        )[0]
        pages = int(pager_text.split(' ')[1])

        for page in range(1, pages):
            page_url = self.main_url + ftype + '/page/%s/' % page
            listing = OpenUrl(page_url)
            sub_code, sub_content = listing.run()
            if sub_code != 200:
                logger.error('[%s] can not open...' % page_url)
                continue

            listing_tree = etree.HTML(sub_content)
            for anchor in listing_tree.xpath('//article/header/h2/a'):
                title = anchor.text
                target = anchor.attrib['href'] + '#prettyPhoto/0/'
                self.__redis_link.set(title, target, ex=21600)

            time.sleep(1)
Пример #12
0
@Author  :   Kellan Fan 
@Version :   1.0
@Contact :   [email protected]
@Desc    :   None
'''

# here put the import lib
from lxml import etree
from misc.openurl import OpenUrl
from misc.pg_client import Mypostgres
from log.create_logger import create_logger

# Script: fetch the draw-history page and prepare INSERT statements for
# records whose opendate is greater than the latest one already stored.
logger = create_logger()

url = 'http://www.310win.com/shuangseqiu/tubiao_lshm.html'
ourl = OpenUrl(url)
code, doc = ourl.run()
pg_conn = Mypostgres()
# Latest opendate currently stored in the table.
# NOTE(review): the query runs even when the page fetch failed.
s_sql = 'select opendate from shuang_se_qiu order by opendate desc limit 1'
last_time = pg_conn.execute(s_sql)
if code == 200:
    selecter = etree.HTML(doc)
    # All records are read from the text of the span with id "spnHidValue".
    info_list = selecter.xpath("//span[@id='spnHidValue']/text()")
    # Records are separated by '#'; the fields of one record by '+'.
    for item in info_list[0].split('#'):
        item_info = item.split('+')
        # Skip pieces that did not split into several fields.
        if len(item_info) > 1:
            # The first field has the form "<something>&<opendate>".
            opendate = item_info[0].split('&')[1]
            issue_num = item_info[1]
            # The third field has the form "<first part>|<second part>".
            # NOTE(review): "r_nunber" looks like a typo for "r_number";
            # check how it is used past this excerpt.
            r_nunber, b_number = item_info[2].split('|')
            # NOTE(review): last_time[0][0] assumes the table already has at
            # least one row - confirm.
            if opendate > last_time[0][0]:
                sql = "insert into shuang_se_qiu(opendate, issue_num, r_number, b_number) values (%s,%s,%s,%s)"
Пример #13
0
from log.create_logger import create_logger

# Script: starting after the last stored article id, fetch articles one id at
# a time and insert their title, author and content into MongoDB.
logger = create_logger()
base_url = 'http://wufazhuce.com/article/'

mongo_client = pymongo.MongoClient("mongodb://mongodb:27017/")
db = mongo_client["spider"]
coll = db["one"]
# Takes the article_id of the last document after sorting by article_id.
# NOTE(review): this loads the whole collection into a list and raises
# IndexError when the collection is empty.
cur_last_id = list(coll.find().sort('article_id'))[-1]['article_id']

# NOTE(review): not used anywhere in this excerpt.
fail_time = 0

# NOTE(review): no exit condition is visible in this excerpt.
while True:
    cur_last_id += 1
    data = {}
    ourl = OpenUrl(base_url + str(cur_last_id))
    code, doc = ourl.run()
    if code == 200:
        soup = BeautifulSoup(doc, 'lxml')
        data['article_id'] = cur_last_id
        # Each find() result is dereferenced directly, so a page lacking one
        # of these elements raises AttributeError here.
        data["title"] = soup.find('h2', class_='articulo-titulo').text.strip()
        data["autor"] = soup.find('p', class_='articulo-autor').text.strip()
        data["content"] = soup.find('div',
                                    class_='articulo-contenido').text.strip()

        try:
            coll.insert_one(data)
            logger.info("insert [{}] successful".format(data["title"]))
        except Exception as e:
            # A failed insert is logged and the loop moves on to the next id.
            logger.error("insert [{0}] failed: [{1}]".format(data["title"], e))
            continue
Пример #14
0
    movie_url.remove('/html/gndy/dyzz/index.html')    
    return movie_url

def getMovieInfo(url):
    """Scrape name, publish time and download link of one movie page.

    Args:
        url: Path appended to ``https://www.dytt8.net/``.

    Returns:
        A dict with the keys ``name``, ``public_time`` and ``downlink``, or
        ``None`` when the page does not answer with HTTP 200 or any of the
        three fields cannot be extracted.
    """
    full_url = 'https://www.dytt8.net/' + url
    # 'gb2312' is passed through to OpenUrl; presumably the page encoding.
    ourl = OpenUrl(full_url,'gb2312')
    code,html = ourl.openurl()
    if code != 200:
        return None

    selecter = etree.HTML(html)
    info = {}
    try:
        info['name'] = selecter.xpath("//div[@class='title_all']/h1/font/text()")[0]
        # The text is split on ':' and the part after the first colon is kept.
        info['public_time'] = selecter.xpath("//div[@class='co_content8']/ul/text()")[0].strip().split(':')[1]
        info['downlink'] = selecter.xpath("//tbody/tr/td/a/text()")[0]
    except Exception:
        # A missing element means the page is unusable; unlike the previous
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        return None
    return info

if __name__ == "__main__":
    # Entry point: read the movie links from the front page, scrape each one
    # and print the list of movies that could be parsed.
    start_url = 'https://www.dytt8.net/'
    code, html = OpenUrl(start_url).openurl()
    if code == 200:
        scraped = map(getMovieInfo, getMovieUrl(html))
        # Pages that could not be parsed yield a falsy result and are dropped.
        info_list = [info for info in scraped if info]
        print(info_list)
Пример #15
0
            info['name'] = selecter.xpath(
                "//div[@class='title_all']/h1/font/text()")[0]
            info['public_time'] = selecter.xpath(
                "//div[@class='co_content8']/ul/text()")[0].strip().split(
                    ':')[1]
            info['downlink'] = selecter.xpath("//tbody/tr/td/a/text()")[0]
            return info
        except:
            return None
    else:
        return html


if __name__ == "__main__":
    # Entry point: scrape every movie linked from the index page, then look
    # up the newest public_time already stored in PostgreSQL.
    start_url = 'https://www.dytt8.net/'
    ourl = OpenUrl(start_url + 'index.htm', )
    code, html = ourl.run()
    info_list = []
    if code == 200:
        movie_list = getMovieUrl(html)
        for url in movie_list:
            tmp = getMovieInfo(url)
            # Pause one second after every movie page request.
            time.sleep(1)
            # Keep only truthy results from getMovieInfo.
            if tmp:
                info_list.append(tmp)
    else:
        # On a failed fetch the second value returned by run() is printed
        # before the script stops.
        print(html)
        exit()
    postgresql = Mypostgres()
    select_cmd = 'select public_time from dian_ying_tian_tang order by public_time desc limit 1'
    # NOTE(review): [0][0] assumes the table has at least one row - confirm.
    last_time = postgresql.execute(select_cmd)[0][0].strip()
Пример #16
0
def gethtml(url):
    """Fetch ``url``; return its content, or ``None`` unless the status is 200."""
    status, page = OpenUrl(url).run()
    return page if status == 200 else None