Example #1
File: acquire2.py  Project: qianOU/feiyan
def get_detail(df, cik):
    result = df.date1.to_frame().copy()

    info = list()
    for i, (date, url, type_) in enumerate(df.values):
        headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            print('{}: report for {} fetched successfully!'.format(cik, date))
            if '<?xml' in response.text[:400]:
                xml = etree.fromstring(response.content)
                text = etree.tostring(xml, encoding='unicode')
            else:
                text = str(pq.PyQuery(response.text).text())
            temp = word_count(text)
            temp['date1'] = date
            temp['link'] = url
            temp['type'] = type_
            one = pd.DataFrame(temp, index=[i])
            info.append(one)
    if len(info) > 0:
        agg = pd.concat(info, axis=0)
        answer = pd.merge(result, agg, how='left', on='date1')
        return answer
    return cik
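
A minimal usage sketch for get_detail, assuming the surrounding module defines requests, pandas as pd, word_count and a module-level timeout, and that the frame has exactly the three columns (date1, link, type) that df.values is unpacked into; the DataFrame contents below are hypothetical.

import pandas as pd

df = pd.DataFrame({
    'date1': ['2020-01-01'],
    'link': ['https://www.sec.gov/Archives/edgar/data/EXAMPLE/filing.htm'],  # hypothetical filing URL
    'type': ['10-K'],
})
detail = get_detail(df, cik='EXAMPLE_CIK')  # DataFrame on success, otherwise the cik is returned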
Example #2
 def _parse_catalog(self):
     """
     Request self.catalog_url and fetch the novel's table-of-contents page.
     :return: the set of links to all chapter detail pages
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # de-duplicate in memory
     detail_urls = set()
     # Pattern 1: https://www.kanunu8.com/book3/8257/
     for a in doc('table:nth-child(2) > tbody > tr > td > a').items():
         detail_url = urllib.request.urljoin(self.catalog_url, a.attr.href)
         if detail_url in detail_urls:
             # already seen, skip
             continue
         if self.HOST not in detail_url:
             # not a link on this site, skip
             continue
         detail_urls.add(detail_url)
     # Pattern 2: https://www.kanunu8.com/book2/10946/index.html
     for a in doc('div.col-left > div > dl > dd > a').items():
         detail_url = urllib.request.urljoin(self.catalog_url, a.attr.href)
         if detail_url in detail_urls:
             # already seen, skip
             continue
         if self.HOST not in detail_url:
             # not a link on this site, skip
             continue
         detail_urls.add(detail_url)
     return detail_urls
Example #3
def getSpinDetails(url, source):
    d = pyquery.PyQuery(urlread(url))
    spin = {
        'name': '',
        'summary': '',
        'description': '',
        'releaseDate': '',
        'logo': 'qrc:/logo-fedora.svg',
        'screenshots': [],
        'source': '',
        'variants': {
            '': dict(url='', sha256='', size=0)
        }
    }
    spin['source'] = source

    spin['name'] = d('title').html().strip()
    screenshot = d('img').filter('.img-responsive').attr('src')
    if screenshot:
        spin['screenshots'].append(url + "/.." + screenshot)

    for i in d('div').filter('.col-sm-8').html().split('\n'):
        #line = i.strip().replace('<p>', '').replace('</p>', '')
        line = i.strip()
        if len(line):
            spin['description'] += line

    download = getDownload(url + "/.." + d('a.btn').attr('href'))
    spin['variants'] = download
    #spin['release'] = getRelease(download)

    if 'KDE Plasma' in spin['name']:
        spin['logo'] = 'qrc:/kde_icon.png'
    if 'Xfce' in spin['name']:
        spin['logo'] = 'qrc:/xfce_icon.png'
    if 'LXDE' in spin['name']:
        spin['logo'] = 'qrc:/lxde_icon.png'
    if 'MATE' in spin['name']:
        spin['logo'] = 'qrc:/mate_icon.png'
    if 'SoaS' in spin['name']:
        spin['logo'] = 'qrc:/soas_icon.png'

    if 'Astronomy' in spin['name']:
        spin['logo'] = 'qrc:/astronomy_icon_green.png'
    if 'Design' in spin['name']:
        spin['logo'] = 'qrc:/design-suite_icon_green.png'
    if 'Games' in spin['name']:
        spin['logo'] = 'qrc:/games_icon_green.png'
    if 'Jam' in spin['name']:
        spin['logo'] = 'qrc:/jam_icon_green.png'
    if 'Robotics' in spin['name']:
        spin['logo'] = 'qrc:/robotics-suite_icon_green.png'
    if 'Scientific' in spin['name']:
        spin['logo'] = 'qrc:/scientific_icon_green.png'
    if 'Security' in spin['name']:
        spin['logo'] = 'qrc:/security-lab_icon_green.png'

    return spin
Example #4
def getSpins(url, source):
    d = pyquery.PyQuery(urlread(url))
    spins = []

    for i in d('div').filter('.high').items('span'):
        spinUrl = url + i.siblings()('a').attr('href')
        spin = getSpinDetails(spinUrl, source)
        spin['summary'] = i.html()
        spins.append(spin)

    return spins
Example #5
 def _parse_detail(content):
     """
     Parse the chapter page content and return the title and body text.
     :param content: HTML of the chapter page
     :return: (title, body text)
     """
     doc = pq.PyQuery(content)
     title = doc(
         '#wrapper > div.content_read > div > div.bookname > h1').text()
     title = CommonTool.fix_title(title)
     content = doc('#content').text()
     return title, content
Example #6
 def _parse_detail(content):
     """
     Parse the chapter page content and return the title and body text.
     :param content: HTML of the chapter page
     :return: (title, body text)
     """
     doc = pq.PyQuery(content)
     title = doc('#directs > div.bookInfo > h1 > strong').text().replace(
         "正文", "").strip()
     title = CommonTool.fix_title(title)
     content = doc('#content').text()
     content = content.replace('style6();', '').replace('style5();', '')
     return title, content
Example #7
 def get_kuaidaili_proxies(pages=5):
     for page in range(1, pages + 1):
         url = "http://www.kuaidaili.com/free/inha/{0}/".format(page)
         response = requests.get(url, headers=HEADERS)
         if response.status_code == 200:
             pq = pyquery.PyQuery(response.text)
         else:
             print(response.status_code)
             return
         for item in pq("tbody > tr"):
             td = item.findall('td')
             yield td[0].text+":"+td[1].text
         time.sleep(5)
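
get_kuaidaili_proxies is a generator, so the pages are fetched lazily as it is consumed; a hedged usage sketch, assuming HEADERS, requests, pyquery and time are defined at module level as in the snippet above:

for proxy in get_kuaidaili_proxies(pages=2):
    print(proxy)  # ip:port strings scraped from kuaidaili, e.g. "1.2.3.4:8080"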
Example #8
def getProductDetails(url, name):
    d = pyquery.PyQuery(urlread(url))
    product = {
        'name': '',
        'summary': '',
        'description': '',
        'releaseDate': '',
        'logo': 'qrc:/logo-fedora.svg',
        'screenshots': [],
        'source': '',
        'variants': {
            '': dict(url='', sha256='', size=0)
        }
    }
    product['name'] = name
    product['source'] = name

    product['summary'] = d('h1').html()

    for i in d(
            'div.col-md-8, div.col-sm-8, div.col-md-5, div.col-md-6, div.col-sm-5, div.col-sm-6'
    ).items('p, h3, h2'):
        i.remove('a, br, img')
        if i.parent().parent()('blockquote'):
            i = i.parent().parent()('blockquote')
            product['description'] += '<blockquote>'
            product['description'] += str(i('p'))
            product['description'] += '<p align=right> ― <em>' + i(
                'cite').html() + '</em></p>'
            product['description'] += '</blockquote>'
        elif i.html() and len(i.html()) > 0:  # can't remove empty tags with :empty for some reason
            product['description'] += str(i)
            product['description'] = product['description'].replace('h2', 'h4')
            product['description'] = product['description'].replace('h3', 'h4')

    if name == "Workstation":
        product['logo'] = 'qrc:/logo-color-workstation.png'
    if name == "Cloud":
        product['logo'] = 'qrc:/logo-color-cloud.png'
    if name == "Server":
        product['logo'] = 'qrc:/logo-color-server.png'

    download = getDownload(url + "/download")
    product['variants'] = download
    #product['release'] = getRelease(download)

    return product
Example #9
def getDownload(url):
    d = pyquery.PyQuery(urlread(url))
    ret = dict()
    url = d('a.btn-success').attr('href')
    ret[getArch(url)] = dict(
        url=url,
        sha256=getSHA(url),
        size=getSize(d('a.btn-success').parent().parent()('h5').text()))
    for e in d.items("a"):
        if "32-bit" in e.html().lower() and e.attr("href").endswith(".iso"):
            altUrl = e.attr("href")
            ret[getArch(altUrl)] = dict(url=altUrl,
                                        sha256=getSHA(altUrl),
                                        size=getSize(e.text()))
            break
    return ret
Example #10
 def translate(self, text, target_language='ru', source_language='auto'):
     headers = {'User-Agent': self.user_agent}
     params = {
         'q': text,
         'hl': target_language,
         'sl': source_language,
         'ie': 'UTF-8',
         'prev': '_m'
     }
     response = requests.get(GOOGLE_URL, params=params, headers=headers)
     if response.status_code != requests.codes.ok:
         print(response.reason)
         exit(1)
     pq = pyquery.PyQuery(response.text)
     translated = pq.find('div.t0').text()
     return translated
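
A hedged usage sketch: translate is written as an instance method, so it assumes an enclosing class that supplies self.user_agent and a module-level GOOGLE_URL (presumably the mobile Google Translate endpoint); the client class name below is hypothetical.

translator = GoogleTranslateClient()  # hypothetical class that defines user_agent and wraps this method
print(translator.translate('Hello, world', target_language='ru', source_language='en'))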
Example #11
def getSHA(url):
    baseurl = '/'.join(url.split('/')[:-1])
    filename = url.split('/')[-1]
    d = pyquery.PyQuery(urlread(baseurl))
    checksum = ''
    for i in d.items('a'):
        if 'CHECKSUM' in i.attr('href'):
            checksum = urlread(baseurl + '/' + i.attr('href'))
            break

    for line in checksum.split('\n'):
        i = re.match(r'^SHA256 \(([^)]+)\) = ([a-f0-9]+)$', line)
        if i:
            if i.group(1) == filename:
                return i.group(2)
    return ''
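
The regex in getSHA expects the BSD-style lines found in Fedora CHECKSUM files; a small self-contained illustration of the format it parses (the filename and digest below are made up):

import re

line = 'SHA256 (Fedora-Workstation-Live-x86_64-38-1.6.iso) = ' + 'a' * 64  # hypothetical entry
m = re.match(r'^SHA256 \(([^)]+)\) = ([a-f0-9]+)$', line)
print(m.group(1), m.group(2)[:8])  # the filename and the start of its digest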
Example #12
def getProducts(url='https://getfedora.org/'):
    d = pyquery.PyQuery(urlread(url))

    products = []

    for i in d('div.productitem').items('a'):
        productUrl = url
        if i.attr('href').startswith("../"):
            productUrl += i.attr('href')[3:]
        else:
            productUrl += i.attr('href')
        productName = i('h4').html()

        if productName != "Cloud":
            products.append(getProductDetails(productUrl, productName))

    return products
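
A hedged usage sketch: getProducts crawls the product tiles on getfedora.org and, via getProductDetails and getDownload, returns a list of product dicts; it assumes urlread and the other helpers from this module are available and that the site layout still matches the selectors above.

products = getProducts('https://getfedora.org/')
for p in products:
    print(p['name'], sorted(p['variants']))  # product name plus the architectures found on its download page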
Example #13
    def _parse_detail(content):
        """
        Parse the chapter page content and return the title and body text.
        :param content: HTML of the chapter page
        :return: (title, body text)
        """
        doc = pq.PyQuery(content)
        title = doc('tr:nth-child(1) > td > strong > font').text()
        content = doc('td:nth-child(2) > p').text()
        if '' == title:
            # pattern 1 did not find a title, fall back to pattern 2
            title = doc('#Article > h1').text()
            title = title.split('\n')[0]
            content = doc('#Article > div > p:not([align])').text()
            content = content.replace('  ', '\n')

        return title, content
Example #14
 def _parse_catalog(self):
     """
     Request self.catalog_url and fetch the novel's table-of-contents page.
     :return: the set of links to all chapter detail pages
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # de-duplicate in memory
     detail_urls = set()
     for a in doc('#list > dl > dd > a').items():
         detail_url = a.attr.href
         if detail_url in detail_urls:
             # already seen, skip
             continue
         detail_url = urllib.request.urljoin(self.HOST, detail_url)
         detail_urls.add(detail_url)
     return detail_urls
Example #15
 def _parse_catalog(self):
     """
     Request self.catalog_url and fetch the novel's table-of-contents page.
     :return: the set of links to all chapter detail pages
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # de-duplicate in memory
     detail_urls = set()
     for a in doc(
             '#chapter > div.chapterSo > div.chapterNum > ul > div.clearfix.dirconone  li > a'
     ).items():
         detail_url = a.attr.href
         if detail_url in detail_urls:
             # already seen, skip
             continue
         if self.HOST not in detail_url:
             # not a link on this site, skip
             continue
         detail_urls.add(detail_url)
     return detail_urls
Example #16
def retrieve_vine_video_url(vine_url):
    log('--Retrieving vine url')
    d = pyquery.PyQuery(url=vine_url)
    video_url = d("meta[property=twitter\\:player\\:stream]").attr['content']
    video_url = video_url.partition("?")[0]
    return video_url
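
A hedged usage sketch, assuming the module-level log helper exists; the Vine short code below is a placeholder, not a real video:

video_url = retrieve_vine_video_url('https://vine.co/v/XXXXXXXXXXX')  # placeholder id
print(video_url)  # direct stream URL with the query string stripped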
Example #17
    def fetch_content(self, url):
        """
        Fetches the content of a URL, extracts app links from it and
        pushes them onto the queue. Then parses the content to
        determine whether it is an app page and, if so, pushes the parsed
        result onto the `results` queue for later processing.

        This logic is executed inside green threads. You
        shouldn't spawn new green threads here, as this is not the
        parent and trouble may arise.
        """
        resp = urllib.urlopen(url)

        # silently ignores errors, even though the script will not
        # block here.
        if resp.getcode() == 404:
            return
        elif resp.getcode() != 200:
            # this is a slight problem, it shouldn't happen but it
            # does sometimes
            self.failed += 1
            return

        try:
            content = resp.read()
            doc = pq.PyQuery(content)

            # we must do our best to ignore pages that are not
            # relevant (music, movies, other pages that don't have
            # links to apps in them)
            if not self.is_page_valid(url, doc):
                return         

            # I like keeping a log of URLs processed
            sys.stderr.write(url + "\n")

            # fetches links in this page, by regular expressions. 
            # we are interested in app links and publisher links.
            all_links = [
                a.attrib['href']
                for a in doc('a') 
                if re.search(r'\/(details|developer)[?]', a.attrib.get('href', '')) \
                and not re.search('reviewId', a.attrib.get('href', '')) \
                and not re.search('accounts\/ServiceLogin', a.attrib.get('href', ''))
            ]

            # pushing new links down the queue for processing later
            for link in all_links:
                if not link: continue
                self.queue.put(self.absolute_url(link))

            # fetches app info from the fetched content, but ONLY in
            # case the URL is about an actual app
            app_info = self.fetch_app_info(url, doc)
            if app_info:
                # prevents going to already visited IDs
                self.seen_app_ids.add(app_info['uid'])                
                self.results.put(app_info)
        except:
            # we must ignore exceptions as sometimes we don't make the
            # best assumptions. Some fields may be missing, the page's
            # format can change slightly, etc... when I ran the script
            # the first time it froze halfway-through and had to start
            # all over again
            pass
Example #18
            if ex.code == 404:
                return

            # this is a slight problem, it shouldn't happen but it
            # does sometimes, so keeping tracking is useful to see how
            # often it does happen
            self.failed += 1
            return

        except urllib2.URLError:
            self.failed += 1
            return

        try:
            content = resp.read()
            doc = pq.PyQuery(content)

            # we must do our best to ignore pages that are not
            # relevant (music, movies, other pages that don't have
            # links to apps in them)
            if not self.is_page_valid(url, doc):
                return

            # I like keeping a log of URLs processed
            sys.stderr.write(url + "\n")

            # fetches links in this page, by regular expressions.
            # we are interested in app links and publisher links.
            all_links = [
                a.attrib['href']
                for a in doc('a')
Example #19
    def getCookie(self, username='******', passwd='gd19691818'):
        count = 1
        # simulate a browser login
        from selenium import webdriver
        from selenium.common.exceptions import TimeoutException
        from selenium.webdriver import ActionChains  # used below for the slider captcha drag
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.wait import WebDriverWait

        url = 'https://login.taobao.com/member/login.jhtml?'

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')  # hide the browser window
        browser = webdriver.Chrome(chrome_options=chrome_options)

        # add request header information (alternative driver setups, kept for reference)
        # dcap = dict(DesiredCapabilities.CHROME)
        # dcap['chorme.page.settings.userAgent'] = self.ua
        # browser = webdriver.Chrome(desired_capabilities=dcap)
        # browser = webdriver.Chrome()
        # browser = webdriver.Ie(desired_capabilities=dcap)
        # browser = webdriver.Ie()  # IE browser
        wait = WebDriverWait(browser, 10)
        browser.get(url)
        browser.maximize_window()

        # switch to username/password login (click the toggle first, otherwise the input fields cannot be located)
        element = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Quick2Static')))

        element.click()
        # time.sleep(1)

        # locate the username input, the password input and (later) the login button
        input_username = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#TPL_username_1')))
        input_passwd = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#TPL_password_1')))

        # clear any existing content in the input fields
        input_username.clear()
        input_passwd.clear()
        # time.sleep(random.random())
        # fill in the username and password
        input_username.send_keys(username)
        input_passwd.send_keys(passwd)
        # time.sleep(random.random())

        # keep dragging the slider captcha; if the '哎呀...' error text does not appear, verification passed, so break out of the loop
        while True:
            # time.sleep(10)
            slider = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#nc_1_n1z')))
            action = ActionChains(browser)
            # action.click_and_hold(slider)  # click and hold
            for index in range(10):
                try:
                    # action.move_by_offset(index * 50, 0).perform()
                    # drag the slider horizontally by 500 px in one smooth motion
                    action.drag_and_drop_by_offset(slider, 500, 0).perform()
                except Exception:
                    # dragging past the end raises an exception; break out of the loop
                    break
            # action.release().perform()
            error = pyquery.PyQuery(browser.page_source)('.nc-lang-cnt').text()
            print(error)

            if error.startswith('哎呀'):
                count += 1
                print('-------------------------------------- attempt %s' % count)
                restart = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#nocaptcha > div > span > a')))
                restart.click()
                # time.sleep(random.random())
            else:
                break
        # time.sleep(random.random())
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_SubmitStatic')))
        submit.click()

        cookie = [
            item["name"] + "=" + item["value"]
            for item in browser.get_cookies()
        ]
        cookiestr = ';'.join(item for item in cookie)
        print(cookiestr)
        self.cookie = cookiestr
        return cookiestr
Example #20
from pyquery import pyquery

doc = pyquery.PyQuery('http://www.baidu.com')
print(doc)
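
PyQuery treats a string that looks like a URL as something to fetch, so the snippet above downloads the page and prints its HTML; a hedged follow-up showing the more common pattern of querying the fetched document with a CSS selector:

from pyquery import pyquery

doc = pyquery.PyQuery('http://www.baidu.com')  # fetched over HTTP because the string looks like a URL
print(doc('title').text())  # page title text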