Example #1
import requests
from pyquery import PyQuery


def GetDouBanMovie():
    a = 1
    with open('blog.txt', mode='a+', encoding='utf-8') as f:
        for i in range(0, 250, 25):  # page offset; note the original snippet never interpolates it into the URL
            url = "https://blog.csdn.net/nav/blockchain"
            r = requests.get(url)
            for blog in PyQuery(r.content)(".list_con"):
                title = PyQuery(blog).find(".csdn-tracking-statistics").find('a').html()
                title = title.replace('\n', '')  # drop embedded newlines in the title
                num = PyQuery(blog).find(".num").html()
                s = "%s: 博客:%s  阅读量:%s \n" % (a, title, num)
                f.write(s)
                a += 1
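
Most snippets on this page follow the same basic PyQuery pattern: wrap the HTML, select with a CSS selector, re-wrap each matched element, and read its text, HTML or attributes. A minimal, self-contained sketch of that pattern against a hypothetical HTML fragment (no network access needed):

from pyquery import PyQuery

# Hypothetical HTML fragment standing in for a fetched listing page.
html = """
<div class="list_con">
  <a class="post" href="/p/1">First post</a><span class="num">120</span>
</div>
<div class="list_con">
  <a class="post" href="/p/2">Second post</a><span class="num">45</span>
</div>
"""

doc = PyQuery(html)
for item in doc(".list_con"):
    entry = PyQuery(item)                # re-wrap the bare lxml element
    title = entry.find("a.post").text()  # text of the link
    reads = entry.find(".num").text()    # view count, as a string
    print("%s  (reads: %s)" % (title, reads))
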
Example #2
File: pils.py  Project: Nierot/niebot
    async def search(self, search_term):
        # First try finding crates with the search term as brand name
        webpage = await self.get_search({'zoeken': 'true', 'merk': search_term.replace(" ", "-"), 'kratten': 'krat-alle',
                                        'sorteer': 'prijs-oplopend'})
        url = PyQuery(webpage)('a.merkenUrl').attr('href')
        if url is None:
            # Try finding crates with the search term as search term
            webpage = await self.get_search({'zoeken': 'true', 'zoek': search_term.replace(" ", "+"),
                                            'kratten': 'krat-alle', 'sorteer': 'prijs-oplopend'})
            url = PyQuery(webpage)('a.merkenUrl').attr('href')
            if url is None:
                # Try finding other offers (not crates) with the search term as brand name
                webpage = await self.get_search({'zoeken': 'true', 'merk': search_term.replace(" ", "-"),
                                                 'sorteer': 'prijs-oplopend'})
                url = PyQuery(webpage)('a.merkenUrl').attr('href')
                if url is None:
                    # Try finding other offers with the search term as search term
                    webpage = await self.get_search({'zoeken': 'true', 'zoek': search_term.replace(" ", "+"),
                                                     'sorteer': 'prijs-oplopend'})
                    url = PyQuery(webpage)('a.merkenUrl').attr('href')
                    if url is None:
                        # If nothing is found, we throw an exception
                        raise ValueError(search_term + ' not found, or not on sale')

        host = "https://www.biernet.nl"

        first_result = PyQuery(PyQuery(webpage)('li.cardStyle')[0])
        # Get all the various information from the HTML page
        biernet_url = host + first_result('div.item_image')('a').attr('href')
        image = host + first_result('div.item_image')('a')('img').attr('data-src')
        brand = first_result('h3.merkenH3')('a')[0].text

        product = first_result('p.artikel')('a')[0].text
        product_name = first_result('div.item_image')('a')('img').attr('title')
        original_price = first_result('p.prijs')('span.van_prijs')[0].text
        sale_price = first_result('p.prijs')('span.voor_prijs')[0].text
        sale = PyQuery(first_result('div.informatie')('li.item')[0]).text()
        sale = sale.replace('korting', 'off')
        sale_price_liter = PyQuery(first_result('div.informatie')('li.item')[1]).text()
        end_date = first_result('div.footer-item')('span')[0].text
        end_date = end_date.replace("t/m ", "").strip()

        biernet_shop_url = host + first_result('div.logo_image')('a').attr('href')
        shop_name = biernet_shop_url.split('winkel:')[-1]
        shop_name = shop_name.replace('-', ' ').title()
        shop_image = host + first_result('div.logo_image')('a')('img').attr('data-src')

        shop_url = first_result('a.bestelknop').attr('href')
        if shop_url is None:
            shop_url = biernet_shop_url

        biernet_url = urllib.parse.quote(biernet_url, safe=':/%')
        image = urllib.parse.quote(image, safe=':/%')
        shop_url = urllib.parse.quote(shop_url, safe=':/%')
        shop_image = urllib.parse.quote(shop_image, safe=':/%')

        return {'url': biernet_url, 'brand': brand, 'name': product_name, 'img': image, 'product': product,
                'shop_name': shop_name, 'shop_url': shop_url, 'biernet_shop_url': biernet_shop_url, 'shop_img': shop_image,
                'original_price': original_price, 'sale_price': sale_price, 'sale': sale, 'PPL': sale_price_liter,
                'end_date': end_date}
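
The quote(..., safe=':/%') calls near the end percent-encode spaces and other unsafe characters while leaving the scheme separator, path slashes, and any existing %-escapes untouched. A quick illustration with a made-up URL:

import urllib.parse

url = "https://www.biernet.nl/winkel:some shop/korting 25%"  # hypothetical URL containing spaces
print(urllib.parse.quote(url, safe=':/%'))
# https://www.biernet.nl/winkel:some%20shop/korting%2025%
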
Example #3
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if url.find('60x60') > 0:
            url = url.replace('60x60', '400x400')
            url_list.append(url)
    needDescImg = True
    if needDescImg:
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
            return url_list
        desc_content = fetchPageWithUrl(link_url)
        # non-greedy match for <img> tags in the description HTML
        imgNodes = re.findall('<img[^<>]*>.*?', desc_content)
        #desc_content = re.sub('var[\s]*offer_details[\s]*=[\s]*', '', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:  # skip gif images
                continue
            #if '//gd' in desc_url or '/2015/' in desc_url:
            url_list.append(desc_url)
    return url_list
Example #4
File: moxing.py  Project: wdd257/moxing
def analysisPage(response):
    if response.status_code:
        global photos
        photos = re.findall(' zoomfile="(.*?)" ', response.text)  # image URLs
        folderName = PyQuery(response.text)("span#thread_subject").text()  # post title
        for ch in r'\/:|<>?*"':
            folderName = folderName.replace(ch, ' ⁂ ')  # replace characters that are illegal in file names
        downHtml(response, folderName)  # save the single page for convenient offline viewing
        text = PyQuery(response.text)
        # print(text)
        formhash = text("input[name='formhash']").attr('value')
        # print(formhash)
        try:
            urlPay = text("td[class='t_f'] ignore_js_op span a").attr('href')
            # print(urlPay)
            aid, tid = re.findall(r'(\d+)', urlPay)
        except TypeError:
            urlPay = text("ignore_js_op .attnm a").attr('href')
            # print(urlPay)
            aid, tid = re.findall(r'(\d+)', urlPay)
        except ValueError:
            print('This resource has already been parsed'.center(72, '-'))
            aid, tid = None, None
        except Exception:
            aid, tid = None, None
        return {
            'folderName': folderName,
            'formhash': formhash,
            'aid': aid,
            'tid': tid
        }
    print('The network is currently unavailable')
    return None
Example #5
def get_bounds(scene_name):
    """Use Earth Explorer metadata to get bounds of a Scene"""
    url_code = get_metadata_code(scene_name)

    metadata = PyQuery(
        'http://earthexplorer.usgs.gov/fgdc/%s/%s/' % (url_code, scene_name)
        )
    metadata = metadata.text()[
        metadata.text().find('G-Ring_Latitude:'):
        metadata.text().find('\n  Keywords:')
        ]
    coords = (
        metadata.replace(' ', '')
        .replace('G-Ring_Latitude:', '')
        .replace('G-Ring_Longitude:', '')
        .split('\n')
        )
    coords = [float(coord) for coord in coords if coord != '']
    # create a list of lists with the coordinates
    coords = [coords[i:i + 2] for i in range(0, len(coords), 2)]
    # reverse in place to change [lat, lon] to [lon, lat]
    for coord in coords:
        coord.reverse()
    # repeat the first coordinate on the end of the list
    if coords[0] != coords[-1]:
        coords.append(coords[0])
    return coords
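
Because the returned ring is already closed and in [lon, lat] order, it can be dropped straight into a GeoJSON Polygon. A minimal sketch using a made-up ring in place of a real Earth Explorer scene:

import json

# Hypothetical closed ring as get_bounds() would return it: [lon, lat] pairs,
# with the first coordinate repeated at the end.
coords = [[-52.1, -3.4], [-50.3, -3.6], [-50.6, -5.2], [-52.4, -5.0], [-52.1, -3.4]]

feature = {
    "type": "Feature",
    "properties": {"scene": "EXAMPLE_SCENE_ID"},  # placeholder scene name
    "geometry": {"type": "Polygon", "coordinates": [coords]},
}
print(json.dumps(feature, indent=2))
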
Example #7
File: utility.py  Project: wagaman/jgcf
def get_content_text(content):
    # content = [s.extract() for s in content('style')]
    content_text = PyQuery(str(content)).text()
    content_text = content_text.replace('\r\n', '\n').replace('\r', '\n')
    final_content_text = ''
    for each_text in content_text.split('\n'):
        each_final_text = remove_special_char(each_text).strip()
        if each_final_text != '':
            final_content_text += each_final_text + '\n'
    return final_content_text.strip()
Example #8
    def __init__(self, elem, trims, should_cleanup):
        text = PyQuery(elem).text()
        for trim in (trims or []):
            text = text.replace(trim, '')
        self.rx = re.compile(r'\W+')
        self.text = text.strip()
        self.trimmed_text = non_trimmed.sub(' ', self.text)
        self.html = PyQuery(elem).html()
        if should_cleanup:
            self.html = self.cleanup_html()
        self.normalized_text = nonword.sub('', text.lower())
Example #9
    def get_key_person_info(self, key_person_info):
        key_person_info_dict = {}
        lst_key_person = []

        page = self.get_crawl_page(key_person_info)
        if page is None or page == u'':
            # a failed or abnormal crawl always comes back as None and must not break the parsing below
            return key_person_info_dict

        json_data = util.json_loads(page)
        if json_data is None:
            return key_person_info_dict

        data_arr = json_data.get('data', [])
        if data_arr is None:
            return key_person_info_dict

        for data in data_arr:
            key_person_name_html = data.get('name', '')
            key_person_name = ''
            # strip the HTML markup from the name
            if key_person_name_html != '':
                key_person_name = PyQuery(
                    key_person_name_html,
                    parser='html').remove('span').remove('div').text()

            # the position field needs extra handling: it may be an image
            key_position_temp = data.get('position_CN', '')
            if string.find(key_position_temp, 'img') != -1:

                pic_md5 = util.get_match_value('"', '"', key_position_temp)
                m = hashlib.md5()
                m.update(pic_md5.strip().replace('\n', ''))
                psw = m.hexdigest()
                key_position = GsModel.get_md5_key_position(psw)
            else:
                key_position = key_position_temp

            if key_position is None:
                key_position = ''

            key_person = {
                GsModel.KeyPerson.KEY_PERSON_NAME:
                key_person_name.replace(" ", ""),
                GsModel.KeyPerson.KEY_PERSON_POSITION: key_position,
            }
            lst_key_person.append(key_person)

        if len(lst_key_person) > 0:
            key_person_info_dict[GsModel.KEY_PERSON] = lst_key_person

        return key_person_info_dict
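
The md5 step above appears to be Python 2 code (it feeds a str directly to m.update() and uses the old string module). Under Python 3 the same normalization needs an explicit encode. A small sketch, with a made-up value in place of the scraped image hash:

import hashlib

pic_md5 = "  d41d8cd98f00b204e9800998ecf8427e\n"  # hypothetical value extracted from the page

m = hashlib.md5()
m.update(pic_md5.strip().replace('\n', '').encode('utf-8'))  # bytes are required on Python 3
print(m.hexdigest())
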
Example #10
def detail_chapter(test_url, host_url):
    detail_dicts = []
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Edg/85.0.4183.83',
        'referer': test_url,
    }
    text = requests.get(test_url, headers=header).text
    comic_title = PyQuery(text)('.detail-main-info-title').text()
    for ch in r'\/:|<.・>?*"':
        comic_title = comic_title.replace(ch, '㇑')  # replace characters that are illegal in file names
    for li_tag in PyQuery(text)('.detail-list-select li'):
        chapter = PyQuery(li_tag)('a').text()
        for ch in r'\/:|<.・>?*"':
            chapter = chapter.replace(ch, '㇑')  # replace characters that are illegal in file names
        a_href = PyQuery(li_tag)('a').attr('href')
        detail_dict = {
            'chapter': convert(chapter, 'zh-hans'),
            'a_href': host_url + a_href,
            'comic_title': convert(comic_title, 'zh-hans'),
        }
        detail_dicts.append(detail_dict)
    return detail_dicts
Example #11
    def save_result(self, paste_id, paste_txt, file, directory):
        paste_url = self.PASTESRAW_URL + (paste_id if paste_id[0] == '/' else
                                          '/' + paste_id)
        fn, ext = os.path.splitext(os.path.split(file)[1])
        timestamp = get_timestamp()

        if paste_txt == '':
            content = urllib.request.urlopen(paste_url).read().strip()
            paste_txt = PyQuery(content)('#paste_code').text()
            #paste_txt = PyQuery(url=paste_url)('#paste_code').text()
        if fn == 'base64' and len(paste_txt) > 20:
            codes = ''
            r = re.findall(r'[\w\d+/=]{30,}', paste_txt)
            if r:
                for c in r:
                    if len(c) > len(codes):
                        codes = c
            try:
                i = (4 - len(codes) % 4) % 4
                if 0 < i < 3:
                    codes += "=" * i  # restore one or two missing '=' padding characters
                decodes = base64.b64decode(codes).decode('utf-8', errors='replace').strip()
                paste_txt = paste_txt.replace(codes, decodes).strip()
                if not re.search(r'\w+', paste_txt, flags=re.I):
                    paste_txt = ''
            except:
                pass
        else:
            paste_txt = paste_txt + os.linesep

        if paste_txt != '':
            self.validpastes += 1
            with open(file, 'a') as matching:
                matching.write(fn + '-' + timestamp + '-' + paste_url +
                               os.linesep)
            try:
                os.mkdir(directory)
            except KeyboardInterrupt:
                raise
            except:
                pass
            with open(directory + '/' + fn + '_' + timestamp.replace(
                    '/', '_').replace(':', '_').replace(' ', '__') + '_' +
                      paste_id.replace('/', '') + '.txt',
                      mode='w') as paste:
                paste.write(paste_txt)
Example #12
    def verify_token(self):
        for x in range(5):
            try:
                WebDriverWait(self.driver, wait).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-setting > div > div.card-body.d-flex.flex-column > div.overflow-container.flex-grow-1 > ul > li")))
                HTML = self.driver.find_elements_by_css_selector(".card-control")[
                    0].get_attribute("innerHTML")
                Doc = PQ(HTML)
                Doc = Doc('.list-group-item-action').text()
                Doc = Doc.replace(" ", "\n")
                Doc = Doc.split("\n")
                # print(Doc)
                path = Doc.index("表名")
                pathh = '//*[@id="dp_ads.' + Doc[path + 1] + '"]'
                self.driver.find_element_by_xpath(pathh).click()
                break
            except:
                time.sleep(3)
        return "dp_ads." + Doc[path + 1]
Example #13
def get_content(url, file_path=''):
    content_html = Pq(url=url)
    # grab the first card
    content = content_html('.content')[0]
    # grab the card body
    text_html = Pq(content).children('.txt')
    # if there are two text blocks, the post has an "expand full text" version; take the second, full-text one
    text = Pq(text_html[0]).text() if text_html.length == 1 else Pq(text_html[1]).text()
    # strip trailing boilerplate text, newlines and spaces
    text = text.replace('收起全文d', '').replace('O抽奖详情', '').replace('0网页链接', '').replace('\n', '').replace(' ', '')
    # for video posts, strip the trailing video-link text
    text = re.sub(r'L.*?的微博视频|L.*?的秒拍视频', '', text)
    txt = ' '.join(jieba.cut(text, cut_all=False, HMM=True))
    print(text)
    print(txt)
    print('\n')
    # if a file path was given, save the segmented text to that file
    if file_path != '':
        with open(file_path, "wb") as fp:
            fp.write(txt.encode())
Example #14
    def click_dataset(self, lan):
        # --- PyQuery → XPath ---
        for x in range(5):
            try:
                HTML = self.driver.find_elements_by_css_selector(".card-control")[
                            0].get_attribute("innerHTML")
                Doc = PQ(HTML)
                Doc = Doc('.list-group-item-action').text()
                Doc = Doc.replace(" ", "\n")
                Doc = Doc.split("\n")
                # print(Doc)
                path = Doc.index(lan)
                pathh = '//*[@id="dp_ads.' + Doc[path+1] +'"]'
                self.driver.find_element_by_xpath(pathh).click()
                break
            except:
                time.sleep(3)

        # cross-check the dimension conditions shown on the page
        WebDriverWait(self.driver, wait).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)")))
        check = self.driver.find_element_by_css_selector("body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)").get_attribute("innerText")
        return check 
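
Examples #12 and #14 pull a widget's innerHTML out of Selenium and parse it with PyQuery rather than walking it with locators. A static sketch of that text-splitting step, using a hypothetical list-group fragment in place of the live page:

from pyquery import PyQuery as PQ

# Hypothetical innerHTML of the ".card-control" widget.
html = """
<ul>
  <li class="list-group-item-action">表名 sales_daily</li>
  <li class="list-group-item-action">表名 users_all</li>
</ul>
"""

text = PQ(html)('.list-group-item-action').text()  # "表名 sales_daily 表名 users_all"
tokens = text.replace(" ", "\n").split("\n")
idx = tokens.index("表名")                           # find the "table name" label
print(tokens[idx + 1])                              # -> sales_daily
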
Example #15
File: test.py  Project: wdd257/moxing
def analysisPage(response):
    if response.status_code:
        global photos
        photos = re.findall(' zoomfile="(.*?)" ', response.text)  # image URLs
        folderName = PyQuery(response.text)("span#thread_subject").text()  # post title
        for ch in r'\/:|<>?*"':
            folderName = folderName.replace(ch, ' ⁂ ')  # replace characters that are illegal in file names
        formhash = PyQuery(
            response.text)("input[name='formhash']").attr('value')
        print(folderName)
        print(formhash)
        urlPay = PyQuery(
            response.text)("td[class='t_f'] ignore_js_op span a").attr('href')
        # print(urlPay,type(urlPay))
        aid, tid = re.findall(r'(\d+)', urlPay)
        downHtml(response, folderName)  # save the single page for convenient offline viewing
        return {
            'folderName': folderName,
            'formhash': formhash,
            'aid': aid,
            'tid': tid
        }
    print('The network is currently unavailable')
    return None
Example #16
    def run(self):
        '''
        Parse the page source.
        '''
        time.sleep(random.uniform(1.0, 3.6))
        try:
            pq = helper.get(self.url, myHeaders=self.headers)
            # product / model name
            name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()
            # colours and sizes
            # collect all the available sizes
            size_span_list = pq('div.product-sizes__options span.product-sizes__detail')
            size_price_list = []
            for size_span in size_span_list:
                size = PyQuery(size_span).find('span.product-sizes__size').text().strip()
                if 'K' in size or 'k' in size or '-' in size or 'XS' in size:
                    continue
                size = re.sub(r'[WwYyCc\*]', '', size)
                # some sizes are non-numeric; skip those as well
                if size in ('S', 'M', 'L', 'XL', 'XXL', 'XXXL', '', 'OS'):
                    continue
                price = PyQuery(size_span).find('span.product-sizes__price').text().strip()
                if price.startswith('$'):
                    price = price.replace('$', '').replace(',', '')
                    size_price_list.append({
                        'size': size,
                        'price': float(price),
                        'isInStock': True
                    })
                else:
                    size_price_list.append({
                        'size': size,
                        'price': 0.0,
                        'isInStock': False
                    })
            if len(size_price_list) < 1:
                return
            # colourway / style number
            number = ''
            # gender
            gender = 0
            # colourway description
            color_value = ''
            tr_list = pq('table#product-attribute-specs-table tr')
            for tr in tr_list:
                key = PyQuery(tr).find('th').text().strip()
                if key == 'Gender':
                    gender_txt = PyQuery(tr).find('td').text().strip()
                    if gender_txt == 'Mens':
                        gender = 1
                    elif gender_txt == 'Womens':
                        gender = 2
                elif key == 'Colorway':
                    color_value = PyQuery(tr).find('td').text().strip()
                elif key == 'Manufacturer Sku':
                    number = PyQuery(tr).find('td').text().strip()
            # print(name, number, self.url, size_price_list, gender, color_value)
            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)

            if not img_downloaded:
                img_url = pq('div.product-gallery-image > img')[0].get('src')
                # download the image
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
                if result == 1:
                    # upload to Qiniu
                    qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number, './imgs/stadiumgoods/%s.jpg' % number)
                img_downloaded = True
            mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], gender, color_value, 'stadiumgoods', '5b8f484b299207efc1fb0904', self.crawl_counter, img_downloaded=img_downloaded)
        except:
            global error_detail_url
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
            if error_counter < 3:
                self.q.put(self.url)
Example #17
    def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            tweets = PyQuery(json['items_html'])('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                usernameTweet = tweetPQ(
                    "span.username.js-action-profile-name b").text()
                # txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'));
                temp_txt = PyQuery(
                    re.sub(
                        r'</span><span class="js-display-url">|</span><span class="invisible">',
                        '', str(tweetPQ("p.js-tweet-text")))).text()
                txt = re.sub(
                    r"\s+", " ",
                    temp_txt.replace('# ',
                                     '#').replace('@ ',
                                                  '@').replace('…', '...'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                user_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet

                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec).strftime("%a %b %d %X +0000 %Y")
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = re.compile('(@\\w*)').findall(tweet.text)
                tweet.hashtags = re.compile('(#\\w*)').findall(tweet.text)
                tweet.geo = geo
                tweet.urls = urls
                tweet.pics = re.compile('(pic.twitter.com/\\S+)').findall(
                    tweet.text)
                tweet.author_id = user_id

                # links = tweet.urls + tweet.pics
                # for link in links:
                # 	txt = txt.replace(link, '')
                # tweet.text = txt
                #
                # temp_mentions = []
                # for m in tweet.mentions:
                # 	temp_mentions.append(m[1:len(m)])
                # tweet.mentions = temp_mentions
                # temp_hashtags = []
                # for h in tweet.hashtags:
                # 	temp_hashtags.append(h[1:len(h)].lower())
                # tweet.hashtags = temp_hashtags

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
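
Most of the tweet fields above are read from data-* attributes with .attr(). A self-contained sketch of that extraction against a hypothetical stream item:

from pyquery import PyQuery

# Hypothetical markup mimicking one stream item.
html = '''
<div class="js-stream-tweet" data-tweet-id="123" data-permalink-path="/someone/status/123">
  <span class="username js-action-profile-name">@<b>someone</b></span>
  <p class="js-tweet-text">hello world</p>
</div>
'''

tweetPQ = PyQuery(PyQuery(html)('div.js-stream-tweet')[0])
print(tweetPQ.attr("data-tweet-id"))         # -> 123
print(tweetPQ.attr("data-permalink-path"))   # -> /someone/status/123
print(tweetPQ("p.js-tweet-text").text())     # -> hello world
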
Example #18
def scan_proxy_qiaodm():
    """
    Scan proxy resources
    :return:
    """
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'

    header = {
        'Host':
        'ip.qiaodm.com',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    }

    s = requests.session()
    # fetch the page
    file_html = s.get(source_site).content

    # save to a file
    # with open('test.html', 'a') as f:
    #     f.write(file_html.encode('utf-8'))
    #
    # # read the previously saved page
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print 'Found %s records on this page' % len(tr_list)
    for tr_item in tr_list:
        # print Pq(tr_item).html()
        # print('---------------------')
        td_list = Pq(tr_item).find('td')
        # print '单条共 %s 列字段' % len(td_list)
        field_list = []
        for td_item in Pq(td_list):
            field = Pq(td_item).text()
            field_list.append(field)
            # print field
            # print('++++++++++++++++++')

        # special handling for the IP address
        ip = Pq(td_list).eq(0).html()
        # remove obfuscation markup
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        # strip remaining tags
        ip = html.strip_html(ip)
        # print ip
        # drop malformed IP addresses
        if len(ip.split('.')) != 4:
            continue

        # special handling for the port
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print 'Found new port: %s' % port_key
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()
        # print ProsyItem
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')
Example #19
File: a.py  Project: niu2x/novelpush
	server.sendmail(mailFrom, rcptToList, message.as_string())
	server.quit()

if '__main__' == __name__:
	configFile = 'config.cfg'
	novels = PyQuery(filename = configFile)
	message = ''
	for novel in novels('novel'):
		name = PyQuery(novel)('name').text()
		url = PyQuery(novel)('url').text()
		prefix = PyQuery(novel)('prefix').text()
		next = int(PyQuery(novel)('next').text())
		rcptToList = []
		for addr in PyQuery(novel)('emails>email'):
			rcptToList.append(PyQuery(addr).text())
		print rcptToList
		html = PyQuery(url = url)
		nextUrl = None
		for i in html('div.threadlist_title.pull_left.j_th_tit.member_thread_title_frs > a.j_th_tit'):
			if i.text.find(number2chinese(next)) != -1:
				nextUrl = prefix + PyQuery(i).attr('href')
				break
		if nextUrl:
			next += 1
			PyQuery(novel)('next').text(str(next))
			text = PyQuery(url=nextUrl)('cc:first > div:first').html()
			text = text.replace(u'<br/>', '\n').strip()
			subject = name + u' ' + u'第'+unicode(str(next))+u'章'
			send_mail('*****@*****.**', rcptToList, subject.encode('utf8'), text.encode('utf8'))
	open(configFile, 'wt').write(str(novels))
Example #20
File: proxy.py  Project: gasongjian/python
def scan_proxy():
    """
    Scan proxy resources
    :return:
    """
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'

    header = {
        'Host': 'ip.qiaodm.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    }

    s = requests.session()
    # fetch the page
    file_html = s.get(source_site).content

    # save to a file
    # with open('test.html', 'a') as f:
    #     f.write(file_html.encode('utf-8'))
    #
    # # read the previously saved page
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print 'Found %s records on this page' % len(tr_list)
    for tr_item in tr_list:
        # print Pq(tr_item).html()
        # print('---------------------')
        td_list = Pq(tr_item).find('td')
        # print '单条共 %s 列字段' % len(td_list)
        field_list = []
        for td_item in Pq(td_list):
            field = Pq(td_item).text()
            field_list.append(field)
            # print field
            # print('++++++++++++++++++')

        # special handling for the IP address
        ip = Pq(td_list).eq(0).html()
        # remove obfuscation markup
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        # strip remaining tags
        ip = html.strip_html(ip)
        # print ip
        # drop malformed IP addresses
        if len(ip.split('.')) != 4:
            continue

        # special handling for the port
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print 'Found new port: %s' % port_key
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()
        # print ProsyItem
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')