Example #1
class Pep8(object):
    starting = 'http://pep8online.com'
    form_action = 'http://pep8online.com/checkresult'

    def __init__(self):
        self.g = Grab()
        self.redis = StrictRedis(**REDIS_CFG)

    def pep8(self, tested_file='file.py', ignore_codes=None, ignore_type=''):
        return self.get_and_filter(self.query_file(tested_file), ignore_codes,
                                   ignore_type)

    def pep8s(self, tested_string, ignore_codes=None, ignore_type=''):
        if ignore_codes is None:
            ignore_codes = []
        return self.get_and_filter(self.query_string(tested_string),
                                   ignore_codes, ignore_type)

    def get_and_filter(self, result, ignore_codes=None, ignore_type=''):
        if ignore_codes is None:  # in fact it is never modified, but to be sure
            ignore_codes = []
        return filter(
            lambda problem: problem[0][:1] not in ignore_type and problem[0]
            not in ignore_codes, result)

    def query_file(self, tested_file):
        with open(tested_file) as f:
            content = f.read()
        return self.query_string(content)

    def query_string(self, tested_string):
        redis_key = md5.md5(tested_string).hexdigest()
        msg = 'Served from '
        starting = time.time()
        redis_value = self.redis.get(redis_key)
        if redis_value:
            msg += 'redis'
            problems = ujson.loads(redis_value)
        else:
            msg += 'pep8online.com'
            self.g.go(self.starting)  # except grab.error.GrabTimeoutError
            gff = self.g.doc.form_fields()
            gff['code'] = tested_string
            self.g.go(self.form_action, post=gff)
            problems = self.parse_response()
            self.redis.set(redis_key, ujson.dumps(problems))
        log_it = self.redis.get('config_log')
        if log_it is None or log_it == '1':
            msg += ', %.3f seconds.' % (time.time() - starting)
            print(msg)
        return problems

    def parse_response(self):
        problems = []
        for problem in self.g.doc.tree.cssselect('.tr-result'):
            tds = problem.findall('td')
            problems.append(
                (tds[0].text_content().strip(), tds[1].text.strip(),
                 tds[2].text.strip(), tds[3].text.strip()))
        return problems
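A minimal usage sketch for the class above (an assumption, not part of the original snippet): it presumes a reachable Redis instance described by REDIS_CFG, network access to pep8online.com, and that each result tuple is (code, line, column, message) as built in parse_response.

checker = Pep8()
# Check a code string, skipping every 'W' warning and the specific error E501.
problems = checker.pep8s('import os,sys\n', ignore_codes=['E501'], ignore_type='W')
for code, line, col, text in problems:
    print(code, line, col, text)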
Example #2
    def test_useragent_simple(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Simple case: setup user agent manually
        g.setup(user_agent='foo')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['user-agent'], 'foo')
Example #3
    def test_cookiefile(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Empty file should not raise Exception
        open(TMP_FILE, 'w').write('')
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)

        cookies = {'spam': 'ham'}
        json.dump(cookies, open(TMP_FILE, 'w'))

        # One cookie is sent in the server response
        # Another cookie is passed via the `cookiefile` option
        SERVER.RESPONSE['cookies'] = {'godzilla': 'monkey'}
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'spam=ham')

        # This is the correct result of combining two cookies
        MERGED_COOKIES = {'godzilla': 'monkey', 'spam': 'ham'}

        # g.config should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES.items()),
                         set(g.config['cookies'].items()))

        # The `cookiefile` file should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES.items()),
                         set(json.load(open(TMP_FILE)).items()))
Example #4
    def test_useragent_simple(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Simple case: setup user agent manually
        g.setup(user_agent='foo')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['user-agent'], 'foo')
Example #5
 def get_source_page(self, search_text):
     """Getting a source page by given search parameter"""
     grab = Grab()
     grab.go(u"https://ya.ru/")
     grab.doc.set_input(u"text", search_text)
     grab.doc.submit()
     return grab.response.body
Example #6
def get_data(url):
    '''
    Get data (prices and offer hrefs) from Yandex Realty using the client's parameters.
    '''
    #print(url)

    price_list = []
    href_list = []

    g = Grab()
    g.go(url)

    # search html class with price
    data_list = g.xpath_list('//*[@class="serp-item__price"]')
    total = 0
    for p in data_list:
        price = price_format(p.text_content())
        total += price
        price_list.append(price)

    # search html class with href
    data_list = g.xpath_list(
        '//*[@class="link link_redir_yes stat__click i-bem"]')
    for h in data_list:
        href_list.append(h.get('href'))

    if len(price_list) != 0:
        aver_price = total / len(price_list)
        return aver_price, href_list
    else:
        return 0, []
Example #7
def month_trips():
    # todo: how/where to get this cookie? need to build good userSearchConfiguration row
    from grab import Grab
    g = Grab()
    url = 'https://booking.pobeda.aero/AjaxMonthLowFareAvailaibility.aspx'
    headers = {
        'Cookie': 'ASP.NET_SessionId=21ct5f2osvv5y2bv1ixw3n1f; \
               CultureCode=%7b%22Value%22%3a%22ru-RU%22%7d; \
               skysales=!Nnf0/LJnRyPnqRB26j6ok5cv2bHz0cdeINHGmEqHIJBCFRJfwAK5n253Fan5AJd7uagOX1WR3QCitE8=; \
               PassengersInfoCookie=%7b%22Value%22%3a%22%22%7d; \
               dtSa=-; \
               userSearchConfiguration=%7B%22From%22%3A%22VKO%22%2C%22InboundDate%22%3A%222017-12-11%22%2C%22To%22%3A%22SGC%22%2C%22OutboundDate%22%3A%222017-12-08%22%2C%22MinADT%22%3A0%2C%22MinCHD%22%3A0%2C%22MinINFT%22%3A0%2C%22SelectedADT%22%3A%221%22%2C%22SelectedCHD%22%3A%220%22%2C%22SelectedINFT%22%3A%220%22%2C%22MaxPax%22%3A0%2C%22TripType%22%3A%22RoundTrip%22%2C%22LinkBooking%22%3Anull%2C%22MinDepartureDate%22%3Anull%2C%22MaxDepartureDate%22%3Anull%2C%22MinArrivalDate%22%3Anull%2C%22MaxArrivalDate%22%3Anull%2C%22Culture%22%3A%22ru%22%2C%22CurrencyCode%22%3A%22RUB%22%2C%22Success%22%3Atrue%2C%22AnyFieldWithData%22%3Afalse%7D; \
               dtPC=3$496961232_348h-vCRDIHUPXBDJPBHJGJILFKIAEOODOHLHSOQ; \
               dtCookie=3$2B3EAF72CD66D7CD233F16EA2C693C8A|RUM+Default+Application|1; \
               rxVisitor=1511890404724BSGLKFH5U7NNCUC0MQG38QMSEJ0S3PV1; \
               rxvt=1512498768913|1512496961238; \
               dtLatC=2; \
               sessionControl=%7B%22ownership%22%3A%7B%22sessionOwnerId%22%3A%22213d9100-e07d-85d5-8342-7bab90906705%22%2C%22sessionOwnerPage%22%3A%22https%3A%2F%2Fbooking.pobeda.aero%2FScheduleSelect.aspx%22%2C%22lastUpdated%22%3A1512497216699%7D%7D',
        'Origin': 'https://booking.pobeda.aero',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Referer': 'https://booking.pobeda.aero/ScheduleSelect.aspx',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive'
    }
    data = 'indexTrip=1&dateSelected=2017-12-08'

    g.go(url, headers=headers)  # , get=data)
    g.doc.cookies.items()
    """
Example #8
    def f(self):
        g = Grab(log_file='out.log')
        g.go(BASE_URL)

        c = CCmbParser()
        c.soup = g.doc.body
        g.doc.set_input('search_type', 'ADDRESS')
        c.soup = g.doc.body
        a = c.get_subjects()
        self.fillComboBox(self.cmbSubject, a)
        g.doc.set_input('subject_id', '130000000000')
        g.doc.set_input('region_id', '145286000000')
        g.doc.set_input('settlement_id', '145298578000')
        #g.doc.set_input('subject_id', '130000000000')
        #g.doc.set_input('124000000000', 'checked="true"')
        c.soup = g.doc.body
        b = c.get_regions()
        c1 = c.get_street_type()
        d = c.get_city_types()
        e = c.get_cities()
        # Look for the word "Новости" (News) on the page
        # print(u"На этой странице есть слово \"Новости\"? %s" % u'Да' if g.doc.text_search(u'Новости') else u'Нет')
        # print the page title
        print(u"Заголовок страницы: '%s'" % g.doc.select('//title').text())

        g.doc.set_input('search_type', 'ADDRESS')

        f = g.doc.submit()
        print 'zi'
        pass
Example #9
def download():
    url=input('url:')
    print('ddd')
    pool = ThreadPool(4)
    main_page=Grab()
    main_page.go(url)
    title=main_page.doc('//*[@id="novel_color"]/p').text()
    if os.path.isdir('temp'):
       shutil.rmtree('temp')
    os.mkdir('temp')
    path='temp/'
    main_page.doc.save(path+'main.html')
    urls_xpath = main_page.doc('//*[@id="novel_color"]/div/dl/dd/a')
    i=1
    urls=[]
    dict={}
    for url in urls_xpath:
        urls.append('http://novel18.syosetu.com'+ url.attr('href'))
        key= url.attr('href').split('/')[2]
        dict[key]=url.text().replace('/',' ')
    print(dict)
    def pages(page_url):
        num=page_url.split('/')[4]
        f = urllib.request.urlopen(page_url)
        data = f.read()
        with open(path+'%04u.%s.html'%(int(num),dict[num]), "wb") as code:
            code.write(data)
        print(num,dict[num])
    pool.map(pages,urls)
Example #10
def match_history(first_team, second_team, n_last_match=5):
    """
    Return a table of the n_last_match most recent
    matches between the teams first_team and second_team.
    """
    URI = "http://www.sportzone.ru/sport/search.html?com1={0}&com2={1}&sport=1&rows={2}&search=%CF%EE%E8%F1%EA".format(urllib.quote_plus(first_team.encode('cp1251')), urllib.quote_plus(second_team.encode('cp1251')), n_last_match)
    g = Grab(log_file="/tmp/graber.log")
    g.go(URI)
    if g.search(u'Ранее не встречались'):
        return []
    else:
        try:
            all_table = g.xpath('/html/body/table[4]/tr/td[2]/table')
        except IndexError:
            return []
        answer = []
        for y in all_table:
            a_temp = []
            for x in y:
                if x.text == None:
                    temp = x.xpath('a')
                    if len(temp) > 0 and temp[0].text != None:
                        a_temp.append(temp[0].text)
                else:
                    a_temp.append(x.text)
            answer.append(a_temp)
    return answer
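A minimal call sketch for match_history (hedged: Python 2, the team names below are placeholders, and it assumes sportzone.ru is reachable; the function encodes the names to cp1251 for the query string):

rows = match_history(u'Спартак', u'Зенит', n_last_match=3)
for row in rows:
    print u' | '.join(row)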
Example #11
def parse_find(html):
    g = Grab()
    g.setup(timeout=30, connect_timeout=20)
    g.go(html)
    soup = BeautifulSoup((g.response.body), 'lxml')
    information = soup.find_all('div', {'class' : 'externalRating-item'})
    reload_name = []
    for i in list(information):
        reload_name.append((str(i.text)))
    list_reit= str(reload_name).split(',')
    reit = {}
    reit['IMDb'] = list_reit[1][1:5]
    if reit['IMDb'] == "']":  # the rating block on the page was empty
        reit['IMDb'] = ' Отсутствует'

    name = str(soup.find('h1', {'class': 'view__title'}).text).replace('\n','').strip().replace('\xa0', ' ')
    actors = str(soup.find_all('ul', {'class' : 'items'})[0].text).replace(' \n\n',', ')
    producers = str(soup.find_all('ul', {'class' : 'items'})[1].text).replace('\n\n','')
    year =soup.find_all('div', {'class' : 'infoi__content'})[1].text
    year=str(year).replace('\n','').replace(' ','')
    genre = soup.find_all('div', {'class' : 'infoi__content'})[3].text
    genre = str(genre).replace(' ','').replace('\n',' ')
    inf_ab_film = {}
    inf_ab_film['name'] = name
    inf_ab_film['genre'] = genre
    inf_ab_film['age'] =year
    act = str(producers) + str(actors).replace('\n','')
    inf_ab_film['actors'] =  act
    inf_ab_film['reit'] = str(reit['IMDb'])[1:]
    return inf_ab_film
Example #12
def loadPage(url, i):
    g = Grab()
    g.setup(log_dir='grab')
    g.setup(timeout=150, connect_timeout=100)
    # g.setup(proxy='186.170.31.134:8080', proxy_type='http')
    # qs = urlencode({'samo_action':'PRICES',
    # 'TOWNFROMINC':'101',
    # 'STATEINC':'33',
    # 'TOURTYPE':'0',
    # 'TOURINC':'963',
    # 'CHECKIN_BEG':'20160721',
    # 'NIGHTS_FROM':'2',
    # 'CHECKIN_END':'20160722',
    # 'NIGHTS_TILL':'10',
    # 'ADULT':'2',
    # 'CURRENCY':'1',
    # 'CHILD':'0',
    # 'TOWNTO_ANY':'1',
    # 'TOWNTO':'',
    # 'STARS_ANY':'1',
    # 'STARS':'',
    # 'hotelsearch':'0',
    # 'HOTELS_ANY':'1',
    # 'HOTELS':'',
    # 'MEAL':'',
    # 'FREIGHT':'0',
    # 'FILTER':'0',
    # 'HOTELTYPES':'',
    # 'PACKET':'1',
    # 'PRICEPAGE':i})
    # g.go(url + qs)
    g.go(url)
    body = g.response.body
    response = body[body.find('<table'):len(body)]
    return response
Example #13
def download_manga(link, path, zipping_type):
    g = Grab()
    g.go(link, log_file="manga.html")

    for item in g.doc.select('//div[@class="expandable chapters-link"]//table//a/@href'):
        chapter = link[:link.rfind('/')] + item._node + '?mature=1'
        download_chapter(chapter, path, zip=zipping_type)
Example #14
def main():
    print('\n-- Парсинг афиши Тетра оперы и балета -  ' + str(datetime.datetime.now()))
    opera = Grab(document_charset='utf-8', timeout=20, connect_timeout=20)
    opera.go('http://komiopera.ru/index.php?option=com_content&view=article&id=95&Itemid=134')
    #opera.response.body = clean_text(opera.response.body, 'normal')

    dates = opera.doc.select('//table//table//tr/td[1]/div/b')
    titles = opera.doc.select('//table//table//tr/td[2]/div/b')
    contents1 = opera.doc.select('//table//table//tr/td[2]/div/i')
    contents2 = opera.doc.select('//table//table//tr/td[3]/div/b')
    times = opera.doc.select('//table//table//tr/td[3]/div')

    date_for_db = data_change(dates[0].text(), 'komiopera')    
    exist_date_event = last_date_event('komiopera', date_for_db)
    for date, title, content1, content2, time in zip(dates, titles,contents1, contents2, times):
        if exist_date_event.count(data_change(date.text(), 'komiopera')):
            print(data_change(date.text(), 'komiopera') + ' уже есть')
        else:
            event = {
                'name': title.text().strip(),
                'date': data_change(date.text(), 'komiopera'),
                'time': time.text()[-5:],
                'type_event': 'teatr',
                'type_film': '',
                'price': 0,
                'source_id': 6,  # Komi opera
                'description': content1.text().strip() + ', ' + content2.text().strip(),
                'poster': ''
            }
            write_event_to_db(event)
Example #15
def parse():

    g = Grab()
    base_url = 'https://www.buzzfeed.com'
    appendix_1 = '/?p='
    topics = ['world', 'politics', 'business', 'lgbt', 'tech', 'science', 'music', 'animals', 'travel', 'style', 'sports']

    data = {}
    for topic in topics:
        articles_list = []
        for page in range(1, 10):
            time.sleep(0.2)
            g.go(base_url + '/' + topic + appendix_1 + str(page))
            urls = getPageUrls(g.response.body)
            for url in urls:
                g.go(base_url + url)
                article = getArticle(g.response.body)
                if len(article) > 1:
                    articles_list.append(article)
        data.update({topic: articles_list})

    data_size = 0
    for topic in data.keys():
        data_size += len(data[topic])
    print "{} articles in {} topics".format(data_size, len(data))
Example #16
    def proxy_validation(proxy: str, ptype: str, web_site: str = 'https://whoer.net/') -> bool:
        """
        Check whether the proxy is alive.
        :param proxy: proxy in IP:PORT form
        :param ptype: proxy type
        :param web_site: site that will be loaded to test the connection
        :return:
        """
        if proxy is None:
            return True

        try:
            from grab import Grab
            from grab import GrabError
        except ImportError:
            print('For validation proxy needed Grab!')
            return True

        g = Grab()
        g.setup(proxy=proxy,
                proxy_type=ptype,
                connect_timeout=5,
                timeout=60)

        try:
            print('Starting check proxy url={!r}'.format(web_site))
            g.go(web_site)
        except GrabError:
            print('Proxy {!r} is dead'.format(proxy))
            return False
        else:
            print('Proxy {!r} is live'.format(proxy))
            return True
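A minimal call sketch for proxy_validation (hedged: the snippet above looks like a static helper on a class, so it is called here as a plain function; the proxy address is a placeholder):

alive = proxy_validation('127.0.0.1:3128', 'http')
print('proxy is alive' if alive else 'proxy is dead')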
Example #17
def add_advert():
    print("Add new advertisement.")
    g = Grab(log_file="2.html")
    g.load_cookies('cookies.txt')
    g.go("http://m.avito.ru/add")
    #login_test()
    from selenium.webdriver import Firefox
    from selenium.webdriver.common.keys import Keys
    import selenium
    from PIL import Image

    browser = Firefox()
    driver = selenium.webdriver.Firefox()

    browser.get('http://m.avito.ru/profile/login')
    driver.implicitly_wait(10)
    elem = driver.find_element_by_css_selector(".control-self.control-self-email")
    elem.send_keys("*****@*****.**")
    """
    driver.find_element_by_name("password")
    element.send_keys("ivveqaem")
    driver.find_element_by_class_name("control-self control-self-submit button button-solid button-blue button-large")
    driver.find_element_by_partial_link_text("Войти")
    element.send_keys(Keys.ENTER)
"""
    #browser.get('http://m.avito.ru/add')
    browser.save_screenshot('current_page')
    current_page_img = Image.open('current_page')
    w, h = current_page_img.size
    captcha_img = current_page_img#.crop((575, 505, w-155, h-1820))
    captcha_img.save('captcha.jpg', 'jpeg')
Example #18
def scan_it_remote(request):
    if request.user.is_admin:
        siteremote = "http://hotline.gowius.com/addtask/"
        site = '20k.com.ua'

        id = int(request.POST.get('id', 0))

        category = Category.objects.filter(pk=id)

        if category:

            cat = category.get()

            items = ColorProduct.objects.filter(product__category=cat)
            result = []
            # scan = ScanHotline(category=cat, items=items)
            # scan.save()
            for item in items:
                if item.href and item.price > 0 and item.hrefok:
                    result.append({'id': item.id, 'url': item.href})

            g = Grab()
            g.setup(post={'url': site, 'items': json.dumps(result)})
            g.go(siteremote)

    return HttpResponse('ok')
Example #19
        def get_mag(link):
            idmag = link.split('/')

            # assert False, idmag

            firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
            if firm:

                self.stdout.write("Firm exists:" + firm[0].name)
                return firm[0]
            else:
                fg = Grab(log_dir="/tmp", timeout=30)
                fg.go('http://hotline.ua' + link)
                body = fg.response.unicode_body()
                pyquery = pq(body)
                name = pyquery('ul.shop-title > li > h1').text()
                try:
                    link = pyquery(
                        'ul.info-shops > li > p > a')[0].attrib['href']
                except:
                    link = ""

                firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
                firm.save()

                self.stdout.write("New Firm:" + firm.name)
                return firm
Example #20
def get_weather(message):
    """."""
    cities = {'dnepr': 'dnipropetrovsk-5077',
              'kyiv': 'kyiv-4944',
              'lviv': 'lviv-4949'}
    g = Grab()

    try:
        (city, date) = message.split('-')
    except ValueError:
        return 'Wrong data'

    try:
        g.go('https://www.gismeteo.ua/weather-{city}/'.format(
             city=cities[city]))
    except KeyError:
        return 'Wrong city'
    select_now = g.doc.select('//div[@id="weather"]'
                              '//div[contains(@class, "temp")]'
                              '//dd[contains(@class, "c")]').text()
    select_tommorow = ''.join([
        g.doc.select('//div[@id="tab_wdaily2"]'
                     '//div[contains(@class, "temp")]'
                     '//span[contains(@class, "c")]').text(),
        ' - ',
        g.doc.select('//div[@id="tab_wdaily2"]'
                     '/div[contains(@class, "temp")]'
                     '/em/span[contains(@class, "c")]').text()])

    temperature = select_tommorow if date == '2' else select_now

    return temperature.encode('utf-8')
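A minimal call sketch for get_weather; the message is a 'city-date' string where the city must be one of the keys of the cities dict and a date of '2' selects tomorrow's forecast (assumes gismeteo.ua is reachable):

print(get_weather('kyiv-2'))  # tomorrow's temperature in Kyiv
print(get_weather('lviv-1'))  # current temperature in Lviv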
Example #21
def main():
    added = 0
    limit = 100
    count = 1
    stack = [base_url]
    doc_links = open("Links.txt", "w")

    for site in stack:
        stack_link = site
        print(str(added) + " - " + stack_link)

        g = Grab(log_file='html.html')
        g.go(stack_link)
        time.sleep(0.1)

        doc_file = open("docs/%i.txt" % added, "w")
        doc_file.write(g.doc.select("//*").text())
        doc_file.close()
        doc_links.write(str(added) + " - " + stack_link + '\n')

        added += 1
        if count == limit:
            continue
        for link in g.doc.select('//a'):
            try:
                href = base_url + link.attr("href")
            except DataNotFound:
                continue
            if count == limit:
                break
            if stack.count(href) == 0 and is_link_valid(href):
                stack.append(href)
                count += 1

    doc_links.close()
Example #22
def param():
    from grab import Grab
    KOL = 1  # number of script iterations
    for m in range(KOL):
        g = Grab()
        g.go('http://192.168.100.6:10002/login/WebVision/ses_Fithness/')
        g.set_input('user', 'root')
        g.set_input('pass', 'GfhjkmKf;f123')
        g.submit()
        g.go(
            'http://192.168.100.6:10002/WebVision/ses_Fithness4/pg_so/pg_6/pg_mn/pg_1?com=attrsBr&tm'
        )
        name1, name2, full, mass1 = [], [], [], []
        for elem in g.doc.select('//w[*]/el[@id="text"]'):
            name1.append(elem.text())  # refrigeration unit: parameter names
        for elem in g.doc.select('//w[*]/el[@id="arg0val"]'):
            mass1.append(elem.text())  # refrigeration unit: values
    global full2
    name1 = [x for x in name1 if x != '%1']  # drop unneeded values
    name1 = (name1[2], name1[3], name1[8], name1[9], name1[29], name1[49],
             name1[32], name1[33], name1[4], name1[31], name1[60]
             )  # collect the parameter names
    mass1 = (mass1[0], mass1[1], mass1[4], mass1[5], mass1[9], mass1[30],
             mass1[10], mass1[11], mass1[3], mass1[12], mass1[39]
             )  # collect the values
    full = [a + " = " + str(b)
            for a, b in zip(name1, mass1)]  # combine the two lists into one
    return full
Example #23
def get_links(page, grab_=None):
    if grab_ is None:
        grab_ = Grab()
    grab_.go(page)
    return [
        'http://tagbrand.com%s' % link.attr('href')
        for link in grab_.doc.select('//dl[*]/dd/p[1]/a')]
Example #24
class UltimateRewardsGrabber:

    def __init__(self):
        self.g = Grab()

    def grab(self):
        self.g.go(BASE_URL)
        divs = self.g.doc.select('//div[contains(@class, "mn_srchListSection")]')
        for div in divs:
            try:
                merchants = div.text().split('/$')
                for merchant in merchants:
                    merchant = merchant.split('Details ')[1]
                    title = ' '.join(merchant.split(' ')[:-2])
                    cost = merchant.split(' ')[-2]
                    print title, ' - ', cost
            except IndexError:
                pass
            merchant = models.Item(title=title, cost=cost)
            db.session.add(merchant)
        db.session.commit()


    def save(self):
        pass
Example #25
def transya():
    # Translate to Russian. Thanks, Yandex Translate! :)
    g_translate_img_alt = Grab()
    g_translate_img_title = Grab()
    g_translate_description = Grab()
    count = 0
    max_index = len(Topics.href)
    for count in range(9):
        # Translate the image description
        resp_front_img_alt_ru = json.loads(g_translate_img_alt.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.curi_tr_get_topic_front_img_alt()[count], 'lang': 'en-ru',
                  'format': 'plain'}).unicode_body(ignore_errors=True, fix_special_entities=True))
        Topics.front_img_alt_ru.append(str(resp_front_img_alt_ru))
        # Translate the post titles
        resp_title_ru = json.loads(g_translate_img_title.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.curi_tr_get_topic_title()[count], 'lang': 'en-ru', 'format': 'plain'}).unicode_body(
            ignore_errors=True, fix_special_entities=True))
        Topics.title_ru.append(str(resp_title_ru["text"]))
        # Translate the extended post description
        resp_descriptions_ru = json.loads(g_translate_img_alt.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.get_descriptions()[count], 'lang': 'en-ru', 'format': 'plain'}).unicode_body(
            ignore_errors=True, fix_special_entities=True))
        Topics.descriptions_ru.append(str(resp_descriptions_ru["text"]))
        count = count + 1
Example #26
def SaveImageYandex(text, imageCount, path, w='800', h='600'):
    global prefix
    prefix += 1
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True', log_dir='log', headers={'Accept-Language':    'ru,en;q=0.8'})
    query = urllib.urlencode({'text': text.encode('utf-8'), 'iw': w, 'ih': h})
    url = 'http://images.yandex.ru/yandsearch?isize=gt&itype=jpg&'+query
    g.go(url)
    image_number = 0
    f2 = open('out.txt', 'a')
    filename = str(prefix) + '-' + StringForFilename(text) + '.jpg'
    f2.write(filename + '\n')
    f2.close()
    while image_number < imageCount:
        image_number += 1
        tmp = g.doc.select('//html/body/div[2]/div/div[2]/div[2]/div[1]/div[contains(@class, "b-images-item")]['
                           + str(image_number) + ']').attr('onclick')
        match = re.search(r'"fullscreen":\{"url":"(.*?)"', tmp)
        if match:
            image_URL = match.group(1)
            print str(image_number) + '. ' + image_URL
            ext = GetFileExtFromURL(image_URL)
            filename = str(prefix) + '-' + StringForFilename(text) + '-' + str(image_number) + '.jpg'
            try:
                patht = os.path.join(path, filename)
                print patht
                urllib.urlretrieve(image_URL, patht)
            except:
                pass
        else:
            print 'Cant find image for this query ' + str(image_number)
Example #27
def just_print(mark):
    g = Grab()
    g.go(mark.url)
    body = g.doc.tree
    title = body.xpath('//*/head/title/text()')
    description = body.xpath('//*/meta[@name="description"]/@content')

    if title == []:
        title = u'Странно, но заголовок отстутствует'
    else:
        title = title[0]

    if description == []:
        description = body.xpath('//*/meta[@property="og:description"]/@content')
        if description == []:
            description = u'Описание отсутствует'
        else:
            description = description[0][0:200]
    else:
        description = description[0][0:200]

    p = re.compile("(.*\.\w{2,3})/")
    res = p.findall(mark.url)[0]
    favicon = res+'/favicon.ico'
    print('message from task')

    mark.title = title
    mark.description = description
    mark.favicon = favicon
    mark.save()
Example #28
def prepare_and_create_grab(url):

    cache_name = split_url_by_volume_and_chapter(url)
    dir_name = cache_name[0]
    file_name = cache_name[1] + '.html'
    file_path = os.path.join(generate_info_ranobe.DIR_RANOBE, 'cache', dir_name, file_name)
    data = None

    if not os.path.exists(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))

    if not os.path.exists(file_path):
        g = Grab()
        g.go(url)
        with open(file_path, mode='w', encoding='utf8') as f:
            text = g.response.body
            f.write(text)
            if not data:
                data = text

    if not data:
        with open(file_path, encoding='utf8') as f:
            data = f.read()

    return Grab(data)
Example #29
def get_mag(link):
    idmag = link.split('/')

    firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
    if firm:

        print("Firm exists:" + firm[0].name)
        return firm[0]
    else:
        fg = Grab(log_dir="/tmp", timeout=300)

        fg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                          "/proxy1.txt",
                          source_type='text_file',
                          proxy_type='http',
                          auto_change=True)
        fg.go('http://hotline.ua' + link)
        body = fg.response.body
        pyquery = pq(body)
        name = pyquery('ul.shop-title > li > h1').text()
        try:
            link = pyquery('ul.info-shops > li > p > a')[0].attrib['href']
        except:
            link = ""

        firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
        firm.save()

        print("New Firm:" + firm.name)
        return firm
Example #30
    def rozparse(self):
        g = Grab()
        names = []
        prices = []
        count = 1
        paginator = []
        res = {}
        g.go('http://rozetka.com.ua/stabilizers/c144719/')

        for i in g.doc.select('//ul[@name="paginator"]/li[@class="paginator-catalog-l-i"]/a'):
            paginator.append(i.text())


        while count < (int(paginator[-1])+1):
            g.go('http://rozetka.com.ua/stabilizers/c144719/page=' + str(count) + '/')
            for title in g.doc.select('//div[@class="g-i-tile-i-box-desc"]/div[@class="g-i-tile-i-title clearfix"]'):
                names.append(title.text())
            for i in g.doc.select('//div[@class="g-i-tile-i-box-desc"]'):
                prices.append(json.loads(parse.unquote(i.text().split('"')[1]))['price'])
            print(count)
            count += 1

        for key, value in zip(names, prices):
            res[key] = value


        print(res['Электромир Volter СНПТО 18пт'])
        return print(res)
Example #31
 def test_body_maxsize(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(body_maxsize=100)
     SERVER.RESPONSE['get'] = 'x' * 1024 * 1024
     g.go(SERVER.BASE_URL)
     # Should be less than 50 KB
     self.assertTrue(len(g.response.body) < 50000)
Example #32
def get_course_gold():
    url = "https://pwcats.info/servers/scorpio"
    g = Grab()
    g.go(
        url,
        user_agent=
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 '
        'YaBrowser/17.11.1.990 Yowser/2.5 Safari/537.36')
    pay_list = g.doc.select(
        '/html/body/div[1]/div/div/div[2]/aside/table[1]/tbody/tr[*]/td[1]/text()'
    ).node_list()
    sale_list = g.doc.select(
        '/html/body/div[1]/div/div/div[2]/aside/table[1]/tbody/tr[*]/td[2]/text()'
    ).node_list()
    # print(pay_list[0])
    # print(pay_list[0].replace(' ', ''))
    for i in range(0, pay_list.__len__()):
        string = pay_list[i].replace(' ', '')
        string = string.replace('\n', '')
        index = string.find('(')
        pay_list[i] = string[0:index]
    for i in range(0, sale_list.__len__()):
        str_sale = sale_list[i].replace(' ', '')
        str_sale = str_sale.replace('\n', '')
        index = str_sale.find('(')
        sale_list[i] = str_sale[0:index]
    return "Продают по " + min(pay_list) + '\nCкупают по ' + max(sale_list)
Example #33
def login_test():
    g = Grab(log_file="1.html")
    g.go("http://m.avito.ru/profile")
    g.doc.set_input("login","login")
    g.doc.set_input("password","password")
    g.doc.submit()
    g.cookies.save_to_file('cookies.txt')
Example #34
def start():

    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found == None:

        try:
            print('Page', page)

            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select('//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []

            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])

            pool = ThreadPool(THREADS)

            pool.map(pitchfork, albums_on_page)

            page += 1

            # if page > 1:
            #   page_not_found = True

        except IndexError as error:
            print(error)
            page_not_found = True
Example #35
def parse(last_page=1):
    i = 1
    print('Last page is {0}'.format(last_page))
    for x in range(1, last_page + 1):
        main_domain = 'http://4pda.ru/page/{0}/'.format(x)

        g = Grab()
        g.go(main_domain)
        nodes = g.doc.select('//article[@class="post"]').node_list()
        if nodes:
            try:
                f = open('4pda.csv', 'x')
                writer = csv.writer(f)
                writer.writerow(['№', 'Заголовок', 'Дата публикации', 'Ссылка'])
            except FileExistsError:
                f = open('4pda.csv', 'a')
                writer = csv.writer(f)
            finally:
                for n, node in enumerate(nodes):
                    header = node.xpath('//div[@class="description"]//h1//span')
                    links = node.xpath('//div[@class="description"]//h1//a')
                    dates = node.xpath('//div//div//div//em')
                    writer.writerow([
                        i,
                        header[n].text,
                        dates[n].text,
                        links[n].attrib['href']
                    ])
                    i += 1
                f.close()
                print(x)
        else:
            return 'Posts not found'
    return 'Job done.'
Example #36
def getModelLink(modelName):
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True', log_dir='log', headers={'Accept-Language':    'ru,en;q=0.8'})
    url = 'http://market.yandex.ru/'
    g.go(url)
    try:
        paginatorHTML = g.doc.select(popt['pagination']).html()
        pagesLinks = GetAllLinksFromString(paginatorHTML, url)
    except:
        pagesLinks = []
    pagesLinks.append(url)
    pagesLinks = list(set(pagesLinks))
    pagesCount = pagesLinks.__len__()
    newPagesCount = 1
    while pagesCount != newPagesCount:
        lastPage = pagesLinks.__len__() - 1
        url = pagesLinks[lastPage]
        g.go(url)
        try:
            paginatorHTML = g.doc.select(popt['pagination']).html()
            newlinks = GetAllLinksFromString(paginatorHTML, url)
        except:
            newlinks = []
        for newlink in newlinks:
            pagesLinks.append(newlink)
        pagesLinks = list(set(pagesLinks))
        newPagesCount = pagesLinks.__len__()
    return pagesLinks
Example #37
def get_every_day():
    global caption
    global date_post
    url = "https://pp.userapi.com/"
    g = Grab()
    g.go(
        "https://vk.com/skorpw",
        user_agent=
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 '
        'YaBrowser/17.11.1.990 Yowser/2.5 Safari/537.36')
    # list = g.doc.body.decode('cp1251')
    try:
        image = g.doc.select(
            './/*[@id="public_wall"]/*[@id="page_wall_posts"]/div/div/div[2]/div[1]/div[1]/div[1]/div[2]/a[@aria-label]/@onclick'
        )[0].text()
        caption = 'Ежа'
        date_time = datetime.datetime.now()
        date_post = date_time.date()
        json_string = get_indexes(image)
        res = json.loads(json_string)
        result = res['temp']['y']
        url_image = result
        #url_image=result[0]
        #url_image="http://www.kartinki.me/pic/201506/1920x1200/kartinki.me-21699.jpg"
        return url_image
    except IndexError:
        return None
Example #38
    def process_news_list_page(self, url):
        """Обработка страницы со ссылками на новости"""
        print('Trying open url: {}'.format(url))
        g = Grab(proxy='127.0.0.1:{}'.format(self.socks_port),
                 proxy_type='socks5',
                 timeout=90,
                 connect_timeout=30)
        try:
            g.go(url)
        except GrabNetworkError as e:
            # Connection error
            print('Connection error: {}'.format(e))
            self.change_identity()  # Switch to a new identity
            return url

        # Check the response code, just in case
        if g.response.code != 200:
            print('Error code: {}'.format(g.response.code))
            self.change_identity()
            return url

        # Check whether we have hit a captcha
        captcha = g.doc.select(self.captcha_xpath).text_list()
        if captcha:
            print('Captcha found: {}, setting new identity'.format(captcha[0]))
            self.change_identity()  # Switch to a new identity
            return url

        # Search for links to news articles
        news_links = g.doc.select(self.links_xpath).text_list()
        for news_link in news_links:
            founded_url = news_link.replace(
                re.findall('.+//[^/]+', news_link)[0],
                '')  # because the URL must be passed without the domain for downloading
            pattern = self.config.get('detailed_page_re')
            match = re.search(
                pattern, founded_url
            )  # extra check that the extracted URL matches the regular expression
            if match:
                print('Found link: {}, from {}'.format(founded_url, self.url))
                # Hand it off to the download queue
                retrieve_information.apply_async(args=(self.config,
                                                       founded_url),
                                                 kwargs={
                                                     'related_from_url': None,
                                                     'region': self.region_name
                                                 },
                                                 priority=251)

        # Look for the link to the next page
        next_page = g.doc.select(self.next_page_xpath).text_list()
        if next_page:
            if self.requests_delay is not None:
                sleep(
                    self.requests_delay)  # so we are not too aggressive
            # Move on to the next page
            return '{}{}'.format(self.base_url, next_page[0])

        # No error occurred and no link to a next page was found
        return False
Example #39
def parse_data_from_url(report_id):
    logging = get_task_logger('parse')
    locale.setlocale(
        locale.LC_TIME, 'ru_RU.UTF-8'
    )  # Haven't checked yet how Django itself handles the locale; will test later
    report = Report.objects.get(id=report_id)
    report.pages_amount = get_pages_amount(report_id)
    report.title = get_title(report_id)
    weekly = json.loads(report.weekly)
    hourly = json.loads(report.hourly)
    for page in range(1, report.pages_amount + 1):
        url = report.assemble_url() + '?page={page}'.format(page=page)
        g = Grab(log_file='page_out.html')
        try:
            g.go(url)
        except GrabTimeoutError:
            continue
        page_urls = g.doc.select(
            '//*[@id="offers_table"]//*[@data-cy="listing-ad-title"]/@href'
        ).text_list()
        for page_url in page_urls:
            dt = get_page_datetime(page_url)
            if dt:
                hourly[str(dt.hour)] += 1
                weekly[dt.strftime('%A')] += 1
    report.hourly = json.dumps(hourly)
    report.weekly = json.dumps(weekly)
    report.save()
    report.send_mail()
Example #40
    def check_following(self, url, token_id, loyalty_id):
        follow = False
        self.refresh_token(token_id)
        soc_token = SocToken.query.get(token_id)
        action = PaymentLoyalty.query.get(loyalty_id)
        target = json.loads(action.data)

        g = Grab()
        g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
        url_api = self.API_PATH + self.API_PARTS['subscriptions']

        while not follow:
            g.go(url_api)
            subscriptions = json.loads(g.response.body)

            if 'items' not in subscriptions:
                break
            if len(subscriptions['items']) <= 0:
                break

            for subscribe in subscriptions['items']:
                if 'snippet' in subscribe and 'channelId' in subscribe['snippet'] and subscribe['snippet']['channelId'] == target['channelId']:
                    follow = True

            if 'nextPageToken' not in subscriptions:
                break
            if len(subscriptions['nextPageToken']) <= 0:
                break

            url_api = "%s%s&pageToken=%s" % (
                self.API_PATH,
                self.API_PARTS['subscriptions'],
                subscriptions['nextPageToken'])

        return follow
Example #41
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
Example #42
def parse_famous(year, month, day):
    '''
    Parse famous people from famousbirthdays.com by month and day.
    The year is ignored for now.
    '''
    months = get_months()
    url = 'http://www.famousbirthdays.com/%s%d.html' % (months[month], day)

    g = Grab()
    g.setup()
    g.go(url)

    elements = g.doc.select('//ul[@class="top-celebrity-col4 col1"]/li')
    list = []

    for element in elements:
        src = element.node.getchildren()[1].getchildren()[0].getchildren()[0].get('src')
        age = element.node.getchildren()[2].getchildren()[0].text_content().split(' ')[-1]
        name = element.node.getchildren()[2].getchildren()[0].getchildren()[0].text_content()
        description = element.node.getchildren()[2].getchildren()[1].text_content()

        list.append({'src': src, 'name': name, 'age': age, 'description': description})

    return list
Example #43
    def test_empty_useragent_pycurl(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # An empty string disables the default pycurl user-agent
        g.setup(user_agent='')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers'].get('user-agent', ''), '')
Example #44
File: pars.py  Project: esha-/esca
def get_data(url):
    '''
    Get data (prices and offer hrefs) from Yandex Realty using the client's parameters.
    '''
    #print(url)

    price_list = []
    href_list = []

    g = Grab()
    g.go(url)

    # search html class with price
    data_list = g.xpath_list('//*[@class="serp-item__price"]')
    total = 0
    for p in data_list:
        price = price_format(p.text_content())
        total += price
        price_list.append(price)
    
    # search html class with href
    data_list = g.xpath_list('//*[@class="link link_redir_yes stat__click i-bem"]')
    for h in data_list:
        href_list.append(h.get('href'))

    if len(price_list) != 0:
        aver_price = total / len(price_list)
        return aver_price, href_list
    else:
        return 0, []
Example #45
    def clean_url(self):
        url = self.cleaned_data['url']
        url_regex = 'https?:\/\/diesel.elcat.kg\/index.php\?showtopic=([\d]+).*'
        compiled_url_regex = re.compile(url_regex)
        urls = compiled_url_regex.findall(url)
        if urls:
            real_url = 'http://diesel.elcat.kg/index.php?showtopic=' + urls[0]
            g = Grab()
            g.setup(connect_timeout=30, timeout=60)

            try:
                g.go(url=real_url)
            except:
                raise forms.ValidationError(u'Что то пошло не так.')

            html = g.response.body
            mes_regex = '<div class="([\w]+)" id=\\\'([\w]+)-([\d]+)\\\'>'
            compiled_mes_regex = re.compile(mes_regex)
            topics = compiled_mes_regex.findall(html)
            if not topics:
                raise forms.ValidationError(u'Топик не существует или удален.')

            return real_url
        else:
            raise forms.ValidationError(u'Не коректный url!')
Example #46
 def test_nobody(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(nobody=True)
     SERVER.RESPONSE['get'] = 'foo'
     g.go(SERVER.BASE_URL)
     self.assertEqual('', g.response.body)
     self.assertTrue(len(g.response.head) > 0)
Example #47
 def run(self):
     global rining
     global success
     global missedCount
     if rining:
         g = Grab()
         g.setup(hammer_mode=True,
                 hammer_timeouts=((10, 15), (20, 30), (60, 80)))
         #g.load_proxylist('proxy.lst', 'text_file', proxy_type='http', auto_init=False, auto_change=True)
         try:
             g.go(uri)
         except Exception:
             print "\n[!] No valid proxy or network error ...\n"
             rining = 0
             sys.exit(1)
         for i in range(len(words)):
             if rining and not success:
                 sleep(1)
                 nextword = getword()
                 value = last + nextword
                 try:
                     self.bot(value, nextword, g)
                 except Exception:
                     print "\n[!] Network error ...\n"
                     rining = 0
                     sys.exit(1)
         rining = 0
Example #48
 def test_body_maxsize(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(body_maxsize=100)
     SERVER.RESPONSE['get'] = 'x' * 1024 * 1024
     g.go(SERVER.BASE_URL)
     # Should be less than 50 KB
     self.assertTrue(len(g.response.body) < 50000)
Example #49
def loadPage(url, adult, child, country, i):
    print 'run Grab'
    g = Grab()
    g.setup(log_dir='tcc_tayland_2_1')
    g.setup(timeout=250, connect_timeout=200)
    g.setup(proxy='220.101.93.3:3128', proxy_type='http')
    qs = urlencode({'samo_action':'PRICES',
    'TOWNFROMINC':'101',
    'STATEINC':country_op,
    'TOURTYPE':'0',
    'TOURINC':'0',
    'CHECKIN_BEG':'20160731',
    'NIGHTS_FROM':'2',
    'CHECKIN_END':'20160831',
    'NIGHTS_TILL':'10',
    'ADULT':adult,
    'CURRENCY':'2',
    'CHILD':child,
    'TOWNTO_ANY':'1',
    'TOWNTO':'',
    'STARS_ANY':'1',
    'STARS':'',
    'hotelsearch':'0',
    'HOTELS_ANY':'1',
    'HOTELS':'',
    'MEAL':'',
    'FREIGHT':'0',
    'FILTER':'0',
    'HOTELTYPES':'',
    'PACKET':'1',
    'PRICEPAGE':i})
    print (url + qs)
    g.go(url + qs)
    body = g.response.body
    return body
Example #50
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
Example #51
    def test_empty_useragent_pycurl(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # An empty string disables the default pycurl user-agent
        g.setup(user_agent='')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers'].get('user-agent', ''), '')
Example #52
    def test_cookiefile(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Empty file should not raise Exception
        open(TMP_FILE, 'w').write('')
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)

        cookies = [{'name': 'spam', 'value': 'ham'}]
        json.dump(cookies, open(TMP_FILE, 'w'))

        # One cookie is sent in the server response
        # Another cookie is passed via the `cookiefile` option
        SERVER.RESPONSE['cookies'] = {'godzilla': 'monkey'}
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['cookies']['spam'].value, 'ham')

        # This is the correct result of combining two cookies
        MERGED_COOKIES = [('godzilla', 'monkey'), ('spam', 'ham')]

        # g.cookies should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES),
                         set(g.cookies.items()))

        # The `cookiefile` file should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES),
                         set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))
Example #53
def test_khest():
    ua = UserAgent()
    grab = Grab(timeout=30,
                log_file='%s/vparser/tmp/pars/log.html' %
                os.path.split(PROJECT_PATH)[0])
    grab.setup(proxy='46.148.30.250:8080',
               proxy_type='http',
               proxy_userpwd=CREDENTIALS)  # , log_dir='vparser/tmp'
    grab.go(
        'http://kharkov.kha.slando.ua/obyavlenie/sdam-gostinku-tsentr-vse-udobstva-ID75tep.html#13fed9ae6e;promoted'
    )
    # grab.go('http://kharkov.kha.slando.ua/nedvizhimost/arenda-kvartir/')

    # ff = grab.doc.select('//div[@class="pricelabel tcenter"]')
    # print ff.text()
    # for f in get_adv_on_page(grab):
    #     print f
    # g.setup(cookies={u'domain': u'secure.e-konsulat.gov.pl', u'name':
    #                             u'MSZ', u'value': u'64e8734b-986c-4cd4-be44-b2c112ec49c8', u'expiry':
    #                                 '1362046140', u'path': u'/', u'secure': 'False'})
    # print get_adv_photo(grab)

    # for dd in grab.doc.select('//div[@class="pding5_10"]'):
    #     if dd.text().split(':')[0] == u'Количество комнат':
    #         print dd.text().split(':')[1]

    phones = get_phone(grab)

    # print len(phones)

    slando = Slandos()
    # for phone in phones:
    #     print phone
    moder_phone(phones, slando)
Example #54
    def test_load_dump(self):
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = {'foo': 'bar', 'spam': 'ham'}
        g.setup(cookies=cookies)
        g.go(SERVER.BASE_URL)
        g.dump_cookies(TMP_FILE)
        self.assertEqual(set(cookies.items()),
                         set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))

        # Test non-ascii
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = {'foo': 'bar', 'spam': u'бегемот'}
        g.setup(cookies=cookies)
        g.go(SERVER.BASE_URL)
        g.dump_cookies(TMP_FILE)
        self.assertEqual(set(cookies.items()),
                         set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))

        # Test load cookies
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = [{'name': 'foo', 'value': 'bar'},
                   {'name': 'spam', 'value': u'бегемот'}]
        json.dump(cookies, open(TMP_FILE, 'w'))
        g.load_cookies(TMP_FILE)
        self.assertEqual(set(g.cookies.items()),
                         set((x['name'], x['value']) for x in cookies))
Example #55
def translate(word, key, lan1='en', lan2='ru', alt=True, syn=True):
    """Prints the number of counts, word, translation, and example
    from lan1 to lan2 according to Translate.Google."""
    # First, write down a translation in some auxiliary txt file
    # and load it in json format
    g = Grab(log_file = 'dict.txt')
    link = 'http://translate.google.ru/translate_a/t?client=x&text='\
           + word + '&sl=' + lan1 + '&tl=' + lan2
    g.go(link)
    data = json.load(open('dict.txt'))
    # Then, let's try to get all the necessary elements in json
    translation, noun, alternatives, synonims = 0, 0, 0, 0
    try:
        translation = data[u'sentences'][0][u'trans']
        noun = data[u'dict'][0][u'pos']
        alternatives = data['dict'][0]['terms']
        synonims = data['dict'][0]['entry'][0]['reverse_translation']
    except:
        pass
    # German nouns should begin with a capital letter
    if lan1=='de' and noun==u'имя существительное':
        word = word.title()
    # Finally, print out the count, word, and translation with alternatives
    # and synonyms, if applicable. Encoding is added to allow
    # printing in cmd on a Russian version of Windows
    if translation:
        print ('['+str(key)+']', word, ': ', translation)
        if alt and alternatives:
            [print (i, end=', ') for i in alternatives]
            print ('\r')
        if syn and synonims:
            [print (i.encode('cp866', errors='replace'), end=', ')
                                     for i in synonims]
            print ('\n')
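A minimal call sketch for translate (hedged: it assumes network access to translate.google.ru and write access to the auxiliary dict.txt file; the words and keys below are placeholders):

translate('hippopotamus', 1)                  # en -> ru with the defaults
translate('katze', 2, lan1='de', lan2='ru')   # German noun, capitalized automatically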
Example #56
    def test_cookiefile(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Empty file should not raise Exception
        open(TMP_FILE, 'w').write('')
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)

        cookies = {'spam': 'ham'}
        json.dump(cookies, open(TMP_FILE, 'w'))

        # One cookie is sent in the server response
        # Another cookie is passed via the `cookiefile` option
        SERVER.RESPONSE['cookies'] = {'godzilla': 'monkey'}
        g.setup(cookiefile=TMP_FILE)
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'spam=ham')

        # This is the correct result of combining two cookies
        MERGED_COOKIES = {'godzilla': 'monkey', 'spam': 'ham'}

        # g.config should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES.items()),
                         set(g.config['cookies'].items()))

        # The `cookiefile` file should contain merged cookies
        self.assertEqual(set(MERGED_COOKIES.items()),
                         set(json.load(open(TMP_FILE)).items()))
Example #57
def getModelLink(modelName):
    g = Grab(connect_timeout=5,
             userpwd='user:pass',
             debug_post='True',
             log_dir='log',
             headers={'Accept-Language': 'ru,en;q=0.8'})
    url = 'http://market.yandex.ru/'
    g.go(url)
    try:
        paginatorHTML = g.doc.select(popt['pagination']).html()
        pagesLinks = GetAllLinksFromString(paginatorHTML, url)
    except:
        pagesLinks = []
    pagesLinks.append(url)
    pagesLinks = list(set(pagesLinks))
    pagesCount = pagesLinks.__len__()
    newPagesCount = 1
    while pagesCount != newPagesCount:
        lastPage = pagesLinks.__len__() - 1
        url = pagesLinks[lastPage]
        g.go(url)
        try:
            paginatorHTML = g.doc.select(popt['pagination']).html()
            newlinks = GetAllLinksFromString(paginatorHTML, url)
        except:
            newlinks = []
        for newlink in newlinks:
            pagesLinks.append(newlink)
        pagesLinks = list(set(pagesLinks))
        newPagesCount = pagesLinks.__len__()
    return pagesLinks
Example #58
 def test_nobody(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(nobody=True)
     SERVER.RESPONSE['get'] = 'foo'
     g.go(SERVER.BASE_URL)
     self.assertEqual('', g.response.body)
     self.assertTrue(len(g.response.head) > 0)
Example #59
def getproduct(href, item, scan, oneposition):
    pg = Grab(log_dir="/tmp", timeout=30)

    # proxy = Proxy.objects.filter(active=True).order_by('?')[0]

    # # proxyaddrlist = proxy.name.split(':')[0:1]
    # # proxyuserlist = proxy.name.split(':')[2:3]

    # proxyaddr = proxy.name.split(':')[0] + ':' + proxy.name.split(':')[1]
    # proxyuser = proxy.name.split(':')[2] + ':' + proxy.name.split(':')[3]

    # print proxyaddr

    # # assert False, proxyuser

    # pg.setup(proxy=proxyaddr, proxy_userpwd=proxyuser, proxy_type="http")

    pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                      "/proxy1.txt",
                      source_type='text_file',
                      proxy_type='http',
                      auto_change=True)

    # print pg.config['proxy']

    # pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt", source_type='text_file', proxy_type='http', auto_change=True)
    try:
        purl = "http://hotline.ua" + href
        pg.go(purl)
        # pass
    except Exception, e:
        print "Error: " + purl

        return
Example #60
    def f(self):
        g = Grab(log_file='out.log')
        g.go(BASE_URL)

        c = CCmbParser()
        c.soup = g.doc.body
        g.doc.set_input('search_type', 'ADDRESS')
        c.soup = g.doc.body
        a = c.get_subjects()
        self.fillComboBox(self.cmbSubject, a)
        g.doc.set_input('subject_id', '130000000000')
        g.doc.set_input('region_id', '145286000000')
        g.doc.set_input('settlement_id', '145298578000')
        #g.doc.set_input('subject_id', '130000000000')
        #g.doc.set_input('124000000000', 'checked="true"')
        c.soup = g.doc.body
        b = c.get_regions()
        c1 = c.get_street_type()
        d = c.get_city_types()
        e = c.get_cities()
        # Look for the word "Новости" (News) on the page
        # print(u"На этой странице есть слово \"Новости\"? %s" % u'Да' if g.doc.text_search(u'Новости') else u'Нет')
        # print the page title
        print(u"Заголовок страницы: '%s'" % g.doc.select('//title').text())

        g.doc.set_input('search_type', 'ADDRESS')

        f = g.doc.submit()
        print 'zi'
        pass