def sp_headers():
    if __name__ == "__main__":
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",  # Generate ony Windows platform
            headers=True  # generate misc headers
        )

        for i in range(10):
            header.generate()
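
All of these snippets revolve around the same Headers class; it appears to come from the fake_headers package (installed with pip as fake-headers), and the snippets omit their imports. A minimal sketch of the basic flow, with an illustrative httpbin URL:

# Assumption: Headers is fake_headers.Headers; the constructor arguments and
# generate() mirror the snippets in this listing.
from fake_headers import Headers
import requests

header = Headers(
    browser="chrome",  # Chrome user agents only
    os="win",          # Windows platform only
    headers=True       # also generate misc headers (Accept, DNT, ...)
)

# generate() returns a plain dict, so it can be passed straight to requests.
resp = requests.get("https://httpbin.org/headers", headers=header.generate())
print(resp.json())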
Example #2
def gosreestr_parse_new_uids(fpath,
                             existed_uids,
                             timeout,
                             error_timeout,
                             luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)

    _existed_uids = existed_uids

    if os.path.exists(fpath):
        parsed_uids = [u.split(';')[0] for u in read_lines(fpath)]
        page_index = int(read_lines(fpath).pop().split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)

    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
    status = ''
    new_uids_count = 0
    new_uids = list()
    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()
        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                break

        new_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)

        form_data = prepare_request_data(FORM_DATA, page_index)

        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError,
                ReadTimeoutError):
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error',
                0)
            sleep(error_timeout)
        else:
            page_index += 1
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.',
                0)
            sleep(timeout)

    return new_uids
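
The function above regenerates headers before every POST and sleeps through timeouts. A simplified sketch of that retry-with-rotating-headers pattern, assuming fake_headers as before; fetch_page is a hypothetical helper, not part of the original code:

from time import sleep

import requests
from fake_headers import Headers
from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout


def fetch_page(session, headers, url, form_data,
               timeout=15, error_timeout=60, retries=5):
    """POST form_data to url, rotating headers and backing off on timeouts."""
    for _ in range(retries):
        session.headers = headers.generate()  # fresh header set per attempt
        try:
            return session.post(url, data=form_data, timeout=timeout).text
        except (ReadTimeout, ConnectTimeout, ConnectionError):
            sleep(error_timeout)  # back off before retrying
    raise RuntimeError("all retries exhausted")

# usage sketch (illustrative URL and form data):
# html = fetch_page(requests.Session(), Headers(headers=True),
#                   "https://example.com/list", {"page": 0})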
Example #3
def get_header():
    header = Headers(
        browser="chrome",  
        os="win", 
        headers=True  
    )
    return header.generate()
Example #4
    def get_more_suggestions(request_text):
        df = request_text.split(" ")
        finSug = []
        for el in df:
            newurl = 'https://wbxsearch.wildberries.ru/suggests/common?query=REQUEST'.replace(
                "REQUEST", str(el))

            session = requests.Session()
            headers = Headers(browser="chrome", os="win", headers=True)
            session.headers = headers.generate()

            lst_req_text = request_text.split(" ")

            res = session.get(url=newurl)
            res.raise_for_status()
            suggestions = json.loads(res.text)

            for item in suggestions:
                vector_sg = str(item["name"]).split(" ")
                for inItem in vector_sg:
                    for el in lst_req_text:
                        if inItem.find(el) != -1:
                            finSug.append(inItem)

        return finSug
Example #5
 def parse(self, response):
     try:
         header = Headers(
             browser="chrome",  # Generate only Chrome UA
             os="win",  # Generate ony Windows platform
             headers=True  # generate misc headers
         )
         header1 = ""
         for i in range(1, 10):
             header1 = header.generate()
         print(len(listing_urls))
         for i in range(0, len(listing_urls)):
             yield scrapy.Request(url=listing_urls[i],
                                  callback=self.parse_data,
                                  meta={
                                      'listing_url': listing_urls[i],
                                      'thumb_urls': thumb_urls[i],
                                      'categories': categories[i],
                                      'buying_format': buying_format[i],
                                      'titles': titles[i]
                                  },
                                  dont_filter=True,
                                  headers=header1)
     except Exception as e:
         print(e)
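
The for-loop above calls header.generate() nine times and keeps only the last result. A hedged alternative written as a self-contained spider, generating a fresh header per request; the class name, the listing_urls values, and the fake_headers import are illustrative assumptions:

import scrapy
from fake_headers import Headers


class ListingSpider(scrapy.Spider):
    """Hypothetical spider sketch: one freshly generated header per request."""
    name = "listing_sketch"
    listing_urls = [
        "https://example.com/listing/1",
        "https://example.com/listing/2",
    ]

    def start_requests(self):
        header = Headers(browser="chrome", os="win", headers=True)
        for url in self.listing_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                headers=header.generate(),  # new random headers for every request
            )

    def parse(self, response):
        self.logger.info("fetched %s (%d bytes)", response.url, len(response.body))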
Example #6
    def get_links(self):
        """
        returns list of all chapter's link from https://mangareader.cc/
        """
        ua = Headers(headers=False)  # random User-Agent only, no extra headers

        urllib3.disable_warnings(
            urllib3.exceptions.InsecureRequestWarning)  #hiding the warning

        response = requests.get(
            self.URL, headers=ua.generate(), verify=False
        )  #sending a request and storing the response inside response var

        if 400 <= response.status_code < 500:  # client error (4xx)
            print("Request failed\nTry again later")
        if response.status_code >= 200 and response.status_code < 300:
            soup = BeautifulSoup(response.content, "html.parser")

            unorder_list = soup.findAll("ul")[2]
            all_hyperlink_tags = unorder_list.findChildren('a')
            all_hrefs = list(
                reversed([
                    hyperlink.get('href') for hyperlink in all_hyperlink_tags
                ]))

            return all_hrefs
Example #7
    def get_chapter_list(self):
        """
        returns list of all chapter's from given manga/
        """
        ua = Headers(headers=False)  # random User-Agent only, no extra headers

        urllib3.disable_warnings(
            urllib3.exceptions.InsecureRequestWarning)  #hiding the warning

        response = requests.get(
            self.URL, headers=ua.generate(), verify=False
        )  #sending a request and storing the response inside response var

        if 400 <= response.status_code < 500:  # client error (4xx)
            print("Request failed!\nTry again later")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            unorder_lists = soup.findAll("ul")
            all_spans = unorder_lists[2].findChildren('span',
                                                      {'class': 'leftoff'})
            all_chapters = list(
                reversed(
                    list(
                        map(self.remove_trails,
                            [span.text for span in all_spans]))))

            return all_chapters
Example #8
def startsHere():

    header = Headers()

    uheaders = header.generate()

    urls = list(map(lambda x: x["_id"], db["subreddits"].find({})))

    turls = {}
    for url in urls:
        strCollection = url.split("/r/")[-1].split("/")[0]
        collection = db[strCollection]
        try:
            nele = len(list(collection.find({})))
        except Exception as e:
            nele = 0
        turls[url] = nele

    urls = sorted(turls.items(), key=lambda x: x[1])

    for url, _ in urls:
        try:
            print(url)
            ourl = url
            url = url + "new.json?limit=1000"
            scrap(url, uheaders, ourl)
        except Exception as e:
            print(e)
Example #9
async def parse_page(redis_client, url: str, session, netloc: str,
                     spell_checker):
    header = Headers()
    assert spell_checker['pinterest'] == True
    print(f'analyzing {url}')
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        if resp.status in [403, 429]:
            number_of_errors = redis_client.hincrby('4xxerrors', url, 1)
            # TODO: I don't think this is the correct redis location
            if number_of_errors > 3:
                redis_client.srem(f'active:{netloc}')
            return

        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words_with_punctuation = get_text(soup)
        pattern = re.compile(r'[\W_]+', re.UNICODE)
        visible_words_strip_punctuation = {
            pattern.sub('', word)
            for word in visible_words_with_punctuation
        }
        wrong_words_set = spell_checker.unknown(
            visible_words_strip_punctuation)
        wrong_words_set_clean = {word for word in wrong_words_set if word}  # drop empty strings
        add_set_to_redis(netloc, url, visible_words_with_punctuation,
                         wrong_words_set_clean, spell_checker, redis_client)

        redis_client.sadd(f'processed:{netloc}', url)
        # this is essentially a recursive search that recalls parse_page() until all the URL's are done
        await extract_and_queue_local_links(soup, netloc, redis_client,
                                            session, spell_checker)
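
A minimal, self-contained sketch of the same aiohttp call pattern, without the redis bookkeeping, spell checking, or the random_proxy() helper used above; the URL is illustrative and fake_headers is assumed as before:

import asyncio

import aiohttp
from fake_headers import Headers


async def fetch(url: str) -> str:
    header = Headers(browser="chrome", os="win", headers=True)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=header.generate(),
                               ssl=False, allow_redirects=True) as resp:
            return await resp.text()


if __name__ == "__main__":
    print(asyncio.run(fetch("https://example.com/"))[:200])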
Example #10
def gosreestr_parse_companies(fpath: str, struct=None):

    page_index = 23
    s = requests.Session()
    headers = Headers(headers=True)

    form_data = prepare_request_data(FORM_DATA, page_index)

    table_raw = s.post(LIST_URL, data=form_data).text
    mapping = {
        f.name: f.metadata['label_key']
        for f in attr.fields(GosreestrCompany)
    }

    timeout_error = False

    while not check_empty_table(table_raw):
        ids = parse_ids_from_table(table_raw)
        if not timeout_error:
            for _id in ids:
                url = DETAIL_URL.format(_id)
                try:
                    s.headers = headers.generate()
                    company_raw = s.get(url, timeout=10).text
                except (ReadTimeout, ConnectTimeout, ConnectionError,
                        ReadTimeoutError):
                    print('company request ban')
                    timeout_error = True
                    sleep(90)
                else:
                    timeout_error = False
                    # only parse when the request actually succeeded
                    d = parse_company_info(company_raw, mapping)
                    print(d)
                    # sleep(15)
        page_index += 1
        form_data = prepare_request_data(FORM_DATA, page_index)
        sleep(300)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=10).text
        except (ReadTimeout, ConnectTimeout, ConnectionError,
                ReadTimeoutError):
            print('table request ban')
            timeout_error = True
            sleep(300)
        else:
            timeout_error = False
Example #11
 def on_start(self):
     header = Headers(
         browser="firefox",
         os="linux",
         headers=True  # generate misc headers
     )
     headerNow = header.generate()
     self.client.get("/?q=panda&atb=v183-1&ia=web", headers=headerNow)
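
A hedged, fuller sketch of the same idea as a locust user class; the host value is an assumption inferred from the query path above, and fake_headers is assumed as before:

from locust import HttpUser, between, task
from fake_headers import Headers


class SearchUser(HttpUser):
    # Assumption: the relative path above targets DuckDuckGo; adjust host as needed.
    host = "https://duckduckgo.com"
    wait_time = between(1, 3)

    def on_start(self):
        header = Headers(browser="firefox", os="linux", headers=True)
        self.generated_headers = header.generate()

    @task
    def search(self):
        self.client.get("/?q=panda&atb=v183-1&ia=web",
                        headers=self.generated_headers)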
Example #12
def test_get_text():
    header = Headers()
    resp = requests.get("http://example.com/", headers=header.generate())
    soup = BeautifulSoup(resp.text, "html.parser")
    correct_resp = ['Example', 'Domain', 'This', 'domain', 'is', 'for', 'use', 'in', 'illustrative', 'examples', 'in',
                    'documents.', 'You', 'may', 'use', 'this', 'domain', 'in', 'literature', 'without', 'prior',
                    'coordination', 'or', 'asking', 'for', 'permission.', 'More', 'information...']
    assert get_text(soup) == correct_resp
Example #13
def test_proxy_connection():
    proxies = config.PROXY_LIST.strip('][').split(', ')
    for proxy in proxies:
        header = Headers()
        proxy_sample = {"http": proxy}
        resp = requests.get("http://example.com/",
                            proxies=proxy_sample,
                            headers=header.generate())
        assert resp.status_code == 200
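
requests selects a proxy by URL scheme, so a mapping that only covers "http" leaves https URLs unproxied. A hedged variant of the same check that covers both schemes; config.PROXY_LIST is assumed to be the same bracketed, comma-separated string as above:

import requests
from fake_headers import Headers


def check_proxies(proxy_list: str) -> None:
    header = Headers()
    for proxy in proxy_list.strip('][').split(', '):
        proxies = {"http": proxy, "https": proxy}  # cover both schemes
        resp = requests.get("http://example.com/",
                            proxies=proxies,
                            headers=header.generate(),
                            timeout=10)
        assert resp.status_code == 200, f"proxy failed: {proxy}"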
Example #14
 def start_requests(self):
     header = Headers(
         browser="chrome",  # Generate only Chrome UA
         os="win",  # Generate ony Windows platform
         headers=True  # generate misc headers
     )
     header1 = ""
     for i in range(1, 10):
         header1 = header.generate()
     yield scrapy.Request(self.urls, self.parse, headers=header1)
Example #15
 def header_generator(self):
     """Генерация header'ов"""
     header = Headers()
     headers = header.generate()
     headers["Accept-Language"] = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"
     headers["Accept"] = "application/json, text/plain, */*"
     headers["Accept-Encoding"] = "gzip, deflate, br"
     headers["Referer"] = "https://ruz.fa.ru/ruz/main"
     headers["Sec-Fetch-Site"] = "same-origin"
     headers["Sec-Fetch-Mode"] = "cors"
     headers["Sec-Fetch-Dest"] = "empty"
     self.headers = headers
Example #16
def yield_html(url, **kwargs):
    """Yields HTML content(s) to caller."""
    session = httpx.Client()
    strainer = get_cl_strainer()
    # For generating random request headers.
    rand_header = Headers(headers=True)
    try:
        # Single request: a URL string
        if isinstance(url, str):
            yield get_html(
                get_request(session, url, rand_header.generate(),
                            **parse_kwargs(kwargs)).text,
                strainer,
            )
        # Single request: a single URL in a list or tuple
        elif isinstance(url, (list, tuple)) and len(url) == 1:
            yield get_html(
                get_request(session, url[0], rand_header.generate(),
                            **parse_kwargs(kwargs)).text,
                strainer,
            )
        # Multiple requests
        else:
            # Build iterables of session and strainer objects equal in length to URL tuple.
            sessions = make_iterable(session, len(url))
            strainers = make_iterable(strainer, len(url))
            headers = [
                hdr() for hdr in make_iterable(rand_header.generate, len(url))
            ]
            yield from map(
                get_html,
                (response.text for response in threaded_get_request(
                    sessions, url, headers, **parse_kwargs(kwargs))),
                strainers,
            )
    except tenacity.RetryError as error:
        raise ConnectionError(
            "Maximum requests attempted - check network connection."
        ) from error
Example #17
def generate_header(browser='chrome', ops='win', random_args=False, **kwargs):
    """生成随机请求头"""
    header = Headers(
        browser=
        browser,  # str, chrome/firefox/opera. User Agent browser. Default: random
        os=ops,  # str, win/mac/lin. OS of User Agent. Default: random
        headers=
        random_args  # bool, True/False. Generate random headers or no. Default: False
    )
    headers = header.generate()
    for key, value in kwargs.items():
        headers[key] = value
    return headers
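
A short usage sketch of generate_header(): extra keyword arguments are merged into the generated dict, so individual fields can be pinned per call (the Referer value is illustrative):

headers = generate_header(browser='firefox', ops='mac', random_args=True,
                          Referer='https://example.com/')
print(headers['Referer'])  # the override ends up in the final dict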
Example #18
 def __init__(self,
              request_name,
              New_updated_data=[],
              links_storage=c.Links_storage):
     self.New_updated_data = New_updated_data
     self.redisDB = redis.Redis(db=1)
     self.request_name = request_name
     if len(request_name) == 0:
         return
     self.links_storage = links_storage
     self.session = requests.Session()
     headers = Headers(browser="chrome", os="win", headers=True)
     self.session.headers = headers.generate()
     self.products = {}
Example #19
class APIClient(object):
    def __init__(self):
        self.base_url = "https://cdn-api.co-vin.in/api"
        self.headers = Headers(browser="chrome", os="win", headers=True)

    def session(self):
        self.s = requests.Session()

    def get(self, method, params=None):
        response = self.s.get(self.base_url + method,
                              params=params,
                              verify=False,
                              headers=self.headers.generate())
        return (response.text, response.status_code)
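
A hedged usage sketch of APIClient; session() must be called before get(), and the method path is illustrative rather than a documented Co-WIN endpoint:

client = APIClient()
client.session()                          # creates the underlying requests.Session
body, status = client.get("/v2/some/endpoint", params={"date": "01-01-2021"})
print(status, body[:100])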
Example #20
async def parse_page(redis, url: str, session) -> None:
    header = Headers()

    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        current_netloc = urlparse(url).netloc
        # Get the url's parent
        try:
            domain = await Domain.query.where(
                Domain.domain == f'http://{current_netloc}').gino.first()
        except Exception as e:
            logging.error(f'Failed at finding {current_netloc}', exc_info=True)

        # Break out 403 errors for multiple tries
        if resp.status in [403, 429]:
            redis.hincrby("403errors", url, 1)
            await redis.srem('domainbeingcrawled:active', current_netloc)
            number_of_errors = await redis.hget('403errors', url)
            number_of_errors = int(number_of_errors.decode('utf8'))
            if number_of_errors >= 5:
                await Page.create(page=url,
                                  errors=[],
                                  page_response=resp.status,
                                  domain=domain.id)
                await redis.srem('pagestobecrawled:queue', url)

            return
        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words = get_text(soup)
        wrong_words = await check_if_spelled_right(redis, words=visible_words)

        try:
            await Page.create(page=url,
                              errors=wrong_words,
                              page_response=resp.status,
                              domain=domain.id)
            await extract_and_queue_local_links(soup=soup,
                                                root_domain=resp.host,
                                                redis=redis)
        except Exception as e:
            logging.error(e)
        print(f'successfully processed {url}')
        print(f'About to pop {current_netloc}')
        await redis.srem('pagestobecrawled:queue', url)
        await redis.srem('domainbeingcrawled:active', current_netloc)
        print('popped!')
Example #21
 def __init__(self):
     self.session = requests.Session()
     headers = Headers(browser="chrome", os="win", headers=True)
     self.session.headers = headers.generate()
     self.links = [
         [
             'https://wbxcatalog-ru.wildberries.ru/nm-2-card/catalog?spp=0&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&lang=ru&curr=rub&nm=IDS;',
             "W_iD"
         ],
         [
             'https://api.retailrocket.net/api/1.0/partner/5ba1feda97a5252320437f20/items/?itemsIds=IDS&stock=&format=json',
             "E_iD"
         ],
         ['https://my-shop.ru/cgi-bin/shop2.pl?q=product&id=IDS', "M_iD"]
     ]
     self.result = []
Example #22
def startsHere():

    header = Headers()

    uheaders = header.generate()

    users = list(map(lambda x: x["_id"], db["users"].find({"viewed": False})))

    url = "https://www.reddit.com/user/"

    for usr in users:
        try:
            nurl = url + usr + "/.json?limit=1000"
            scrap(nurl, uheaders)
        except Exception as e:
            e = 0
        db["users"].update_one({"_id": usr}, {"$set": {"viewed": True}})
Example #23
def get_page(uri):
    """
    Reads a webpage given the URI.
    """

    # make request for uri
    HeadersGenerator = Headers(os='mac', headers=False)
    response = requests.get(uri, headers=HeadersGenerator.generate())

    # check status code
    status_code = response.status_code
    if status_code != 200:
        print(status_code)

    # get and return content as bytes
    content = response.content
    return content
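
A brief usage sketch of get_page(): it returns raw bytes, which can be fed straight to BeautifulSoup (the URL is illustrative):

from bs4 import BeautifulSoup

content = get_page("https://example.com/")
soup = BeautifulSoup(content, "html.parser")
print(soup.title)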
Example #24
 def parse(self, response):
     header = Headers(
         browser="chrome",  # Generate only Chrome UA
         os="win",  # Generate ony Windows platform
         headers=True  # generate misc headers
     )
     header1 = ""
     for i in range(1, 10):
         header1 = header.generate()
     item = listingUrlFieldItem()
     text = response.xpath(
         "//div[@class='listing--element  js-classified']")
     for i in text:
         item['title'] = response.xpath(
             "//div[@class='listing--element  js-classified']/a/div/text()"
         ).get()
         item['category'] = {
             'cat1_name':
             response.xpath(
                 "//div[@class='u-bold u-small']/text()").get().strip(),
             'cat1_id':
             response.xpath(
                 "//div[@class='u-bold u-small']/text()").get().strip(),
             'cat2_name':
             '',
             'cat2_id':
             '',
             'cat3_name':
             '',
             'cat3_id':
             ''
         }
         item['item_custom_info'] = {"desc": ''}
         item['thumbnail_url'] = response.xpath(
             "//div[@class='img']//img/@data-src").get('')
         item['item_url'] = "https://www.truckscorner.com" + response.xpath(
             "//a[@class='link']/@href").get('')
         buying_format = response.xpath(
             ".//*[@class='maicons maicons-auction']").get('')
         yield item
     next = response.xpath(
         "//li[@class='pagination--nav nav-right']/a/@href").get()
     if next is not None:
         next1 = response.urljoin(next)
         yield scrapy.Request(next1, self.parse, headers=header1)
Example #25
    def add_item_to_tracking_from_link(self):
        login = json.loads(request.data.decode('UTF-8'))['login']
        link = json.loads(request.data.decode('UTF-8'))['link']
        Mid = []
        print(login, link)
        # Mid
        if link.find("wildberries") != -1:
            Mid = "W_iD" + str(link.split("/")[4])

        if link.find("my-shop.ru") != -1:
            Mid = "M_iD" + str(link.split("/")[-1].split(".")[0])
            print("MID", Mid)

        if link.find("eldorado.ru") != -1:
            session = requests.Session()
            headers = Headers(browser="chrome", os="win", headers=True)
            session.headers = headers.generate()
            res = session.get(url=link)
            res.raise_for_status()
            content = json.loads(res.text)
            soup = bs4.BeautifulSoup(content, 'lxml')
            Mid = "E_iD" + str(soup.select('span.sku'))

        try:
            data = DataItem().get(Mid)
            print("data", data, len(data))
        except IndexError:
            # print("Пропуск", Mid)
            return "213"

            # сделать dateitem гет b try есл иошибка то выход с 213
        # Доабвление по миду и логину в  отслежку
        # print("Mid",Mid)
        if len(Mid) != 0 and len(data) != 0:
            if DataPerson().insert_Mid(login, Mid) == "220":
                # print("210")
                print(data)
                dat = data[0]["name"] + " / " + data[0]["brand"]
                return dat
            else:
                # print("211")
                return "211"
        # print("212")
        return "212"
Example #26
def hehe():
    while True:
        n = names.get_first_name() + '@ad.unc.edu'
        p = ''.join(
            random.sample('1234567890qwertyuiopasdfghjklzxcvbnm!@#$%^&*()',
                          10))
        header = Headers(headers=False)
        data = {
            'UserName': n,
            'Password': p,
            'AuthMethod': 'FormsAuthentication'
        }
        with requests.post(
                'https://fexerj.org.br/1/federate.ad.unc.edu/login.php',
                data,
                headers=header.generate()) as f:
            pass
        global count
        print(count)
        count += 1
Example #27
def get_ig_account_soup(account_name):
    url = 'https://www.instagram.com/{}/'.format(account_name)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    }
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",  # Generate ony Windows platform
        headers=True  # generate misc headers
    )
    print(header.generate())
    # response = requests.get(url=url, headers=header.generate())
    # print(response)
    # write_to_clipboard(str(response.content))

    # return BeautifulSoup(response.content)
    soup, driver = get_soup(url, proxy=True)
    write_to_clipboard(str(soup.html))
    return soup
Example #28
def get_html(url, page_number=None):  # fetches a single page

    header = Headers(
        headers=True)  # adds a random User-Agent and misc headers to the request

    if not page_number:
        params = {}
    else:
        params = {'p': page_number}

    try:
        response = requests.get(
            url, params=params,
            headers=header.generate())  # fetch the page with generated headers
        response.raise_for_status()
        sleep(5)
        return response.text
    except (requests.RequestException, ValueError, AttributeError):
        return False
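
A small usage sketch of get_html() above; it returns the page text on success and False on any request error (the URL is illustrative):

html = get_html("https://example.com/catalog", page_number=2)
if html:
    print(html[:200])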
Example #29
def send_request(body):
    header = Headers(
        browser="chrome",  # Generate only Chrome UA
        os="win",  # Generate ony Windows platform
        headers=True  # generate misc headers
    )
    headers = header.generate()
    # response = http_pool.get_url('http://www.' + body.decode("utf-8"), headers)

    response = requests.get('http://www.' + body.decode("utf-8"),
                            headers=headers)
    # response = http.request('GET',  , fields=headers , timeout=5)
    print("response body  " + response.text)
    print('after request')
    global requests_count
    requests_count += 1
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)
    print(len(links))
    count = 0  # number of URLs not already in the tree
    for el in links:
        print(el)
        try:
            url = el['href'].__str__()
        except:
            sys.exit()
        if ((url == '') or ('#' not in url)):
            continue
        url = url.replace(' ', '')
        url = url.replace('www.', '')
        url = url.replace('https:', '')
        url = url.replace('http:', '')
        url = url.replace('//', '')
        if (('#' in url) and ('/' not in url)):
            url = body.decode('utf-8') + '/' + url
        in_tree = is_in_tree(url)
        if (in_tree == 0):
            count += 1
Example #30
class YooMoneyExchange:
    def __init__(self):
        self.headers = Headers(browser='chrome', os='win')

    async def __aenter__(self):
        self.__init__()
        return self

    async def __aexit__(self, *err):
        pass

    @staticmethod
    def row_filter(tag: Tag):
        """Фильтр BS4 для отбора строк таблицы с курсами валют"""

        ROW_PREFIX = 'PtTable__StyledPtTableRow'
        tag_class: list[str] = tag.get('class')

        if tag_class is not None:
            if tag_class[0].startswith(ROW_PREFIX):
                return True

        return False

    async def request_rates(self) -> str:
        """Запрос данных с сайта YooMoney"""

        headers = self.headers.generate()
        async with ClientSession(headers=headers) as session:
            async with session.get(
                    'https://yoomoney.ru/account/exchange-rates') as response:
                return await response.text()

    async def get_rates(self) -> list[Rate]:
        """Возвращает список курсов с сайта YooMoney"""

        NAME = 0
        TICKER = 1
        ASK_RUBLES = 2
        ASK_KOPECK = 4
        SIGN = 8
        BID_RUBLES = 12
        BID_KOPECK = 14

        page_data = await self.request_rates()
        soup = BeautifulSoup(page_data, 'lxml')

        result = list()
        for el in soup.find_all(self.row_filter):
            rate_data = list(el.stripped_strings)
            try:
                result.append(
                    Rate(name=rate_data[NAME],
                         ticker=rate_data[TICKER],
                         sign=rate_data[SIGN],
                         ask=Decimal(
                             f'{rate_data[ASK_RUBLES]}.{rate_data[ASK_KOPECK]}'
                         ),
                         bid=Decimal(
                             f'{rate_data[BID_RUBLES]}.{rate_data[BID_KOPECK]}'
                         )))
            except IndexError:
                pass

        return result
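
A usage sketch: the class is written as an async context manager, so the rates can be fetched inside async with:

import asyncio

async def main():
    async with YooMoneyExchange() as exchange:
        for rate in await exchange.get_rates():
            print(rate)

if __name__ == "__main__":
    asyncio.run(main())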