Example #1
def getRSS(curso):
    """
    Downloads xml rss files from https://side.utad.pt
    Stores them into feeds/curso.xml
    :param curso: string
    :return: True
    """

    if debug: print("getRSS", curso)

    feedRSS = "https://side.utad.pt/rss.pl?" + curso
    feedFile = "feeds/" + curso + ".xml"

    if path.exists(feedFile): remove(feedFile)

    try:
        r = ProxyRequests(feedRSS)
        r.get()
        with open(feedFile, 'wb') as f:
            f.write(r.get_raw())
        if round(path.getsize(feedFile)) < 700:
            getRSS(curso)

    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout, requests.exceptions.ProxyError,
            urllib3.exceptions.MaxRetryError):
        getRSS(curso)
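
Example #1 retries by calling itself on every failure, with no bound. A minimal sketch of the same download with a capped retry loop instead; it assumes, as the original appears to, that path and remove come from os.path/os and that anything under ~700 bytes is a failed download.

import requests
import urllib3
from os import path, remove
from proxy_requests import ProxyRequests


def get_rss_bounded(curso, attempts=5):
    """Sketch: same download as getRSS, but with a bounded number of retries."""
    feed_rss = "https://side.utad.pt/rss.pl?" + curso
    feed_file = "feeds/" + curso + ".xml"
    if path.exists(feed_file):
        remove(feed_file)
    for _ in range(attempts):
        try:
            r = ProxyRequests(feed_rss)
            r.get()
            with open(feed_file, 'wb') as f:
                f.write(r.get_raw())
            # treat very small files as failed/placeholder downloads, like the original
            if path.getsize(feed_file) >= 700:
                return True
        except (requests.exceptions.RequestException,
                urllib3.exceptions.MaxRetryError):
            continue
    return False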
Example #2
def get_current():
    url = "https://acrnm.com"
    site = ProxyRequests(url)
    
    failures = 0

    while True:

        print("Checking if new products are on ACRNM on proxy: {}".format(site.proxy_used))
        if not site.get().ok:
            print("Proxy or website is unresponsive. Trying again...")
            failures += 1
            site.proxy_used = site.sockets.pop(0)
            continue
        else:
            failures = 0
        
        tree = html.fromstring(str(site))
        tree.make_links_absolute(url)

        prod_names = tree.xpath("//div[@class='name']/text()")
        prod_urls = tree.xpath("//a[contains(concat(' ', normalize-space(@class), ' '), ' tile ')]/@href")

        new, restock = db.new_items(prod_names, prod_urls)

        if new:
            new = list(zip(*new))
            notify(new[1], restock)
            db.insert_products(new[0])
        else:
            notify(new, restock)

        db.insert_current(prod_names, prod_urls)
Example #3
def test_post_with_headers(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers({'name': 'rootVIII', 'secret_message': '7Yufs9KIfj33d'})
    r.post_with_headers({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
Example #4
def test_get_with_headers():
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests('https://postman-echo.com/headers')
    r.set_headers(h)
    r.get_with_headers()
    assert r.get_status_code() == 200
    assert 'headers' in r.get_json()
    print(r.get_proxy_used())
Example #5
def parse_person():
    data = {}
    for person in models.Person.query.all():  #.filter_by(name_original=None)
        while True:
            try:
                r = ProxyRequests(f'{URL}{person.links}')
            except:
                break
            r.get()
            r.encoding = 'utf-8'
            text = r.request
            soup = BeautifulSoup(text, 'html.parser')
            if not soup.find('h1', {'itemprop': 'name'}):
                continue
            alternateName = soup.find('span', {'itemprop': 'alternateName'})
            if alternateName:
                person.name_original = alternateName.text
            else:
                person.name_original = person.name
            db.session.add(person)
            db.session.commit()

            list_career = []
            director = soup.find('a', {'href': '#director'})
            if director:
                egge = director.text.replace(' ', '')
                if not models.Career.query.filter_by(name=egge).first():
                    new_career = models.Career(name=egge)
                    db.session.add(new_career)
                    db.session.commit()
                    list_career.append(new_career)
                else:
                    list_career.append(
                        models.Career.query.filter_by(name=egge).first())

            actor = soup.find('a', {'href': '#actor'})
            if actor:
                egge = actor.text.replace(' ', '')
                if not models.Career.query.filter_by(name=egge).first():
                    new_career = models.Career(name=egge)
                    db.session.add(new_career)
                    db.session.commit()
                    list_career.append(new_career)
                else:
                    list_career.append(
                        models.Career.query.filter_by(name=egge).first())

            person.career.clear()
            for i in list_career:
                person.career.append(i)
            db.session.add(person)
            db.session.commit()
            break
Example #6
def fetch_with_proxy(url, headers):
    r = ProxyRequests(url)
    if headers:
        r.set_headers(headers)
        r.get_with_headers()
    else:
        r.get()

    status_code = r.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")

    return r.get_raw()
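
A short usage sketch for fetch_with_proxy above; the URL and header below are placeholders, not taken from the original.

# Hypothetical call: fetch the raw bytes of a page through a proxy with a custom
# User-Agent; pass headers=None (or {}) to skip the header branch entirely.
raw = fetch_with_proxy('https://api.ipify.org', {'User-Agent': 'Mozilla/5.0'})
print(raw.decode(errors='replace'))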
Example #7
def test_post_file(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
Example #8
def gather_info(url):
    list_of_user_agents = [
        'Mozilla/5.0', 'AppleWebKit/537.36', 'Chrome/79.0.3945.88',
        'Safari/537.36'
    ]
    stat_code = 0
    tag_info = {'url': url}

    try_count = 0
    # continue attempting up to 4 proxies
    for user_agent in list_of_user_agents:
        if stat_code != 200:
            try_count += 1

            headers = {
                "User-Agent": user_agent,
                "Accept":
                "text/html, application/xhtml+xml, application/xml; q = 0.9, image/webp,image/apng, */*;\
                q = 0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en; q = 0.9"
            }

            r = ProxyRequests(url)
            r.set_headers(headers)
            r.get_with_headers()
            source = r.get_raw()
            stat_code = r.get_status_code()

    # only treat it as a failure if every user agent was tried without a 200
    if stat_code != 200 and try_count == len(list_of_user_agents):
        tag_info['num_of_changed_files'] = -1
        tag_info['changed_paths'] = ['ERROR, CANNOT FULFILL REQUEST']
        tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
        tag_info['metrics'] = {
            'num_of_changed_files': 0,
            'changes': 0,
            'additions': 0,
            'deletions': 0
        }
        return tag_info

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = BeautifulSoup(source, 'lxml')

        metrics = get_changed_files_metrics(soup)
        tag_info['metrics'] = metrics

        count, changed_files = get_changed_files(soup)
        if count == 0:
            tag_info['changed_paths'] = ['NONE FOUND']
        else:
            tag_info['changed_paths'] = changed_files

        if count != tag_info['metrics']['num_of_changed_files']:
            tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            tag_info['error_found'] = 'NONE'
    return tag_info
Example #9
def cricket(mid):
    while True:
        try:
            r = ProxyRequests('http://mapps.cricbuzz.com/cbzios/match/' + mid +
                              '/leanback.json')
            r.get()
            a = str(r)
            data = json.loads(a)
            bat = data['bat_team']['name']
            bow = data['bow_team']['name']
            score = int(data["comm_lines"][0]["score"])
            wicket = int(data["comm_lines"][0]["wkts"])
            over = float(data['bat_team']['innings'][0]['overs'])
            detailed_score = data["comm_lines"][0]["score"] + "/" + data[
                "comm_lines"][0]["wkts"] + " " + data['bat_team']['innings'][
                    0]['overs']
            try:
                bowler = data['bowler'][0]['name']
                batname0 = data['batsman'][0]['name']
                batname1 = data['batsman'][1]['name']
                bat0score = data['batsman'][0]['r']
                bat1score = data['batsman'][1]['r']
                bat0ball = data['batsman'][0]['b']
                bat1ball = data['batsman'][1]['b']
                bowler = bow + ":" + data['bowler'][0]['name']
                batters = batname0 + "(" + bat0score + "-" + bat0ball + ")" + batname1 + "(" + bat1score + "-" + bat1ball + ")"
                detailed_score = bat + ":" + data["comm_lines"][0][
                    "score"] + "/" + data["comm_lines"][0][
                        "wkts"] + " " + data['bat_team']['innings'][0]['overs']
            except:
                print(
                    "An exception occurred fetching either batters or bowler")
            try:
                txt = bowler + " " + batters
                print(detailed_score + " " + txt)
                aio.send('message', detailed_score + " " + txt)
            except:
                print("An exception occurred sending")
        except:
            print("An exception occurred start")
        time.sleep(10)
        global stop_threads
        if stop_threads:
            print('Stopped Cricket')
            break
Example #10
def test_get():
    r = ProxyRequests('https://api.ipify.org')
    r.get()
    assert r.get_status_code() == 200
    try:
        inet_aton(r.__str__())
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(r.get_proxy_used())
Example #11
def prefetch():
    try:
        print("Pre-fetching")
        print(config.ur)
        r = ProxyRequests(config.ur)
        r.get()
        a=str(r)
        data=json.loads(a)
        config.series_name = data["series_name"]
        config.bat_team_name=data['bat_team']['name']
        config.twicket=int(data["comm_lines"][0]["wkts"])
        config.twicket=config.twicket+1
        config.tover=int(float(data['bat_team']['innings'][0]['overs']))
        config.tover=config.tover+1
        config.series_name="--"+config.series_name+"--"
        print(config.series_name+'\n'+config.bat_team_name)
    except:
        print("An exception occurred prefetching")
        time.sleep(5)
        prefetch()
Example #12
def listofMatches():
    try:
        url='http://mapps.cricbuzz.com/cbzios/match/livematches'
        r = ProxyRequests(url)
        r.get()
        a=str(r)
        data=json.loads(a)
        matches=[]
        match_id=[]
        for i in data['matches']:
            matches.append(i)
        for i in matches:
            t= time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(i['header']['start_time'])))
            match_id.append(i['match_id'])
            print(i['match_id']+' '+ t +' '+i['series_name'])
        return match_id[0]    
    except:
        print("An exception occurred auto updating mid")
        time.sleep(2)
        return listofMatches()
Example #13
def parse_links():
    page = 1
    last_page = 1
    data = {}

    while page <= last_page:
        r = ProxyRequests(f'{URL}/top/navigator/m_act[rating]/1%3A/order/rating/page/{page}/#results')
        r.get()
        r.encoding = 'utf-8'
        text = r.request
        soup = BeautifulSoup(text, 'html.parser')
        if last_page == 1:
            try:
                last_link = soup.find_all('li', {'class': 'arr'})[-1].find('a').get('href')
                last_page = int(re.findall(r'\d{2,}', last_link)[0])
            except:
                continue

        movie_link = soup.find_all('div', {'class': '_NO_HIGHLIGHT_'})
        if not movie_link:
            continue

        for i in movie_link:
            i_soup = BeautifulSoup(f'b{i}').find('div', {'class': 'name'}).find('a')
            i_text = i_soup.text
            i_link = i_soup.get('href')
            id_film = int(re.findall(r'\d{1,}', i_link)[1])
            if models.Film.query.filter_by(id_film=id_film).first() == None:
                film = models.Film(id_film=id_film, links=i_link, name=i_text)
                db.session.add(film)
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                    data[i_text] = {page:i_link}
                    continue

        page += 1
    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
Example #14
 def send(self):
     try:
         if not self.proxy:
             r = requests.post(self.url,
                               data=self.data,
                               headers=self.headers)
         elif self.proxy:
             r = ProxyRequests(self.url)
             r.set_headers(self.headers)
             r.post_with_headers(self.data)
         else:
             sys.stdout.write(
                 '\r[!] Ein Fehler ist aufgetreten                                                      \r\n'
             )
         if self.check in str(r) or self.check in r.text:
             self.success += 1
             sys.stdout.write(
                 '\r[+] Für %s abgestimmt                                                           \r\n'
                 % self.name)
         else:
             sys.stdout.write(
                 '\r[!] Ein Fehler ist aufgetreten                                                      \r\n'
             )
     except:
         sys.stdout.write(
             '\r[!] Ein Fehler ist aufgetreten                                                      \r\n'
         )
     self.trys += 1
Example #15
def thread_get_info(url):
    stat_code = 0
    this_tag_info = {}
    this_tag_info['url'] = url

    try_count = 0
    # continue collecting proxies for up to 10 tries
    while stat_code != 200:
        try_count += 1
        if try_count > 10:
            this_tag_info['num_changed_files'] = -1
            this_tag_info['changed_paths'] = ['NONE FOUND']
            this_tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
            return this_tag_info

        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept":
            "text/html, application/xhtml+xml, application/xml; q = 0.9, image/webp,image/apng, */*;\
            q = 0.8, application/signed-exchange; v = b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en; q = 0.9"
        }

        r = ProxyRequests(url)
        r.set_headers(headers)
        r.get_with_headers()
        source = r.get_raw()
        stat_code = r.get_status_code()

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = bs.BeautifulSoup(source, 'lxml')

        # get changed files info
        read_count = get_num_changed_files(soup)
        this_tag_info['num_changed_files'] = read_count

        count, changed_files = get_changed_files(soup)
        if count == 0:
            this_tag_info['changed_paths'] = ['NONE FOUND']
        else:
            this_tag_info['changed_paths'] = changed_files

        if count != read_count:
            this_tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            this_tag_info['error_found'] = 'OK'

    return this_tag_info
Example #16
    def __init__(self, query: str):
        """
        On initialization the request is made with the given headers, and its JSON
        response is parsed to back the remaining properties.
        :param query:
        """
        from urllib.parse import quote
        from proxy_requests import ProxyRequests
        import json

        headers = {"User-Agent": self.user_agent}

        req = ProxyRequests(self.RA_SEARCH.format(quote(
            query.encode("utf-8"))))
        req.set_headers(headers)
        req.get_with_headers()

        self.__response = json.loads(req.get_raw().decode())
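
The __init__ above is shown outside its class; a minimal sketch of a wrapper it could sit in, where RA_SEARCH and user_agent are assumed class attributes (their real values are not part of the snippet).

import json
from urllib.parse import quote

from proxy_requests import ProxyRequests


class RASearch:
    # Assumed placeholders: the real endpoint template and User-Agent string
    # are not shown in the original example.
    RA_SEARCH = 'https://example.com/api/search?q={}'
    user_agent = 'Mozilla/5.0'

    def __init__(self, query: str):
        headers = {"User-Agent": self.user_agent}
        req = ProxyRequests(self.RA_SEARCH.format(quote(query.encode("utf-8"))))
        req.set_headers(headers)
        req.get_with_headers()
        self.__response = json.loads(req.get_raw().decode())

    @property
    def response(self):
        """Parsed JSON body of the proxied search request."""
        return self.__response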
Example #17
    def crawl_img(image_row):
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
            #df_imgs = df_imgs.append(df_img)
            #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")

            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))
Example #18
def all_team_names(url_root):
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # print ip used
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # active franchise: tabs[0] bc two tables on url, then pd_read_html returns a list
    df_active = pd.read_html(tabs[0].prettify())[0]
    # filter to max years, which is the main franchise. Do you need this?

    # Extract all the hrefs for the active teams:
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links if "/teams/" in t["href"]
    }
    return team_names
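
A usage sketch for all_team_names; the root URL is a placeholder, since the real site is not named in the snippet.

# Hypothetical root URL: the function expects <url_root>/teams/ to serve a page
# whose first table lists active franchises with /teams/<ABBREV>/ links.
teams = all_team_names("https://www.example-stats-site.com")
for abbrev, full_name in teams.items():
    print(abbrev, full_name)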
Example #19
def codechecker(code):
        try:
                r = ProxyRequests("https://discordapp.com/api/v6/entitlements/gift-codes/$%s?with_application=false&with_subscription_plan=true" % (code))
                r.get()
                JsonResponse = r.get_json()
                Response = JsonResponse["message"]
                if Response == "Unknown Gift Code":
                        print(f"\x1b[31;1mInvalid Code {code}\n")
                        return
                if Response == "You are being rate limited.":
                        print("\x1b[31;1mYou Are Being Rate Limited.")
                        return
                else:
                        print(f"\x1b[31;1mFound Working Code {code} Site Response:{Response}\n")
                        # redeem the code through a proxy as well, using the
                        # library's header/POST helpers
                        response = ProxyRequests(f"https://discordapp.com/api/v6/entitlements/gift-codes/{code}/redeem")
                        response.set_headers({'authorization': token})
                        response.post_with_headers({"channel_id": str(message.channel.id)})
                        redeemedcode = str(response)
                        return
        except Exception as e:
                print(e)
                return
Example #20
def parse_films():
    engine = create_engine('sqlite:///:memory:', echo=True)

    data = {}
    for film in models.Film.query.filter_by(rating_kp=None).all():  #
        while True:
            try:
                r = ProxyRequests(f'{URL}{film.links}')
            except:
                break
            r.get()
            r.encoding = 'utf-8'
            text = r.request
            soup = BeautifulSoup(text, 'html.parser')
            genres = soup.find('span', {'itemprop': 'genre'})
            if genres:
                genres = genres.find_all('a')
                countrys = soup.find_all(
                    'div', {'style': 'position: relative'})[1].find_all('a')
                persons = soup.find_all('li', {'itemprop': 'actors'})
                for director in soup.find_all('td', {'itemprop': 'director'}):
                    persons.append(director)
                break
        list_genres = []
        for genre in genres:
            if not models.Genre.query.filter_by(name=genre.text).first():
                while True:
                    new_genre = models.Genre(name=genre.text)
                    db.session.add(new_genre)
                    try:
                        db.session.commit()
                        list_genres.append(new_genre)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                list_genres.append(
                    models.Genre.query.filter_by(name=genre.text).first())

        list_countrys = []
        for country in countrys:
            if not models.Country.query.filter_by(name=country.text).first():
                while True:
                    new_country = models.Country(name=country.text)
                    db.session.add(new_country)
                    try:
                        db.session.commit()
                        list_countrys.append(new_country)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                list_countrys.append(
                    models.Country.query.filter_by(name=country.text).first())

        list_person = []
        for person in persons:
            if person.find('a').text.replace(' ', '') == '...':
                break
            person_link = person.find('a').get('href')
            if not models.Person.query.filter_by(id_person_kp=int(
                    re.findall(r'\d{1,}', person_link)[0])).first():
                while True:
                    # person_link = person.find('a').get('href')
                    if models.Person.query.filter_by(id_person_kp=int(
                            re.findall(r'\d{1,}', person_link)[0])).first():
                        break
                    id_person_kp = int(re.findall(r'\d{1,}', person_link)[0])
                    new_person = models.Person(name=person.text,
                                               links=person_link,
                                               id_person_kp=id_person_kp)
                    db.session.add(new_person)
                    try:
                        db.session.commit()
                        list_person.append(new_person)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                if not models.Person.query.filter_by(id_person_kp=int(
                        re.findall(r'\d{1,}', person_link)
                    [0])).first() in list_person:
                    list_person.append(
                        models.Person.query.filter_by(id_person_kp=int(
                            re.findall(r'\d{1,}', person_link)[0])).first())

        # if not film.description:
        while True:
            try:
                film.name = soup.find('span', {
                    'class': 'moviename-title-wrapper'
                }).text
                film.name_original = film.name if not soup.find(
                    'span', {
                        'class': 'alternativeHeadline'
                    }).text else soup.find('span', {
                        'class': 'alternativeHeadline'
                    }).text
                film.description = soup.find('div', {
                    'itemprop': 'description'
                }).text.replace(chr(151), '-')
                film.rating_kp = float(
                    soup.find('span', {
                        'class': 'rating_ball'
                    }).text)
                film.rating_imdb = float(
                    re.findall(
                        r'[\d][^ ]+',
                        soup.find('div', {
                            'style':
                            'color:#999;font:100 11px tahoma, verdana'
                        }).text)[0])
                film.date_released = int(
                    soup.find('div', {
                        'style': 'position: relative'
                    }).find('a').text)
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                    continue
                film.genre.clear()
                film.country.clear()
                film.person.clear()

                while True:
                    for i in list_genres:
                        film.genre.append(i)
                    for i in list_countrys:
                        film.country.append(i)
                    for i in list_person:
                        film.person.append(i)
                    db.session.add(film)
                    try:
                        db.session.commit()
                        break
                    except Exception:
                        db.session.rollback()
                break

            except:
                db.session.rollback()
Example #21
 version3 = random.randint(0, 10)
 windows_64_or_not_randomize = random.randint(
     0, 1)
 fake_rv = random.randint(0, 100)
 if windows_64_or_not_randomize == 1:
     windows_x64_or_not_2 = "32"
     windows_x64_or_not = "86"
 else:
     windows_x64_or_not = "64"
 if f'{url}/' in url:
     h = {
         'User-Agent':
         f'mouseTor/{version1}.{version2}.{version3} (Windows NT {os_Version}.0; Win64; x86; rv:{fake_rv}.0) mouseTor Relay/A9H8G88F mouseTor/1.0.0'
     }
     r = ProxyRequests('http://' +
                       host_2[1] + "/" +
                       fileget[1])
     r.set_headers(h)
     data = r.get_with_headers()
     print(html2text.html2text(str(data)))
     break
 else:
     os_Version = random.randint(5, 10)
     h = {
         'User-Agent':
         f'mouseTor/{version1}.{version2}.{version3} (Windows NT {os_Version}.0; Win64 x86; rv:{fake_rv}.0) mouseTor Relay/A9H8G88F mouseTor/1.0.0'
     }
     r = ProxyRequests('http://' +
                       host_2[1])
     r.set_headers(h)
     r.get_with_headers()
Example #22
    def bs_scrap_price(self, shop_link, domain, price_tag_name,
                       price_attr_name, price_tag_name_2, price_attr_values,
                       title_tag_name, title_attr_name, title_attr_value):
        n = 3
        while n > 0:
            user_agents = [
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0)'
                ' Gecko/20100101 Firefox/70.0',
                'Mozilla/5.0 (X11; Linux x86_64) '
                'AppleWebKit/537.36 (KHTML, like Gecko)'
                'Ubuntu Chromium/77.0.3865.90 Chrome/77.0.3865.90'
                ' Safari/537.36', 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) '
                'Presto/2.12.388 Version/12.16'
            ]

            # random choose user agent to hide your bot for the site
            user_agent = random.choice(user_agents)
            header = {
                'User-Agent': user_agent,
                'Host': domain,
                'Accept': 'text/html,application/'
                'xhtml+xml,'
                'application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-us,en;q=0.5',
                'Accept-Encoding': 'gzip,deflate',
                'Accept-Charset': 'ISO-8859-1,'
                'utf-8;q=0.7,*;q=0.7',
                'Keep-Alive': '115',
                'Connection': 'keep-alive'
            }
            try:
                r = ProxyRequests(shop_link)
                r.set_headers(header)
                r.get_with_headers()
                res = str(r)
            except Exception as error:
                return False, error
            if r.get_status_code() == 404:  # handling 404 error exception
                error = 'The page was not found'
                return False, error

            # creating soup object of the source
            soup = bs4.BeautifulSoup(res, features="html.parser")
            price = product_title = None
            for price_attr_value in price_attr_values:
                # finding price on the page
                try:
                    if price_tag_name_2 == "":
                        price = str(
                            soup.find(
                                price_tag_name,
                                attrs={price_attr_name: price_attr_value}))
                    else:
                        price = str(
                            soup.find(price_tag_name,
                                      attrs={
                                          price_attr_name: price_attr_value
                                      }).find(price_tag_name_2))
                        print(price)
                    product_title = (soup.find(
                        title_tag_name, {
                            title_attr_name: title_attr_value
                        }).text.lstrip())
                except Exception:
                    pass
            # if price isn't None, break out of the while loop and continue
            if price != None and product_title != None:
                return price, product_title.lstrip()
            n -= 1
            time.sleep(random.randint(5, 10))

        return False, "Can't find price or product title on the web page"
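
A hypothetical call to bs_scrap_price to illustrate the parameters; every value is a placeholder, and scraper stands for an instance of whatever class the method belongs to.

# All arguments below are illustrative placeholders.
price, title = scraper.bs_scrap_price(
    shop_link='https://example-shop.com/product/123',
    domain='example-shop.com',
    price_tag_name='span',
    price_attr_name='class',
    price_tag_name_2='',             # empty: take the first matching tag as-is
    price_attr_values=['price', 'product-price'],
    title_tag_name='h1',
    title_attr_name='class',
    title_attr_value='product-title')
# On failure the method returns (False, <error message>) instead of (price, title).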
Example #23
 def __init__(self):
     self.proxy_requests = ProxyRequests()
Example #24
def test_post_file_with_headers(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers(h)
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file_with_headers()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
Example #25
def score():
    try:
        r = ProxyRequests(config.ur)
        r.get()
        a = str(r)
        data = json.loads(a)
        score = int(data["comm_lines"][0]["score"])
        wicket = int(data["comm_lines"][0]["wkts"])
        over = float(data['bat_team']['innings'][0]['overs'])
        detailed_score = config.bat_team_name + " " + data["comm_lines"][0][
            "score"] + "/" + data["comm_lines"][0]["wkts"] + " " + data[
                'bat_team']['innings'][0]['overs']
        print(detailed_score, end=" ")

        try:
            bowler = data['bowler'][0]['name']
            print("B:" + bowler)
            batname0 = data['batsman'][0]['name']
            batname1 = data['batsman'][1]['name']
            bat0score = data['batsman'][0]['r']
            bat1score = data['batsman'][1]['r']
            bat0ball = data['batsman'][0]['b']
            bat1ball = data['batsman'][1]['b']
            bowler = data['bowler'][0]['name']
            batters = batname0 + "*(" + bat0score + "-" + bat0ball + ") " + batname1 + "(" + bat1score + "-" + bat1ball + ")"
            print(batters)
            fputOnRdb(detailed_score + "     B: " + bowler + "\n" + batters +
                      "\nRecent:\n" + data['prev_overs'])
        except:
            print("An exception occurred fetching either batters or bowler")
        try:
            if (over == (config.tover - 1.0 + 0.5)):
                global bow
                bow = bowler
            if over == config.tover:
                prev_overs = data['prev_overs']
                prev_over = prev_overs.split('|')
                msg = detailed_score + " B:" + bow + "\n" + batters + "\n" + prev_over[
                    -1]
                print(msg)
                notify(msg)
                config.tover = config.tover + 1
                fbpush(msg)
                updateRegIds()
                time.sleep(10)
            if wicket == config.twicket:
                msg = "wicket " + str(
                    config.twicket
                ) + " " + data['last_wkt_name'] + " " + data[
                    'last_wkt_score'] + " B: " + bowler + "\n" + detailed_score
                fbpush(msg)
                notify(msg)
                config.twicket = config.twicket + 1
                time.sleep(15)
            if (int(over + 1) != config.tover):
                updateRegIds()
                prefetch()
            if ((wicket + 1) != config.twicket):
                updateRegIds()
                prefetch()
        except:
            print("An exception occurred while trying to notify")
    except:
        print("An exception occurred fetching score")
Example #26
def rotate_proxy(test_url=BASE_URL + '/version'):
    rotator = ProxyRequests(test_url)
    rotator.get()
    proxy = rotator.get_proxy_used()
    proxies = {'http': 'http://%s' % proxy, 'https': 'https://%s' % proxy}
    return proxies
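
With the return proxies added above, the dict matches the shape the requests library expects for its proxies argument; a minimal follow-up sketch reusing the BASE_URL from the example.

import requests

# Reuse the proxy that rotate_proxy() verified for a direct requests call.
resp = requests.get(BASE_URL + '/version', proxies=rotate_proxy(), timeout=10)
print(resp.status_code)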
Example #27
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace, e.g. "com", "de" or "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image that is not already crawled will be crawled.'
    )

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images


    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]

        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}

        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=np.object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" % (asin, j + 1,
                                                            number_images))

        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    test = 0
Example #28
def test_post(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.post({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
Example #29
from proxy_requests import ProxyRequests
from proxyValidator import ProxyValidator

# proxyInstance = ProxyValidator(['207.154.231.217:3128'])
# print(proxyInstance.validated_proxies)

r = ProxyRequests("https://api.ipify.org")
r.get()
print(r)  # get() stores the response; str(r) is the body (the caller's IP)
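
For reference, the accessors these examples rely on after a get(); a minimal continuation of the request above.

print(r.get_status_code())  # HTTP status of the proxied request
print(r.get_proxy_used())   # which proxy served it
print(str(r))               # response body, here the caller's public IP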
Example #30
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.14.1) Presto/2.12.388 Version/12.16',
    'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14'
]
user_agent = random.choice(user_agent_list)

#proxy_list = get_proxies()
#proxy =  random.choice(proxy_list)

#request = urllib.request.Request(url,headers={'User-Agent': user_agent})
#response = urllib.request.urlopen(request)

#response = requests.get(url,proxies={"http": proxy, "https": proxy},headers={'User-Agent': user_agent})

r = ProxyRequests(url)
r.get()
html = str(r)  #response.content

soup = BeautifulSoup(html, 'html.parser')
print(soup.contents[36].table.tr.next_sibling.next_sibling.next_sibling.
      next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.
      next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.
      next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.
      next_sibling.next_sibling.td.next_sibling.next_sibling.next_sibling.
      next_sibling.next_sibling.next_sibling.next_sibling.b.string)

#/html/body/table[3]/tbody/tr[1]/td/table/tbody/tr[7]/td/table/tbody/tr[11]/td[6]/b
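
The trailing comment is an XPath to the same cell; a sketch of the same lookup with lxml instead of the long next_sibling chain, assuming the path still matches the page (note that the tbody elements a browser inserts may not exist in the raw HTML).

from lxml import html as lxml_html

tree = lxml_html.fromstring(str(r))
# XPath from the comment above, with the browser-only tbody elements dropped.
cells = tree.xpath('/html/body/table[3]/tr[1]/td/table/tr[7]/td/table/tr[11]/td[6]/b/text()')
if cells:
    print(cells[0].strip())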