Exemplo n.º 1
0
def getRSS(curso):
    """
    Downloads xml rss files from https://side.utad.pt
    Stores them into feeds/curso.xml
    :param curso: string
    :return: True
    """

    if debug: print("getRSS", curso)

    feedRSS = "https://side.utad.pt/rss.pl?" + curso
    feedFile = "feeds/" + curso + ".xml"

    if path.exists(feedFile): remove(feedFile)

    try:
        r = ProxyRequests(feedRSS)
        r.get()
        with open(feedFile, 'wb') as f:
            f.write(r.get_raw())
        if round(path.getsize(feedFile)) < 700:
            getRSS(curso)

    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout, requests.exceptions.ProxyError,
            urllib3.exceptions.MaxRetryError):
        getRSS(curso)
Exemplo n.º 2
0
def gather_info(url):
    list_of_user_agents = [
        'Mozilla/5.0', 'AppleWebKit/537.36', 'Chrome/79.0.3945.88',
        'Safari/537.36'
    ]
    stat_code = 0
    tag_info = {'url': url}

    try_count = 0
    # continue attempting up to 4 proxies
    for user_agent in list_of_user_agents:
        if stat_code != 200:
            try_count += 1

            headers = {
                "User-Agent": user_agent,
                "Accept":
                "text/html, application/xhtml+xml, application/xml; q = 0.9, image/webp,image/apng, */*;\
                q = 0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en; q = 0.9"
            }

            r = ProxyRequests(url)
            r.set_headers(headers)
            r.get_with_headers()
            source = r.get_raw()
            stat_code = r.get_status_code()

    if try_count == len(list_of_user_agents):
        tag_info['num_of_changed_files'] = -1
        tag_info['changed_paths'] = ['ERROR, CANNOT FULFILL REQUEST']
        tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
        tag_info['metrics'] = {
            'num_of_changed_files': 0,
            'changes': 0,
            'additions': 0,
            'deletions': 0
        }
        return tag_info

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = BeautifulSoup(source, 'lxml')

        metrics = get_changed_files_metrics(soup)
        tag_info['metrics'] = metrics

        count, changed_files = get_changed_files(soup)
        if count == 0:
            tag_info['changed_paths'] = ['NONE FOUND']
        else:
            tag_info['changed_paths'] = changed_files

        if count != tag_info['metrics']['num_of_changed_files']:
            tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            tag_info['error_found'] = 'NONE'
    return tag_info
def fetch_with_proxy(url, headers):
    r = ProxyRequests(url)
    if headers:
        r.set_headers(headers)
        r.get_with_headers()
    else:
        r.get()

    status_code = r.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")

    return r.get_raw()
def thread_get_info(url):
    stat_code = 0
    this_tag_info = {}
    this_tag_info['url'] = url

    try_count = 0
    # continue collecting proxies for up to 10 tries
    while stat_code != 200:
        try_count += 1
        if try_count > 10:
            this_tag_info['num_changed_files'] = -1
            this_tag_info['changed_paths'] = ['NONE FOUND']
            this_tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
            return this_tag_info

        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept":
            "text/html, application/xhtml+xml, application/xml; q = 0.9, image/webp,image/apng, */*;\
            q = 0.8, application/signed-exchange; v = b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en; q = 0.9"
        }

        r = ProxyRequests(url)
        r.set_headers(headers)
        r.get_with_headers()
        source = r.get_raw()
        stat_code = r.get_status_code()

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = bs.BeautifulSoup(source, 'lxml')

        # get changed files info
        read_count = get_num_changed_files(soup)
        this_tag_info['num_changed_files'] = read_count

        count, changed_files = get_changed_files(soup)
        if count == 0:
            this_tag_info['changed_paths'] = ['NONE FOUND']
        else:
            this_tag_info['changed_paths'] = changed_files

        if count != read_count:
            this_tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            this_tag_info['error_found'] = 'OK'

    return this_tag_info
Exemplo n.º 5
0
    def __init__(self, query: str):
        """
        Na inicialização é realizada a requisição com as headers, e obtendo a resposta JSON da mesma
        para permitir as demais propriedades.
        :param query:
        """
        from urllib.parse import quote
        from proxy_requests import ProxyRequests
        import json

        headers = {"User-Agent": self.user_agent}

        req = ProxyRequests(self.RA_SEARCH.format(quote(
            query.encode("utf-8"))))
        req.set_headers(headers)
        req.get_with_headers()

        self.__response = json.loads(req.get_raw().decode())
Exemplo n.º 6
0
def all_team_names(url_root):
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # print ip used
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # active franchise: tabs[0] bc two tables on url, then pd_read_html returns a list
    df_active = pd.read_html(tabs[0].prettify())[0]
    # filter to max years, which is the main franchise. Do you need this?

    # Extract all the hrefs for the active teams:
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links if "/teams/" in t["href"]
    }
    return team_names
Exemplo n.º 7
0
    def crawl_img(image_row):
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
            #df_imgs = df_imgs.append(df_img)
            #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")

            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help=
        'Number of images that shoul be crawled. If 0, every image that is not already crawled will be crawled.'
    )

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get all arguments
    args = parser.parse_args()

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, evry image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]

        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}

        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=np.object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" (asin, j + 1,
                                                          number_images))

        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    test = 0