Example #1
def saveImage(url, path):
	try:
		image = urllib.URLopener()
		image.retrieve(url, path)
	except Exception:
		# iCount is assumed to be a module-level counter in the original project
		global iCount
		print("Error saving image")
		iCount -= 1
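
urllib.URLopener has been deprecated since Python 3.3. A minimal modern sketch of the same download, assuming the module-level iCount counter from the original snippet:

import urllib.request

iCount = 0  # assumed module-level download counter, mirroring the original snippet

def save_image(url, path):
    """Download url to path; decrement the counter if the download fails."""
    global iCount
    try:
        urllib.request.urlretrieve(url, path)
    except OSError:  # URLError and friends are subclasses of OSError
        print("Error saving image")
        iCount -= 1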
Example #2
def download_one(filename, expected_bytes, debug=0, gz=0):
    """
    Download a file if not present, and make sure it's the right size.
    Files are stored in \'data\' folder
    """
    filename = filename + ".gz"
    filepath = mnist_download_folder + filename

    if not os.path.exists(mnist_download_folder):
        os.makedirs(mnist_download_folder)

    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(mnist_url + filename, filepath)
        statinfo = os.stat(filepath)
        if statinfo.st_size == expected_bytes:
            if (debug):
                print("Found and verified", filepath)
        else:
            raise Exception(
                "Failed to verify " + filename +
                ". Can you get to it with a browser? \nDownload .gz files from http://yann.lecun.com/exdb/mnist/ and store in mnist_download folder"
            )
    else:
        print("Found and verified", filepath)

    return filepath
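
The snippet relies on module-level names that are not shown: ur (presumably urllib.request), mnist_download_folder and mnist_url. A minimal usage sketch under those assumptions:

import os
import urllib.request as ur  # the snippet refers to this module as `ur`

# Assumed module-level configuration, not part of the original example
mnist_download_folder = "data/"
mnist_url = "http://yann.lecun.com/exdb/mnist/"

# 9912422 bytes is the commonly cited size of train-images-idx3-ubyte.gz;
# verify against your own copy before relying on it.
train_images_path = download_one("train-images-idx3-ubyte", 9912422, debug=1)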
Example #3
def download_od_model():
    """
    Downloads a mobile model from the Tensorflow model zoo and prepares it for usage in
    Tensorflow Serving.
    """
    model_name = 'ssd_mobilenet_v2_coco_2018_03_29'
    fname = '{}.tar.gz'.format(model_name)
    url = "http://download.tensorflow.org/models/object_detection/{}".format(fname)
    mobile_dir = os.path.join(model_dir, model_name)

    if not os.path.exists(mobile_dir):
        os.mkdir(mobile_dir)
        file = urllib.URLopener()
        file.retrieve(url, fname)

        tar = tarfile.open(fname, "r:gz")
        tar.extractall('models')
        tar.close()
        os.remove(fname)

        checkpoint_dir = os.path.join(mobile_dir, '1')
        os.rename(os.path.join(mobile_dir, 'saved_model'), checkpoint_dir)
        shutil.move(os.path.join(mobile_dir, 'checkpoint'),
                    os.path.join(checkpoint_dir, 'checkpoint'))
        shutil.move(os.path.join(mobile_dir, 'frozen_inference_graph.pb'),
                    os.path.join(checkpoint_dir, 'frozen_inference_graph.pb'))
        shutil.move(os.path.join(mobile_dir, 'model.ckpt.data-00000-of-00001'),
                    os.path.join(checkpoint_dir, 'model.ckpt.data-00000-of-00001'))
        shutil.move(os.path.join(mobile_dir, 'model.ckpt.index'),
                    os.path.join(checkpoint_dir, 'model.ckpt.index'))
        shutil.move(os.path.join(mobile_dir, 'model.ckpt.meta'),
                    os.path.join(checkpoint_dir, 'model.ckpt.meta'))
        shutil.move(os.path.join(mobile_dir, 'pipeline.config'),
                    os.path.join(checkpoint_dir, 'pipeline.config'))
Example #4
def retrieve_pics(photos: list) -> list:
    """
    Downloads photo from absolute url retrieved from `get_all_info()`
    function. This also added file name details into PhotoObjects and
    returns same list with updated parameter :param photos: List of all
    PhotoObjects :return: Same list with `file_name' field updated
    """
    file_counter = 1  # Counter for files
    if not os.path.exists(photo_store_folder):
        # Create the folder if it does not exist
        os.makedirs(photo_store_folder)
    if not os.path.exists(winner_email_folder):
        # Create the folder if it does not exist
        os.makedirs(winner_email_folder)

    for p2 in photos:
        # Build the file name: besides the prefix, use a counter and a bit of
        # the title with all special characters removed
        filename = photo_prefix + "%d_%s" % (
            file_counter, re.sub('[^A-Za-z0-9]+', '', p2.title))
        # Truncate to 20 characters and add the file extension
        filename = filename[:20] + ".jpg"
        p2.file_name = filename  # Update file_name field in the PhotoObject
        testfile = request.URLopener()  # start downloading
        testfile.retrieve(p2.photo_url, photo_store_folder + filename)  # Save
        file_counter += 1

    return photos
Example #5
def main():

    while True:

        # Read stream
        urllib.URLopener().retrieve('https://s3.amazonaws.com/hctn/after.jpg',
                                    'after.jpg')
        img = cv.imread('after.jpg')
Example #6
 def stop(self):
     """Stops the server."""
     self.stop_serving = True
     try:
         # This is to force stop the server loop
         urllib_request.URLopener().open("http://%s:%d" % (self.host, self.port))
     except IOError:
         pass
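
The dummy request works because the serving thread typically blocks in handle_request()/accept() and only re-checks stop_serving between requests. A minimal sketch of such a loop (an illustration of the pattern, not the server this project actually uses):

import http.server

class StoppableServer:
    """Toy HTTP server whose loop checks a stop flag between requests."""

    def __init__(self, host="127.0.0.1", port=8000):
        self.host, self.port = host, port
        self.stop_serving = False
        self._httpd = http.server.HTTPServer(
            (host, port), http.server.SimpleHTTPRequestHandler)

    def serve(self):
        # handle_request() blocks until a connection arrives, so the flag is
        # only re-read after each request -- hence the dummy request in stop().
        while not self.stop_serving:
            self._httpd.handle_request()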
Example #7
def load(url, file_name, folder):

    #downloads file from url
    testfile = request.URLopener()
    testfile.retrieve(url, file_name)

    #un-zips file and puts contents in folder
    a = py7z_extractall.un7zip(file_name)
    a.extractall(folder)
Example #8
 def stop(self):
     self.stop_serving = True
     try:
         # This is to force stop the server loop
         urllib_request.URLopener().open('http://{}:{}'.format(
             self.host, self.port))
     except IOError:
         pass
     logging.info('Shutting down the webserver')
     self.thread.join()
Example #9
def download_story(media_url, save_path):
    if not os.path.exists(save_path):
        try:
            urllib.URLopener().retrieve(media_url, save_path)
            return True
        except Exception as e:
            log_warn("The story could not be downloaded: {:s}".format(str(e)))
            return "Error"
    else:
        return False
Example #10
 def stop(self):
     """Stops the server."""
     self.stop_serving = True
     try:
         # This is to force stop the server loop
         urllib_request.URLopener().open("http://%s:%d" % (self.host, self.port))
     except IOError:
         pass
     LOGGER.info("Shutting down the webserver")
     self.thread.join()
Example #11
def load_file(tpl: Tuple[Any, Any, Any]) -> None:
    branch, build_type, build_number = tpl
    file_path = os.path.join(tempdir(), 'argus', branch, build_type, '{}.json'.format(build_number))

    try:
        if not os.path.exists(file_path):
            request.URLopener().retrieve(
                '{}/job/{}-{}-{}/{}/testReport/api/json'.format(Config.JENKINS_URL,
                                                                Config.JENKINS_PROJECT, branch,
                                                                build_type, build_number),
                file_path)
    except IOError as e:
        print('Cannot download {}'.format(build_number))
        print(e)
Example #12
def img_download(object_id, obj_class, ra, dec):
    """
    Given right ascension and declination values, downloads an image from an SDSS
    mirror for Data Release 8. The image is saved within a folder for its class,
    under its object id.
    :param object_id: The Galaxy Zoo object ID for this image.
    :param obj_class: The galaxy class.
    :param ra: Right ascension value from the Galaxy Zoo .csv.
    :param dec: Declination value from the Galaxy Zoo .csv.
    :return:
    """
    url = 'http://skyservice.pha.jhu.edu/DR8/ImgCutout/getjpeg.aspx?ra={}&dec={}&scale=0.2&width=240&height=240&opt='\
        .format(ra, dec)
    outfile = gal_data_path + '{}/{}.jpg'.format(obj_class, object_id)
    image = request.URLopener()
    image.retrieve(url, outfile)
Example #13
def _download_one(filename):
    """
    Download a file if not present
    Default save path is "data/" folder
    """

    filepath = adult_download_folder + filename

    if not os.path.exists(adult_download_folder):
        os.makedirs(adult_download_folder)

    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(adult_url + filename, filepath)
Example #14
def _download_one(filename):
    """
    Download a file if not present
    Default save path is "data/" folder
    """

    filepath = health_download_folder + filename

    if not os.path.exists(health_download_folder):
        os.makedirs(health_download_folder)

    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(health_url + filename, filepath)
    else:
        print("Found and verified ", filepath)
Example #15
def pushtodatabase(bookdetails, name):
    pushothermodels('language', bookdetails['language'])
    pushothermodels('genre', bookdetails['genre'])
    pushothermodels('authors', bookdetails['authors'])

    bookdic = {}
    bookdic['title'] = bookdetails['title']
    bookdic['pageCount'] = bookdetails['pageCount']
    bookdic['isbn'] = bookdetails['isbn']
    bookdic['pages'] = bookdetails['pageCount']

    languageids = Language.objects.filter(
        name=bookdetails['language']).values('id')[0]['id']
    bookdic['language'] = str(languageids)

    genreid = Genre.objects.filter(
        name=bookdetails['genre']).values('id')[0]['id']
    bookdic['genre'] = [str(genreid)]

    authorlist = []
    for author in bookdetails['authors']:
        authorid = Author.objects.filter(
            name=author.capitalize()).values('id')[0]['id']
        authorlist.append(str(authorid))
    bookdic['author'] = authorlist

    bookfiles = {}
    # bookfiles['pdf'] = File(open(name,'rb'), os.path.basename(name))
    bookfiles['pdf'] = File(open(name, 'rb'),
                            bookdic['title'] + os.path.splitext(name)[1])
    bookfiles['epub'] = File(open(name, 'rb'),
                             bookdic['title'] + os.path.splitext(name)[1])

    testfile = request.URLopener()
    coverimagename = bookdetails['title'] + '.jpg'
    testfile.retrieve(bookdetails['image'], coverimagename)
    bookfiles['cover'] = File(open(coverimagename, 'rb'),
                              coverimagename.replace('pdfbooks/', ''))

    form = BookForm(bookdic, bookfiles)
    if form.is_valid():
        form.save()
    else:
        print(form.errors)
    os.remove(coverimagename)
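
The function depends on Django models, forms and helpers defined elsewhere in the project. A plausible set of imports it assumes (an approximation, not shown in the original):

import os
import urllib.request as request

from django.core.files import File  # wraps a file object for Django file fields

# Project-specific names the snippet expects to exist (module paths are guesses):
# from library.models import Language, Genre, Author
# from library.forms import BookForm
# pushothermodels() is a project helper that creates the related rows if missing.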
Example #16
def download_oid_gt():
    """
    Download groundtruth data from the Open Images Dataset
    """
    url = 'https://storage.googleapis.com/openimages/2018_04/'

    url_to_train_gt = os.path.join('train', train_gt_file)
    url_to_test_gt = os.path.join('test', test_gt_file)
    url_to_val_gt = os.path.join('validation', val_gt_file)

    file = urllib.URLopener()

    if not os.path.exists(path_to_test_gt_file):
        file.retrieve(os.path.join(url, url_to_test_gt), path_to_test_gt_file)
    if not os.path.exists(path_to_val_gt_file):
        file.retrieve(os.path.join(url, url_to_val_gt), path_to_val_gt_file)
    if not os.path.exists(path_to_train_gt_file):
        file.retrieve(os.path.join(url, url_to_train_gt),
                      path_to_train_gt_file)
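
Note that os.path.join is used above to build URLs; on Windows it would insert backslashes. A sketch of the same URLs built with posixpath.join, which always uses forward slashes (a substitution, not part of the original):

import posixpath

url = 'https://storage.googleapis.com/openimages/2018_04/'

# train_gt_file, test_gt_file and val_gt_file are the snippet's module-level names
url_to_train_gt = posixpath.join(url, 'train', train_gt_file)
url_to_test_gt = posixpath.join(url, 'test', test_gt_file)
url_to_val_gt = posixpath.join(url, 'validation', val_gt_file)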
Example #17
def find_and_convert(root, srch_str):
    """
    Searches google images for the srch_str provided, selects one at random, 
    converts it into .ico format, saves it to the appropriate directory
    and returns the file.
    """
    query_str = '+'.join(srch_str.split())
    url = "https://www.google.co.in/search?q=" + query_str + \
          "&source=lnt&tbm=isch&tbs=isz:ex,iszw:256,iszh:256"
    req = urllib2.Request(url, headers=REQUEST_HEADER)
    soup = BeautifulSoup(
        urllib2.urlopen(req, timeout=200).read(), 'html.parser')

    img_arr = []
    for a_tag in soup.find_all("div", {"class": "rg_meta"}):
        img_link = json.loads(a_tag.text)["ou"]
        try:
            opener = urllib2.URLopener()
            opener.addheaders = [('User-Agent', REQUEST_HEADER['User-Agent']),
                                 ('Accept', REQUEST_HEADER['Accept']),
                                 ('Accept-Language',
                                  REQUEST_HEADER['Accept-Language']),
                                 ('Connection', REQUEST_HEADER['Connection'])]
            opener.retrieve(img_link)
        except urllib2.HTTPError:
            # urllib2.HTTPError: HTTP Error 403: Forbidden
            continue

        img_arr.append(img_link)
        if len(img_arr) == IMAGE_DOWNLOAD_LIMIT:
            break

    img_choice = random.choice(img_arr) if (len(img_arr) != 0) else None
    if not img_choice:
        return ""

    req = urllib2.Request(img_choice, headers=REQUEST_HEADER)
    img = Image.open(io.BytesIO(urllib2.urlopen(req, timeout=200).read()))
    img = img.convert("RGB")
    ico_file_name = os.path.join(os.path.join(root, srch_str),
                                 srch_str + ".ico")
    img.save(ico_file_name)
    return ico_file_name
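
The snippet assumes module-level REQUEST_HEADER and IMAGE_DOWNLOAD_LIMIT constants, plus urllib2 (which on Python 3 corresponds to urllib.request). A plausible definition, given purely as an assumption:

# Hypothetical values; the original project defines these elsewhere.
REQUEST_HEADER = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}
IMAGE_DOWNLOAD_LIMIT = 10  # stop collecting candidate image links after this many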
Example #18
def download_data_external(URL, data_format):
    """ Download external data to the local machine.

    Args:
        URL(str): the URL that points to the external data.
        data_format(str): the archive format the user specified at startup, needed for decompression.

    Returns:
        None; the file is downloaded to the local disk.
    """
    connector = urllib.URLopener()
    if data_format == 'gz':
        connector.retrieve(URL, './cvdata.gz')
    elif data_format == 'zip':
        connector.retrieve(URL, './cvdata.zip')
    elif data_format == 'tar':
        connector.retrieve(URL, './cvdata.tar')
    elif data_format == 'uncompressed':
        connector.retrieve(URL, './cvdata/')
Example #19
def scraping(url):
	r = requests.get(url)
	soup = BeautifulSoup(r.text,'html.parser')
	
	for table in soup.find_all('table', attrs={'class':'detail-text'}):
		for tr in table.find_all('tr'):
			try:
				if tr.find_all('td')[1].find('a'):
				#if tr.find_all('td')[1].find('a', attrs={'href':re.compile('MIDIFiles')}):			
					link = tr.find_all('td')[1].find('a').get('href')
					year = url[-8:-4]
					filename = link.split('/')[-1]
					title = tr.find_all('td')[1].text
					composer = tr.find_all('td')[0].text
					print(filename, composer, year, title)
					downloadfile = ur.URLopener()
					downloadfile.retrieve('http://www.piano-e-competition.com'+link, filename)
					f.writerow([filename, composer, year, title])
			except Exception:
				# Skip rows that do not have the expected table structure
				continue
Example #20
def lambda_handler(event, context):
    file = url.URLopener()
    try:
        file.retrieve("http://www.bogc.dnrc.mt.gov/production/historical.zip",
                      TMP_FILE)

        with ZipFile(TMP_FILE) as zip:
            file_leaseProd = zip.read('histLeaseProd.tab')
            file_wellProd = zip.read('histprodwell.tab')
            file_wellData = zip.read('histWellData.tab')

            s3 = boto3.resource('s3')
            s3.Bucket(BUCKET_NAME).put_object(Key='MT_leaseProd.tab',
                                              Body=file_leaseProd)
            s3.Bucket(BUCKET_NAME).put_object(Key='MT_wellProd.tab',
                                              Body=file_wellProd)
            s3.Bucket(BUCKET_NAME).put_object(Key='MT_wellData.tab',
                                              Body=file_wellData)
    except Exception as e:
        print(e)
        raise e
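
TMP_FILE and BUCKET_NAME are module-level constants not shown here, and url is evidently an alias for urllib.request. A plausible module header for this handler (assumptions only):

import urllib.request as url
from zipfile import ZipFile

import boto3

# Hypothetical values; AWS Lambda only allows writes under /tmp.
TMP_FILE = '/tmp/historical.zip'
BUCKET_NAME = 'my-production-data-bucket'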
Example #21
def load(url, file_name, folder):
    """ Download archive for a StackExchange site and unzip it,
    skipping either or both if the necessary tables are already available """
    # Need special case for Stack Overflow (more than one 7z file)

    if not os.path.isfile(file_name):
        #downloads file from url; two url patterns are attempted
        testfile = request.URLopener()
        try:
            testfile.retrieve(url[0], file_name)
        except error.HTTPError as e:
            try:
                testfile.retrieve(url[1], file_name)
            except Exception:
                print("Error: URL retrieval of " + url[0] + " and " + url[1] + " failed for reason: " + e.reason)
                quit()

    #un-zips file and puts contents in folder
    a = py7z_extractall.un7zip(file_name)
    if not (os.path.isfile(os.path.join(folder, "PostLinks.xml")) and os.path.isfile(os.path.join(folder, "Posts.xml"))):
        a.extractall(folder)
Example #22
def retrieveLidar(x1, y1, x2, y2):
    xmin = min(x1, x2)
    ymin = min(y1, y2)

    xmax = max(x1, x2)
    ymax = max(y1, y2)

    lastblok = 10

    for x in range(xmin, xmax + 1):
        for y in range(ymin, ymax + 1):
            print(x, y)
            b = lastblok
            # http://gis.arso.gov.si/lidar/otr/laz/b_22/D48GK/GKR_504_107.laz
            urladdr = "http://gis.arso.gov.si/lidar/otr/laz/b_{2}/D48GK/GKR_{0}_{1}.laz".format(
                x, y, b)
            if urlExists(urladdr):
                urlE = True
                print("Found!")
            else:
                b = 9
                urlE = False

                while b < 100 and not urlE:
                    b = b + 1
                    urladdr = "http://gis.arso.gov.si/lidar/otr/laz/b_{2}/D48GK/GKR_{0}_{1}.laz".format(
                        x, y, b)
                    urlE = urlExists(urladdr)

                if urlE:
                    lastblok = b
                    print("Found!")
                else:
                    print("\n\n\nFile NOT FOUND!\n\n\n")

            # download
            if urlE:
                print("Retrieving file ...")
                downfile = url.URLopener()
                downfile.retrieve(urladdr, "raw/GK_{0}_{1}.laz".format(x, y))
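
urlExists is a helper that is not shown (and url here is presumably urllib.request). A minimal sketch of what it might look like, as an assumption:

import urllib.request as url
import urllib.error

def urlExists(urladdr):
    """Return True if the URL can be opened without an HTTP/URL error."""
    try:
        with url.urlopen(urladdr, timeout=10) as resp:
            return resp.status < 400
    except urllib.error.URLError:
        return False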
Example #23
def retrieve_pics(photos: list) -> list:
    """
    Downloads photo from absolute url retrieved from `get_all_info()`
    function. This also added file name details into PhotoObjects and
    returns same list with updated parameter :param photos: List of all
    PhotoObjects :return: Same list with `file_name' field updated
    """
    file_counter = 1  # Counter for files
    if not os.path.exists(photo_store_folder):
        # Create the folder if it does not exist
        os.makedirs(photo_store_folder)
    if not os.path.exists(winner_email_folder):
        # Create the folder if it does not exist
        os.makedirs(winner_email_folder)

    for p2 in photos:
        # Build the file name: besides the prefix, use a zero-padded counter
        # and a bit of the title with all special characters removed
        count_text = str(file_counter)
        if len(count_text) == 1:
            count_text = "0" + count_text
        filename = photo_prefix + "%s_%s" % (
            count_text, re.sub('[^A-Za-z0-9]+', '', p2.title))
        # Truncate to 20 characters and add the file extension
        filename = filename[:20] + ".jpg"

        # If for some reason the file does not get downloaded, warn and continue.
        # Required for Hippo integration.
        try:
            p2.file_name = filename  # Update file_name field in the PhotoObject
            testfile = request.URLopener()  # start downloading
            testfile.retrieve(p2.photo_url,
                              photo_store_folder + filename)  # Save
            file_counter += 1
        except Exception as e:
            logging.warning("Failed to download %s due to %s" % (p2.photo_url, e))

    return photos
Example #24
import urllib.request as urllib
import os
from PIL import Image
number = 0
with open("imagenet.synset.txt") as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
os.chdir('./file')
for i in content:
    number = number + 1
    name = str(number) + ".jpg"
    try:
        url = str(i)
        image = urllib.URLopener()
        image.retrieve(url, name)
        im = Image.open(name)
        im.save(name, dpi=(600, 600))
    except IOError:
        continue
Example #25
 def _download(self, fpath, link):
     print("Downloading from '{}' to '{}'".format(link,fpath))
     urllib.URLopener().retrieve(link, fpath)
Example #26
def extractincidents(List):

    data = ur.urlopen(List[0])
    testfile = ur.URLopener()
    testfile.retrieve(List[0], "file.pdf")
    # Create the df for allArrests...we just want the structure of whatever is in there,
    # so make a df with that form and then delete all the data in it
    df = read_pdf("file.pdf", flavor = 'stream', columns=['112,162,241,342,425,465,525,570,599,634,685'],split_text=True,pages='1')
    allArrests = df[0].df
    #header = allArrests.iloc[2,:]
    #header[5] = 'Arestee Birthday'
    header = ['arrest_time','case_number','arrest_location','offense','arrestee_name','arrestee_birthday','arrestee_address','City','State','Zip','status','officer']
    allArrests.drop(allArrests.index, inplace=True)
#Loop through all URLs found in the list 
    for urlNum in range(0,len(List),1):
#    for urlNum in range(0,1,1):

    #Open a URL in the list of URLs
        data = ur.urlopen(List[urlNum])
        testfile = ur.URLopener()
        testfile.retrieve(List[urlNum], "file.pdf")
    #Create a temporary file for pdfReader
        fp = tempfile.TemporaryFile()
        fp.write(data.read())
        fp.seek(0)
    #Extract the number of pages in the PDF
        pdfReader = PdfFileReader(fp)
        pages = PdfFileReader(fp).getNumPages()

    #An alternative method of parsing the PDF is below
    #    page1 = pdfReader.getPage(0).extractText()

    #    content = ""

    #    for i in range (0, pdfReader.getNumPages()):
    #        extractedText = pdfReader.getPage(i).extractText()
    #        content += extractedText + "\n"

    #    content = " ".join(content.replace("\xa0", " ").strip().split())

        for pageNum in range(1,pages+1,1):
# Use CAMELOT to parse the table. In this case the parsing isn't perfect, however: there are no lines in the table for the lattice method to look for, and text spans multiple rows.
# And it gets worse! CAMELOT attempts to define rows by looking for consistent edges in the text, and the tables given run text so close together that CAMELOT
# thinks they're one column. So, visual debugging was done by extracting column pixel positions from a plot.
            df = read_pdf("file.pdf", flavor = 'stream', columns=['112,162,241,342,425,465,525,570,599,634,685'],split_text=True,pages=str(pageNum))
            print('Now parsing page: ' + str(pageNum) + ' on PDF number: ' + str(urlNum+1) + ' out of ' + str(len(List)))
        
        #CAMELOT does a good job, but returns spanning text above and below the record. The following code:
        #*finds the blanks in a record
        #*fills the blanks with the record above and below
        #*continues until there are no more inappropriate NAs
            nadf = df[0].df.replace('',np.nan)
            tocat = np.where(nadf.notna().iloc[:,0])[0]
            for i in tocat[::-1]:
                if (i+1 in df[0].df.index and i-1 in df[0].df.index and nadf.notna().iloc[i+1,0]==False and nadf.notna().iloc[i-1,0]==False):
                    #concatenate above and below
                    df[0].df.iloc[i,:]=df[0].df.iloc[i-1,:]+' '+df[0].df.iloc[i,:]+df[0].df.iloc[i+1,:]
                    #Strip off unnecessary spaces
                    df[0].df.iloc[i,:]=df[0].df.iloc[i,:].map(lambda x: x.strip())
                    #Remove the concatenated rows
                    df[0].df=df[0].df.drop([i-1,i+1],axis=0)

            df[0].df = df[0].df.reset_index(drop=True)
        #Second loop for more blank lines
            nadf = df[0].df.replace('',np.nan)
            tocat = np.where(nadf.notna().iloc[:,0])[0]
            for i in tocat[::-1]:
                if (i+1 in df[0].df.index and i-1 in df[0].df.index and nadf.notna().iloc[i+1,0]==False and nadf.notna().iloc[i-1,0]==False):
                    df[0].df.iloc[i,:]=df[0].df.iloc[i-1,:]+' '+df[0].df.iloc[i,:]+' '+df[0].df.iloc[i+1,:]
                    df[0].df.iloc[i,:]=df[0].df.iloc[i,:].map(lambda x: x.strip())
                    df[0].df=df[0].df.drop([i-1,i+1],axis=0)

            df[0].df = df[0].df.reset_index(drop=True)

        #Drop the header and any rows that don't contain data
            nadf = df[0].df.replace('',np.nan)
            tocat = np.where(nadf.isna().iloc[:,0])[0]
            for i in tocat[::-1]:
                    df[0].df=df[0].df.drop([i],axis=0)

            df[0].df = df[0].df[~df[0].df[0].str.contains('Arrest')]
            df[0].df = df[0].df.reset_index(drop=True)

        #Append the current page to the growing dataframe of all arrests held at the Norman splash page
            allArrests = allArrests.append(df[0].df)
    #Format the columns to match what's required in the key for the SQLite database
    allArrests.columns = header
    allArrests['arrestee_address'] = allArrests['arrestee_address'] + ' ' + allArrests['City'] + ' ' + allArrests['State'] + ' ' + allArrests['Zip']
    allArrests['arrestee_address'] = allArrests['arrestee_address'].map(lambda x: x.strip())
    allArrests.drop(['City','State','Zip'], axis=1, inplace = True)
#Prepend the header to the dataframe of all arrests at the Norman splash page
#    allArrests = allArrests.reset_index(drop = True)
#    allArrests.loc[-1] = header
#    allArrests.index = allArrests.index + 1
#    allArrests = allArrests.sort_index()

#Output every arrest on the Norman police page as a CSV; used for testing and debugging
    allArrests.to_csv("file.csv")

    return allArrests
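
The snippet leans on several module-level imports that are not shown. A plausible set (assumptions; note that DataFrame.append, used above, was removed in pandas 2.0, so an older pandas is implied):

import tempfile

import numpy as np
import urllib.request as ur
from camelot import read_pdf        # camelot-py table extraction ('stream' flavor)
from PyPDF2 import PdfFileReader    # page counting, old PyPDF2 (< 3.0) API
# pandas is used implicitly through the DataFrames camelot returns in Table.df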
Example #27
import urllib.request as ur
import os

testfile = ur.URLopener()
#import httplib
read_file = open('wanted.txt', 'r')
raw_lists = read_file.read()
urls = []

read_splited = raw_lists.split('<')
#print(read_splited)
for i in range(1, len(read_splited)):
    urls.append("http://222.236.46.45" + read_splited[i].split('>')[0])

#headers = {'User-agent': 'Python'}
#conn = httplib.HTTPConnection('222.236.46.45')
drout = 'Downloads'
if not os.path.exists(drout):
    os.makedirs(drout)

k = 0
while (k < len(urls)):
    try:  # This enables us to try downloading again if temporary network error occurs.
        temp = urls[k].split('/')
        filename = urls[k].split('/')[len(temp) - 1]
        print('Downloading ' + filename + '...')
        # conn.request('GET', urls[k], '', headers)
        # resp = conn.getresponse()
        # image = resp.read()
        # f = open('Downloads/' + filename, 'wb')
        # f.write(image)
Example #28
def get_media_story(user_to_check, user_id, ig_client):
    try:
        try:
            feed = ig_client.user_story_feed(user_id)
        except Exception as e:
            print("[W] An error occurred: " + str(e))
            exit(1)

        try:
            feed_json = feed['reel']['items']
        except TypeError as e:
            print("[I] There are no recent stories to process for this user.")
            return

        list_video = []
        list_image = []

        list_video_new = []
        list_image_new = []

        for media in feed_json:
            if 'video_versions' in media:
                list_video.append(media['video_versions'][0]['url'])
            if 'image_versions2' in media:
                list_image.append(
                    media['image_versions2']['candidates'][0]['url'])

        for video in list_video:
            filename = video.split('/')[-1]
            final_filename = filename.split('.')[0] + ".mp4"
            save_path = os.getcwd() + "/stories/{}/".format(
                user_to_check) + final_filename
            if not os.path.exists(save_path):
                print("[I] Downloading video: {:s}".format(final_filename))
                try:
                    urllib.URLopener().retrieve(video, save_path)
                    list_video_new.append(save_path)
                except Exception as e:
                    print("[W] An error occurred: " + str(e))
                    exit(1)
            else:
                print("[I] Story already exists: {:s}".format(final_filename))

        for image in list_image:
            filename = (image.split('/')[-1]).split('?', 1)[0]
            final_filename = filename.split('.')[0] + ".jpg"
            save_path = os.getcwd() + "/stories/{}/".format(
                user_to_check) + final_filename
            if not os.path.exists(save_path):
                print("[I] Downloading image: {:s}".format(final_filename))
                try:
                    urllib.URLopener().retrieve(image, save_path)
                    list_image_new.append(save_path)
                except Exception as e:
                    print("[W] An error occurred: " + str(e))
                    exit(1)
            else:
                print("[I] Story already exists: {:s}".format(final_filename))

        if (len(list_image_new) != 0) or (len(list_video_new) != 0):
            print('-' * 70)
            print("[I] Story downloading ended with " +
                  str(len(list_image_new)) + " new images and " +
                  str(len(list_video_new)) + " new videos downloaded.")
        else:
            print('-' * 70)
            print("[I] No new stories were downloaded.")
    except Exception as e:
        print("[E] An error occurred: " + str(e))
        exit(1)
    except KeyboardInterrupt as e:
        print("[I] User aborted download.")
        exit(1)
Example #29
    def scrape(self, limit=-1, start=0):
        self.offset = start
        t0 = time.time()
        # url = self.url +"&max_results="+str(limit)+"&start="+str(start)
        url = self.url
        
        sys.stdout.flush()
        ds = []
        k = 0

        while True:
            sys.stdout.flush()
            try:
                if time.time() - t0 > 60:
                    print("socket timed out")
                    raise RuntimeError("socket timed out")
                    # return []
                print("fetching: ", start, "/", limit, url, "proxy:", self.proxy, self.proxy_protocol)
                # req = urlrequest.Request(url)
                # if self.proxy is not None and self.proxy is not "":
                    # req.set_proxy(self.proxy, self.proxy_protocol)
                # response = urlrequest.urlopen(req)
                response = urlrequest.URLopener(proxies=self.proxies).open(url)
                # urllib.urlopen(url, proxies=self.proxies)
                # response = urlopen(url)
            except socket.error as e:
                print("socker error, retrying...")
                time.sleep(2)
                continue
            except HTTPError as e:
                if e.code == 503:
                    to = int(e.hdrs.get('retry-after', 30))
                    print('Got 503. Retrying after {0:d} seconds.'.format(to))
                    time.sleep(to)
                    continue
                else:
                    raise

            xml = response.read()
            root = ET.fromstring(xml)
            hasError = root.findall("error")
            # print("has error? "+str(len(hasError)))
            if len(hasError) > 0:
                print("has error: "+xml.decode("utf-8"))
                raise Exception("error xml")
            # print("xml:"+xml.decode("utf-8"))

            records = root.findall(OAI + 'ListRecords/' + OAI + 'record')
            print("records: ", len(records), "k", k)
            sys.stdout.flush()
            if k <= start+len(records):
                for record in records:
                    meta = record.find(OAI + 'metadata').find(ARXIV + 'arXiv')
                    record = Record(meta).output()
                    if k >= start and (limit == -1 or k < start+limit):
                        if self.append_all:
                            ds.append(record)
                        else:
                            save_record = False
                            for key in self.keys:
                                for word in self.filters[key]:
                                    if word.lower() in record[key]:
                                        save_record = True

                            if save_record:
                                ds.append(record)
                    k +=1
                    if limit >= 0 and k >= start+limit:
                        break# skip after max reached
                
                listRecords = root.find(OAI + 'ListRecords')
                if listRecords is None:
                    print("ListRecords not found", xml.decode("utf-8"))
                    sys.stdout.flush()
                    return ds
            else:
                print("skipping", len(records))
                k += len(records)# skipped

            if limit >= 0 and k + 1 > start+limit:
                print("reached limit", k+1, start+limit)
                sys.stdout.flush()
                break

            token = listRecords.find(OAI + 'resumptionToken')
            if token is None or token.text is None:
                self.nextUrl = ""
                break
            else:
                url = BASE + 'resumptionToken=%s' % token.text
                self.nextUrl = url

            if k >= start:
                break# use next to continue
        
        self.offset += k
        # end while
        t1 = time.time()
        print('fetching is completes in {0:.1f} seconds.'.format(t1 - t0), "offset:", self.offset)
        sys.stdout.flush()
        return ds
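
URLopener(proxies=...) expects a dictionary mapping scheme names to proxy URLs, the same shape urllib.request.getproxies() returns. A sketch of how self.proxies might be derived from self.proxy and self.proxy_protocol in this class's constructor (an assumption, not shown in the original):

class ScraperConfig:
    """Hypothetical illustration of how the proxies mapping could be built."""

    def __init__(self, url, proxy=None, proxy_protocol='http'):
        self.url = url
        self.proxy = proxy
        self.proxy_protocol = proxy_protocol
        # e.g. {'http': 'http://10.0.0.1:3128'}; an empty dict means "no proxy"
        self.proxies = {proxy_protocol: proxy} if proxy else {}
        self.nextUrl = ""
        self.offset = 0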
Example #30
    def next(self, limit=-1):
        t0 = time.time()
        sys.stdout.flush()
        ds = []
        k = 0

        while True:
            print("continue fetch: ", self.offset, " for ", limit, self.nextUrl, "proxy:", self.proxy, self.proxy_protocol)
            sys.stdout.flush()
            try:
                if time.time() - t0 > 60:
                    print("socket timed out")
                    raise RuntimeError("socket timed out")
                # req = urlrequest.Request(self.nextUrl)
                # if self.proxy is not None and self.proxy is not "":
                #     req.set_proxy(self.proxy, self.proxy_protocol)
                # response = urlrequest.urlopen(req)
                # response = requests.get(self.nextUrl, proxies=self.proxies)
                # response = urlrequest.urlopen(self.nextUrl, proxies=self.proxies)
                response = urlrequest.URLopener(proxies=self.proxies).open(self.nextUrl)

                # response = urlopen(self.nextUrl)
                break
            except HTTPError as e:
                if e.code == 503:
                    to = int(e.hdrs.get('retry-after', 30))
                    print('Got 503. Retrying after {0:d} seconds.'.format(to))
                    time.sleep(to)
                    continue
                else:
                    raise

        xml = response.read()
        root = ET.fromstring(xml)
        hasError = root.findall("error")
        # print("has error? "+str(len(hasError)))
        if len(hasError) > 0:
            print("has error: "+xml.decode("utf-8"))
            raise Exception("error xml")
        # print("xml:"+xml.decode("utf-8"))

        records = root.findall(OAI + 'ListRecords/' + OAI + 'record')
        print("records: ", len(records), "k", k)
        sys.stdout.flush()
        for record in records:
            meta = record.find(OAI + 'metadata').find(ARXIV + 'arXiv')
            record = Record(meta).output()
            if (limit == -1 or k < limit):
                if self.append_all:
                    ds.append(record)
                else:
                    save_record = False
                    for key in self.keys:
                        for word in self.filters[key]:
                            if word.lower() in record[key]:
                                save_record = True

                    if save_record:
                        ds.append(record)
            k +=1
            if limit >= 0 and k >= limit:
                break# skip after max reached
        
        listRecords = root.find(OAI + 'ListRecords')
        if listRecords is None:
            print("ListRecords not found", xml.decode("utf-8"))
            sys.stdout.flush()
            return ds

        if limit >= 0 and k + 1 > limit:
            print("reached limit", k+1, limit)
            sys.stdout.flush()
            self.nextUrl = ""
        else:
            print("getting next token")
            token = listRecords.find(OAI + 'resumptionToken')
            if token is None or token.text is None:
                self.nextUrl = ""
            else:
                url = BASE + 'resumptionToken=%s' % token.text
                self.nextUrl = url

        print("next size: ", len(ds))
        self.offset += k
        # end while
        t1 = time.time()
        print('next completes in {0:.1f} seconds, offset: '.format(t1 - t0), self.offset)
        sys.stdout.flush()
        return ds