Example #1
    def __init__(self, data_dir, train_data_url, test_data_url, columns,
                 label_column, categorical_columns, continuous_columns):
        """Constructor of CensusDataSource.

    Args:
      data_dir: Directory to save/load the data files
      train_data_url: URL from which the training data can be downloaded
      test_data_url: URL from which the test data can be downloaded
      columns: Columns to retrieve from the data files (A list of strings)
      label_column: Name of the label column
      categorical_columns: Names of the categorical columns (A list of strings)
      continuous_columns: Names of the continuous columns (A list of strings)
    """

        # Retrieve data from disk (if available) or download from the web.
        train_file_path = os.path.join(data_dir, "adult.data")
        if os.path.isfile(train_file_path):
            print("Loading training data from file: %s" % train_file_path)
            train_file = open(train_file_path)
        else:
            urllib.urlretrieve(train_data_url, train_file_path)
            train_file = open(train_file_path)

        test_file_path = os.path.join(data_dir, "adult.test")
        if os.path.isfile(test_file_path):
            print("Loading test data from file: %s" % test_file_path)
            test_file = open(test_file_path)
        else:
            urllib.urlretrieve(test_data_url, test_file_path)
            test_file = open(test_file_path)

        # Read the training and testing data sets into Pandas DataFrame.
        import pandas  # pylint: disable=g-import-not-at-top
        self._df_train = pandas.read_csv(train_file,
                                         names=columns,
                                         skipinitialspace=True)
        self._df_test = pandas.read_csv(test_file,
                                        names=columns,
                                        skipinitialspace=True,
                                        skiprows=1)

        # Remove the NaN values in the last rows of the tables
        self._df_train = self._df_train[:-1]
        self._df_test = self._df_test[:-1]

        # Apply the threshold to get the labels.
        income_thresh = lambda x: ">50K" in x
        self._df_train[label_column] = (
            self._df_train["income_bracket"].apply(income_thresh)).astype(int)
        self._df_test[label_column] = (
            self._df_test["income_bracket"].apply(income_thresh)).astype(int)

        self.label_column = label_column
        self.categorical_columns = categorical_columns
        self.continuous_columns = continuous_columns
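A minimal sketch of how this constructor might be invoked; the directory, URLs, and column names below are illustrative assumptions, not values taken from the example itself.

# Hypothetical usage sketch (paths, URLs, and columns are placeholders).
COLUMNS = ["age", "workclass", "education", "income_bracket"]  # assumed subset
source = CensusDataSource(
    data_dir="/tmp/census_data",                       # assumed cache directory
    train_data_url="https://example.com/adult.data",   # placeholder URL
    test_data_url="https://example.com/adult.test",    # placeholder URL
    columns=COLUMNS,
    label_column="label",
    categorical_columns=["workclass", "education"],
    continuous_columns=["age"])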
Example #2
  def __init__(self, data_dir, train_data_url, test_data_url,
               columns, label_column,
               categorical_columns, continuous_columns):
    """Constructor of CensusDataSource.

    Args:
      data_dir: Directory to save/load the data files
      train_data_url: URL from which the training data can be downloaded
      test_data_url: URL from which the test data can be downloaded
      columns: Columns to retrieve from the data files (A list of strings)
      label_column: Name of the label column
      categorical_columns: Names of the categorical columns (A list of strings)
      continuous_columns: Names of the continuous columns (A list of strings)
    """

    # Retrieve data from disk (if available) or download from the web.
    train_file_path = os.path.join(data_dir, "adult.data")
    if os.path.isfile(train_file_path):
      print("Loading training data from file: %s" % train_file_path)
      train_file = open(train_file_path)
    else:
      urllib.urlretrieve(train_data_url, train_file_path)
      train_file = open(train_file_path)

    test_file_path = os.path.join(data_dir, "adult.test")
    if os.path.isfile(test_file_path):
      print("Loading test data from file: %s" % test_file_path)
      test_file = open(test_file_path)
    else:
      urllib.urlretrieve(test_data_url, test_file_path)
      test_file = open(test_file_path)

    # Read the training and testing data sets into Pandas DataFrame.
    import pandas  # pylint: disable=g-import-not-at-top
    self._df_train = pandas.read_csv(train_file, names=columns,
                                     skipinitialspace=True)
    self._df_test = pandas.read_csv(test_file, names=columns,
                                    skipinitialspace=True, skiprows=1)

    # Remove the NaN values in the last rows of the tables
    self._df_train = self._df_train[:-1]
    self._df_test = self._df_test[:-1]

    # Apply the threshold to get the labels.
    income_thresh = lambda x: ">50K" in x
    self._df_train[label_column] = (
        self._df_train["income_bracket"].apply(income_thresh)).astype(int)
    self._df_test[label_column] = (
        self._df_test["income_bracket"].apply(income_thresh)).astype(int)

    self.label_column = label_column
    self.categorical_columns = categorical_columns
    self.continuous_columns = continuous_columns
Example #3
def maybe_download(directory, filename, url):
  """Download filename from url unless it's already in directory.

  Args:
    directory: path to the directory that will be used.
    filename: name of the file to download to (do nothing if it already exists).
    url: URL to download from.

  Returns:
    The path to the downloaded file.
  """
  if not tf.gfile.Exists(directory):
    tf.logging.info("Creating directory %s" % directory)
    os.mkdir(directory)
  filepath = os.path.join(directory, filename)
  if not tf.gfile.Exists(filepath):
    tf.logging.info("Downloading %s to %s" % (url, filepath))
    inprogress_filepath = filepath + ".incomplete"
    inprogress_filepath, _ = urllib.urlretrieve(
        url, inprogress_filepath, reporthook=download_report_hook)
    # Print newline to clear the carriage return from the download progress
    print()
    tf.gfile.Rename(inprogress_filepath, filepath)
    statinfo = os.stat(filepath)
    tf.logging.info("Successfully downloaded %s, %s bytes." %
                    (filename, statinfo.st_size))
  else:
    tf.logging.info("Not downloading, file already found: %s" % filepath)
  return filepath
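A usage sketch, assuming TensorFlow 1.x (tf.gfile / tf.logging) is importable and that download_report_hook is defined elsewhere in the module; the directory, filename, and URL are placeholders.

# Hypothetical call with placeholder arguments.
path = maybe_download("/tmp/data", "corpus.tar.gz",
                      "https://example.com/corpus.tar.gz")
print("Data available at", path)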
Example #4
def download_evtracks(fehs=[-2.5,-2.0,-1.5,-1.0,-0.5,0.0, 0.15, 0.3, 0.5],
                      afe=0., phot_system='sdss'):
    import urllib

    urlbase = 'http://stellar.dartmouth.edu/models/tracks/{}/'.format(phot_system)

    for feh in fehs:
        print('Fetching evolution tracks for feh={}...'.format(feh))
        feh_sign = 'p' if feh >= 0 else 'm'
        afe_sign = 'p' if afe >= 0 else 'm'
        filename = 'feh{}{:02.0f}afe{}{:01.0f}_{}.tgz'.format(feh_sign,abs(feh*10),
                                                            afe_sign,abs(afe*10),
                                                            phot_system)
        url = urlbase+filename

        folder = os.path.join(DATADIR, 'dartmouth')
        if not os.path.exists(folder):
            os.makedirs(folder)
        localfile = os.path.join(folder,filename)
        if not os.path.exists(localfile):
            urllib.urlretrieve(url,localfile)
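A possible invocation, assuming DATADIR is defined at module level and the Dartmouth server still serves .tgz archives under this URL scheme.

# Fetch only the solar-metallicity track in the assumed 'sdss' system.
download_evtracks(fehs=[0.0], afe=0., phot_system='sdss')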
Example #5
File: run.py  Project: HugoPu/word2vec
def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename +
                        '. Can you get to it with a browser?')
    return filename
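A usage sketch; this function expects a module-level `url` ending in a slash so that url + filename forms the full address. The base URL and byte count below follow the word2vec tutorial this snippet resembles, but treat them as assumptions.

# Hypothetical module-level setup and call.
url = 'http://mattmahoney.net/dc/'                 # assumed base URL
filename = maybe_download('text8.zip', 31344016)   # expected size is an assumption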
Example #6
def download_evtracks(fehs=[-2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.15, 0.3, 0.5],
                      afe=0.,
                      phot_system='sdss'):
    import urllib

    urlbase = 'http://stellar.dartmouth.edu/models/tracks/{}/'.format(
        phot_system)

    for feh in fehs:
        print('Fetching evolution tracks for feh={}...'.format(feh))
        feh_sign = 'p' if feh >= 0 else 'm'
        afe_sign = 'p' if afe >= 0 else 'm'
        filename = 'feh{}{:02.0f}afe{}{:01.0f}_{}.tgz'.format(
            feh_sign, abs(feh * 10), afe_sign, abs(afe * 10), phot_system)
        url = urlbase + filename

        folder = os.path.join(DATADIR, 'dartmouth')
        if not os.path.exists(folder):
            os.makedirs(folder)
        localfile = os.path.join(folder, filename)
        if not os.path.exists(localfile):
            urllib.urlretrieve(url, localfile)
Example #7
def download_and_extract():
    dest_directory = config.raw_path
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = config.url.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            sys.stdout.write('\rDownloading %s %.2f%%' % (filename,
                                                          float(count * block_size) / float(total_size) * 100.0))
            sys.stdout.flush()

        filepath, _ = urllib.urlretrieve(config.url, filepath, reporthook=_progress)
        print('Downloaded', filename)
        tarfile.open(filepath, 'r:gz').extractall(dest_directory)
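The reporthook callback passed to urllib.urlretrieve is invoked as hook(block_count, block_size, total_size). A standalone sketch of the same pattern, with a placeholder URL and destination:

import sys
import urllib  # Python 2 spelling; urllib.request in Python 3

def _progress(count, block_size, total_size):
    # The reported percentage can slightly exceed 100 on the final block.
    pct = float(count * block_size) / float(total_size) * 100.0
    sys.stdout.write('\rDownloading %.2f%%' % pct)
    sys.stdout.flush()

# Placeholder URL and destination path.
urllib.urlretrieve('https://example.com/data.tar.gz',
                   '/tmp/data.tar.gz', reporthook=_progress)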
Example #8
def maybe_download_and_extract():
    """Download and extract model tar file."""
    dest_directory = FLAGS.model_dir
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):

        def _progress(count, block_size, total_size):
            sys.stdout.write('\r>> Downloading %s %.1f%%' %
                             (filename, float(count * block_size) /
                              float(total_size) * 100.0))
            sys.stdout.flush()

        filepath, _ = urllib.urlretrieve(DATA_URL, filepath, _progress)
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
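Note that urllib.urlretrieve is the Python 2 spelling; under Python 3 the same function lives in urllib.request. A minimal compatibility sketch with placeholder arguments:

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2

# Placeholder URL and destination path.
filepath, _ = urlretrieve("https://example.com/model.tgz", "/tmp/model.tgz")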
Example #9
def crop_by_loc(input, img_id, r1, r2, r3, r4):
    im = Image.open(input)
    imgwidth, imgheight = im.size
    box = (r1, r2, r3, r4)
    a = im.crop(box)
    print(a)
    try:
        a.save("IMG-%d.jpg" % img_id)
    except Exception as e:
        print("Error saving cropped image:", e)


data = json.load(open('../../carparks.db'))
anu_car_park = data["carparks"][0]

urllib.urlretrieve(
    "http://maps.googleapis.com/maps/api/streetview?size=600x300&location=" +
    anu_car_park["location"] + "&pitch=" + anu_car_park["pitch"] + "&key=" +
    data["key"], "1.1.jpg")

print(
    "http://maps.googleapis.com/maps/api/streetview?size=600x    300&location="
    + anu_car_park["location"] + "&pitch=" + anu_car_park["pitch"] + "&key=" +
    data["key"], "1.1.jpg")

crop_by_loc("1.1.jpg", 1, 70, 150, 170, 250)
crop_by_loc("1.1.jpg", 2, 170, 150, 270, 250)
crop_by_loc("1.1.jpg", 3, 270, 150, 370, 250)
crop_by_loc("1.1.jpg", 4, 370, 202, 470, 302)
Example #10
images = []
for img in page.findAll('img'):
    if 'li=' not in img['src']:
        images.append(img['src'])

i = 0
for img in images:
    path = io.BytesIO(urllib.urlopen(img).read())
    path.seek(0, 2)  # 0 bytes from the end
    size = path.tell()

    if size < 7000:
        continue

    path = os.path.join('./images', '%s.jpg' % i)
    response = urllib.urlretrieve(img, path)
    i += 1

r = redis.StrictRedis(host='localhost', port=6379, db=0)

FLAGS = None

# pylint: disable=line-too-long
DATA_URL = "http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz"
# pylint: enable=line-too-long


class NodeLookup(object):
    """Converts integer node ID's to human readable labels."""
    def __init__(self, label_lookup_path=None, uid_lookup_path=None):
        if not label_lookup_path:
def get_collection_tags2(image_collection_links,
                         threshold=0.8,
                         current_dict=None):
    """
    Returns a counter dict representing the features for the given list of image urls. The features will be used to
    classify the collection to an event. The dict includes the following fields (the order doesn't matter):

    Parameters
    ----------
    image_collection_links : list[str]
        List of image URLs to download and classify.
    threshold : float
        Only keep tags whose prediction score exceeds this value.

    Returns
    -------
    Dict[dict[str:count] or str:str]
        3 features for each collection.

    """
    import urllib

    filename = "/home/jessica/Documents/temp.jpg"
    if current_dict is None:
        ans = defaultdict(float)
    else:
        ans = current_dict
    err_count = 0

    # Creates graph from saved GraphDef.
    create_graph2()

    for i, img_link in enumerate(image_collection_links):
        if (img_link.endswith("png")):  #skip items that are png images
            print(
                "###########################################################")
            print("Error: picture is a png (need jpg)")
            print(
                "###########################################################")
            continue
        #get url
        try:
            print("{}: retreiving url({})".format(i, img_link))
            urllib.urlretrieve(img_link, filename)
            print("retrieval complete!")
        except:
            print(
                "###########################################################")
            print("Error: URL is not retreivalbe")
            print(
                "###########################################################")
            continue

        print("now processing...")

        try:
            image_data = tf.gfile.FastGFile(filename, 'rb').read()

            with tf.Session() as sess:
                # Some useful tensors:
                # 'softmax:0': A tensor containing the normalized prediction across
                #   1000 labels.
                # 'pool_3:0': A tensor containing the next-to-last layer containing 2048
                #   float description of the image.
                # 'DecodeJpeg/contents:0': A tensor containing a string providing JPEG
                #   encoding of the image.
                # Runs the softmax tensor by feeding the image_data as input to the graph.
                softmax_tensor = sess.graph.get_tensor_by_name(
                    'final_result:0')
                predictions = sess.run(softmax_tensor,
                                       {'DecodeJpeg/contents:0': image_data})
                predictions = np.squeeze(predictions)

                temp_tags = []
                top_k = predictions.argsort()[-num_top_predictions:][::-1]
                for node_id in top_k:
                    score = predictions[node_id]
                    temp_tags.append((node_id, score))

        except:
            print("ERROR ###################################")
            err_count += 1
            print("Could not give tags to image.")
            print("ERROR ###################################")
            continue

        #keep only the tags that overcome the certain threshold
        print("tags = [{}]".format(temp_tags))

        for tag_score in temp_tags:
            if (tag_score[0] != 2 and tag_score[1] > threshold):
                ans[special_nodes[tag_score[0]]] += 1
                print("\tadded " + special_nodes[tag_score[0]])
    print("Final Error Count:", err_count)
    return ans
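A usage sketch; the image URLs are placeholders, and it assumes create_graph2, special_nodes, and num_top_predictions are defined at module level as the function requires (it also needs TensorFlow and the retrained graph file on disk).

# Placeholder URLs; real calls would pass links to .jpg images.
links = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
tags = get_collection_tags2(links, threshold=0.8)
print(dict(tags))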
Example #13
File: utils.py  Project: wanliu2019/SCREEN
    def download(url, fnp, auth=None, force=False,
                 file_size_bytes=0, skipSizeCheck=None,
                 quiet=False, umask=FileUmask):
        Utils.ensureDir(fnp)
        fn = os.path.basename(fnp)
        if not skipSizeCheck:
            if 0 == file_size_bytes:
                fsb = Utils.getHttpFileSizeBytes(url, auth)
                if fsb:
                    file_size_bytes = fsb
            Utils.deleteFileIfSizeNotMatch(fnp, file_size_bytes)

        if os.path.exists(fnp):
            if force:
                os.remove(fnp)
            else:
                return True

        Utils.quietPrint(quiet, "downloading", url, "...")

        if url.startswith("ftp://"):
            fnpTmp = urllib.urlretrieve(url)[0]
            shutil.move(fnpTmp, fnp)
            # chmod g+w
            st = os.stat(fnp)
            os.chmod(fnp, st.st_mode | umask)
            return True

        if not auth:
            r = requests.get(url) # TODO: streaming
            # see https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
        if auth or 403 == r.status_code:
            keyFnp = os.path.expanduser('~/.encode.txt')
            if os.path.exists(keyFnp):
                with open(keyFnp) as f:
                    toks = f.read().strip().split('\n')
                r = requests.get(url, auth=HTTPBasicAuth(toks[0], toks[1])) # TODO streaming
            else:
                raise Exception("no ENCODE password file found at: " +
                                keyFnp)
        if 200 != r.status_code:
            Utils.quietPrint(quiet, "could not download", url)
            Utils.quietPrint(quiet, "status_code:", r.status_code)
            return False

        # with open(fnpTmp, "wb") as f:
        try:
            fnpTmp = None
            with tempfile.NamedTemporaryFile("wb", delete=False) as f:
                f.write(r.content)
                fnpTmp = f.name
            shutil.move(fnpTmp, fnp)
        except:
            raise
        finally:
            if fnpTmp and os.path.exists(fnpTmp):
                os.remove(fnpTmp)
        # chmod g+w
        st = os.stat(fnp)
        os.chmod(fnp, st.st_mode | umask)
        return True
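A usage sketch, assuming this is a @staticmethod on a Utils class and that FileUmask is defined in the module; the URL and destination path are placeholders.

# Hypothetical call with placeholder URL and destination.
ok = Utils.download("https://example.com/annotations.bed.gz",
                    "/data/annotations.bed.gz", force=False)
if not ok:
    raise RuntimeError("download failed")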
Example #14
 def getlog(self):
     global ocram
     try:
         sourcefile = 'https://www.openvix.co.uk/feeds/%s/%s/%s-git.log' % (
             getImageDistro(), getImageVersion(), self.logtype)
         sourcefile, headers = urllib.urlretrieve(sourcefile)
         rename(sourcefile, '/tmp/' + self.logtype + '-git.log')
         fd = open('/tmp/' + self.logtype + '-git.log', 'r')
         releasenotes = fd.read()
         fd.close()
     except:
         releasenotes = '404 Not Found'
     if '404 Not Found' not in releasenotes:
         releasenotes = releasenotes.replace('[openvix] Zeus Release.',
                                             'openvix: build 000')
         releasenotes = releasenotes.replace('\nopenvix: build',
                                             "\n\nopenvix: build")
         releasenotes = releasenotes.split('\n\n')
         ver = -1
         releasever = ""
         viewrelease = ""
         while not releasever.isdigit():
             ver += 1
             releasever = releasenotes[int(ver)].split('\n')
             releasever = releasever[0].split(' ')
             if len(releasever) > 2:
                 releasever = releasever[2].replace(':', "")
             else:
                 releasever = releasever[0].replace(':', "")
         if self.logtype == 'oe':
             if int(getImageBuild()) == 1:
                 imagever = int(getImageBuild()) - 1
             else:
                 imagever = int(getImageBuild())
         else:
             imagever = int(getImageBuild()) + 905
         while int(releasever) > int(imagever):
             if ocram:
                 viewrelease += releasenotes[int(ver)] + '\n' + ocram + '\n'
                 ocram = ""
             else:
                 viewrelease += releasenotes[int(ver)] + '\n\n'
             ver += 1
             releasever = releasenotes[int(ver)].split('\n')
             releasever = releasever[0].split(' ')
             releasever = releasever[2].replace(':', "")
         if not viewrelease and ocram:
             viewrelease = ocram
             ocram = ""
         self["text"].setText(viewrelease)
         summarytext = viewrelease.split(':\n')
         try:
             self['title_summary'].setText(summarytext[0] + ':')
             self['text_summary'].setText(summarytext[1])
         except:
             self['title_summary'].setText("")
             self['text_summary'].setText(viewrelease)
     else:
         self['title_summary'].setText("")
         self['text_summary'].setText(_("Error downloading change log."))
         self['text'].setText(_("Error downloading change log."))
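The call above uses the single-argument form urllib.urlretrieve(url); in that case the data is written to a temporary file and a (filename, headers) tuple is returned, which is why the result is renamed afterwards. A minimal Python 2 sketch of the same pattern with a placeholder URL:

import urllib
from os import rename

# Download to a temporary file, then move it to a known location.
tmp_path, headers = urllib.urlretrieve('https://example.com/changelog-git.log')
rename(tmp_path, '/tmp/changelog-git.log')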