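# Imports assumed by this example; they are not shown in the original listing.
# The image helpers (imread, imsave, resize) are taken here from scikit-image
# and BinarizeArray from a hypothetical local module; both choices are
# assumptions rather than something the source confirms.
import os
import re
import shutil
import subprocess
from collections import Counter

import boto
import numpy as np
import pandas as pd
import psycopg2
import requests
from skimage.io import imread, imsave
from skimage.transform import resize
from sklearn.metrics import pairwise_distances

from binarize_array import BinarizeArray  # hypothetical local helper module
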
class DoggyMatchEngine(object):
    def __init__(self, GPU=True):
        self.conn = psycopg2.connect(dbname="dogs2", user="******")
        self.psql = self.conn.cursor()
        self.features = []
        self.GPU = GPU
        self.binar = BinarizeArray()
        self.binar.run()

    def fit(self, user_input, session_id):
        self.session_id = session_id
        if "http" in user_input or "https" in user_input:
            r = requests.get(user_input, stream=True)
            if r.status_code == 200:
                rand_name = "app/static/uploads/img_" + session_id + ".jpg"
                with open(rand_name, "wb") as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                    img_file = rand_name
        else:
            img_file = user_input

        img = imread(img_file)
        i_size = img.shape
        # require at least 221x221 pixels and an RGB or RGBA channel layout
        if len(i_size) < 3 or i_size[0] < 221 or i_size[1] < 221 or i_size[2] not in (3, 4):
            raise Exception("Image is too small or has an unsupported channel layout.")
        imgr = resize(img, [221, 221])
        self.orig_file = img_file
        img_file = img_file.replace("app/static/uploads/", "app/static/uploads/r_")
        imsave(img_file, imgr)
        # run the OverFeat binary to extract the image's feature vector
        if self.GPU:
            p = subprocess.check_output(
                ["/OverFeat/bin/linux_64/cuda/overfeat_cuda", "-f", "-l", img_file]
            )
        else:
            p = subprocess.check_output(["/OverFeat/bin/linux_64/overfeat", "-f", "-l", img_file])
        # parse OverFeat's stdout: drop the header tokens, keep the feature values
        features = p.replace("\n", "").replace("1-", "").strip(" ").split(" ")[2:]
        features = [float(x) for x in features]
        if len(features) != 4096:
            raise Exception("Feature vector has unexpected length: %d" % len(features))
        self.user_features = features

    def sql_import(self, user_zip, user_radius):
        # look up the longitude/latitude for the user's zip code
        self.psql.execute(
            "select lng, lat from zipcodes where zip = %s limit 1;", (user_zip,)
        )
        lnglat = self.psql.fetchall()
        user_long = lnglat[0][0]
        user_lat = lnglat[0][1]
        # keep only records within user_radius miles of that point
        # (ST_Distance_Sphere returns meters; 0.000621371 converts meters to miles)
        self.psql.execute(
            """select id, features_bin from records
               where ST_Distance_Sphere(geom, ST_MakePoint(%s, %s)) * 0.000621371 < %s;""",
            (user_long, user_lat, user_radius),
        )
        filtered = self.psql.fetchall()
        self.geo_ids_lst = [x[0] for x in filtered]
        self.geo_feature_lst = [x[1] for x in filtered]

    def match(self, num_matches=10):
        # cosine distance between the user's features and every geo-filtered record
        sim_scores = pairwise_distances(
            np.array(self.user_features).reshape(1, -1),
            np.array(self.geo_feature_lst),
            metric="cosine",
            n_jobs=-1,
        )
        sim_scores = sim_scores.flatten()
        ids = np.array(self.geo_ids_lst)
        match_inds = sim_scores.argsort()
        min_arr = match_inds[:num_matches]
        self.top10_scores = sim_scores[min_arr]
        self.top10_ids = ids[min_arr]
        # record ids look like "<petid>_<imgnum>"; count how often each pet appears
        ids_subset = [re.sub("_.*", "", x) for x in self.top10_ids]
        c_ids = Counter(ids_subset)
        dupes = c_ids.most_common(1)[0][1] > 1

        # if one pet contributes more than one image to the top matches, replace the
        # duplicate entries with the next-best candidates until all pets are distinct
        data_in = num_matches
        while dupes:
            arr_ids = np.array(ids_subset)
            repeats = np.where(arr_ids == c_ids.most_common(1)[0][0])
            dup_ind = repeats[0][1]  # index of the second occurrence of the duplicated pet
            new_top_ids = np.delete(self.top10_ids, dup_ind)
            new_top_scores = np.delete(self.top10_scores, dup_ind)
            repl_match_ind = match_inds[data_in]
            self.top10_scores = np.append(new_top_scores, sim_scores[repl_match_ind])
            self.top10_ids = np.append(new_top_ids, ids[repl_match_ind])
            ids_subset = [re.sub("_.*", "", x) for x in self.top10_ids]
            c_ids = Counter(ids_subset)
            if c_ids.most_common(1)[0][1] > 1:
                data_in += 1
            else:
                dupes = False

    def results(self):
        conn = boto.connect_s3(os.environ["access_key"], os.environ["access_secret_key"])
        bucket_name = os.environ["bucket_name"]
        bucket = conn.get_bucket(bucket_name)

        img_data = []
        for x, val in enumerate(self.top10_ids):
            self.psql.execute(
                """select name, gender, age, city, state, zip
                   from records where id = %s;""",
                (val,),
            )
            img_details = self.psql.fetchall()[0]
            key = val.encode("ascii", "ignore") + ".jpg"
            img = bucket.get_key(key)
            outfile = img.generate_url(6000)  # pre-signed URL valid for 6000 seconds
            img_data.append(
                [
                    val,
                    outfile,
                    self.top10_scores[x],
                    img_details[0],
                    img_details[1],
                    img_details[2],
                    img_details[3],
                    img_details[4],
                    img_details[5],
                ]
            )
        self.df = pd.DataFrame(
            np.array(img_data), columns=["id", "img_loc", "score", "name", "gender", "age", "city", "state", "zip"]
        )
        self.df.to_csv("tmp/" + self.session_id + ".csv")
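
# A minimal usage sketch (not part of the original listing): the call order
# follows the methods above, but the URL, zip code, radius and session id are
# made-up example values.
if __name__ == "__main__":
    engine = DoggyMatchEngine(GPU=False)
    engine.fit("https://example.com/lost_dog.jpg", session_id="demo123")
    engine.sql_import(user_zip=94105, user_radius=25)  # radius in miles
    engine.match(num_matches=10)
    engine.results()  # writes tmp/demo123.csv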