Example #1
File: dataset.py  Project: icostan/kaggle
def load_train_data(categories, size=SIZE, localization=True, verbose=False):
    x = []
    y = []
    log('Status', 'Processing... ' + str(categories))
    for t in categories:
        folder = TRAIN_FOLDER + t
        files = os.listdir(folder)
        log(t, len(files), suffix='files')
        for filename in files:
            img = load_image(folder + '/' + filename,
                             size=size,
                             expand_dims=False)
            x.append(img)
            y.append(t)
    log('Status', 'DONE')

    X = normalize(np.array(x))
    log('X shape', X.shape)
    log('X size', bytesto(X.nbytes, 'm'), suffix='MB')

    Y = preprocessing.LabelEncoder().fit_transform(np.array(y))
    log('Y shape', Y.shape)
    log('Y size', bytesto(Y.nbytes, 'k'), suffix='KB')

    return X, Y
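Every example on this page reports sizes through a bytesto helper that is never shown. Below is a minimal sketch of such a converter, assuming it divides a raw byte count by 1024 once per unit step ('k', 'm', 'g', ...); each project defines its own version, so treat this as illustrative only.

def bytesto(byte_count, to, bsize=1024):
    # Hypothetical helper: convert a raw byte count to 'k', 'm', 'g' or 't' units.
    exponents = {'k': 1, 'm': 2, 'g': 3, 't': 4}
    return float(byte_count) / (bsize ** exponents[to])

print(bytesto(52428800, 'm'))  # -> 50.0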
Example #2
File: dataset.py  Project: icostan/kaggle
def load_regression_data(categories, size=SIZE, verbose=False):
    x = []
    y = []
    log('Status', 'Processing... ' + str(categories))
    for t in categories:
        folder = TRAIN_FOLDER + t
        files = os.listdir(folder)
        log(t, len(files), suffix='files')
        for filename in files:
            img = load_image(folder + '/' + filename,
                             size=size,
                             expand_dims=False)
            a = annotations.for_image(filename, t)
            if a is not None:
                x.append(img)
                y.append([a['x'], a['y']])
    log('Status', 'DONE')

    X = normalize(np.array(x))
    log('X shape', X.shape)
    log('X size', bytesto(X.nbytes, 'm'), suffix='MB')

    Y = np.array(y)
    log('Y shape', Y.shape)
    log('Y size', bytesto(Y.nbytes, 'k'), suffix='KB')

    return X, Y
Example #3
def get_db_size(human=False):
  if human:
    size = db_common.search(Common.data == "db_size")
    size = sorted(size, key=lambda k: k['db_size_timestamp'])
    return bytesto(size[-1].get("value") if size else 0, "m")
  else:
    size = db_common.search(Common.data == "db_size")
    return size[-1].get("value") if size else 0
Example #4
def set_db_size():
  if os.path.isfile(MAIN_DB):
    size = os.path.getsize(MAIN_DB)
    db_common.insert({
        "data": "db_size",
        "value": size,
        "value_mb": bytesto(size, "m"),
        "db_size_timestamp": int(time.time()),
        "timestamp": int(time.time()),
    })
    hours_ago_24 = int(time.time()) - DAY
    db_common.remove(Common.db_size_timestamp < hours_ago_24)
  else:
    log.info('no database found in: {}'.format(MAIN_DB))
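The db_common calls in Examples #3 and #4 match the TinyDB query API, so db_common is presumably a TinyDB table handle and Common a tinydb.Query. A minimal, self-contained sketch of that pattern under that assumption (the file name and values here are made up):

import time

from tinydb import TinyDB, Query

db_common = TinyDB('common.json')  # hypothetical database file
Common = Query()

# record a size sample, as set_db_size() does
db_common.insert({'data': 'db_size', 'value': 1048576,
                  'db_size_timestamp': int(time.time())})

# fetch the samples newest-last, as get_db_size(human=True) does
samples = sorted(db_common.search(Common.data == 'db_size'),
                 key=lambda row: row['db_size_timestamp'])
latest = samples[-1]['value'] if samples else 0

# drop samples older than 24 hours, as set_db_size() does
db_common.remove(Common.db_size_timestamp < int(time.time()) - 86400)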
Example #5
File: dataset.py  Project: icostan/kaggle
def load_test_data(folder):
    t = []
    files = os.listdir(folder)
    log('Status', 'Processing... ' + str(len(files)) + ' files')
    for filename in files:
        path = folder + '/' + filename
        img = load_image(path, expand_dims=False)
        t.append(img)
    log('Status', 'DONE')
    T = normalize(np.array(t))
    log('Shape', T.shape)
    log('Size', bytesto(T.nbytes, 'm'), suffix='MB')
    return T, files
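A hypothetical call site for the three loaders above; the category names and test folder are placeholders, not values from the original project:

# Placeholder categories/paths for illustration only.
X_train, Y_train = load_train_data(['cat_a', 'cat_b'], size=64)
X_loc, Y_loc = load_regression_data(['cat_a', 'cat_b'], size=64)
X_test, test_files = load_test_data('data/test')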
Example #6
def learn(subreddit=None):
    log.info("trying to learn")

    if os.path.isfile(MAIN_DB):
        size = os.path.getsize(MAIN_DB)
        log.info("db size: " + str(bytesto(size, "m")))
    else:
        size = 0

    if size > MAIN_DB_MAX_SIZE:  # stop learning once the DB hits its size limit
        log.info("DB size has reached limit: {}".format(
            bytesto(MAIN_DB_MAX_SIZE, "m")))
        return

    try:
        if subreddit:  # learning from a supplied subreddit
            if isinstance(subreddit, str):
                log.info("learning from: " + subreddit)
                sub = reddit.api.subreddit(subreddit)
            else:
                log.info(
                    'I have no idea what subreddit you gave me. not going to learn.'
                )
                return
        elif SUBREDDIT_LIST:  # learn from one of the filtered subreddits
            sub_name = random.choice(SUBREDDIT_LIST)
            log.info("SUBREDDIT_LIST is active")
            log.info("learning from: {}".format(sub_name))
            sub = reddit.api.subreddit(sub_name)
        else:  # no subreddit supplied, so we learn from a random one
            subok = False
            while not subok:
                sub = reddit.api.subreddit("random")
                # skip nsfw subs; bigger subs make it easier to blend in
                if not sub.over18 and sub.subscribers > 100000:
                    log.info("checking if subreddit is blacklisted")
                    with open('src/banned_subreddits.txt') as blacklisted_subreddits:
                        list_of_blacklisted_subreddits = blacklisted_subreddits.read().splitlines()
                    if str(sub) not in list_of_blacklisted_subreddits:
                        log.info("not blacklisted")
                        log.info("found: " + str(sub.display_name))
                        subok = True
                    else:
                        log.info("r/" + str(sub) + " is blacklisted")

        sub_db = "{}/{}.db".format(DB_DIR, str(sub.display_name))
        log.info("active db : {}".format(sub_db))
        sub_brain = Brain(sub_db)

        sub_hot = sub.hot()

        log.info("looping through submissions")

        # Loop through each submission
        for submission in sub_hot:
            log.info("checking submission")
            # Replace the "MoreReplies" with all of the submission replies
            submission.comments.replace_more(limit=0)

            # Get a list of all the comments flattened
            comments = submission.comments.list()

            log.debug("looping through comments")
            # Loop through each comment from the submission
            for comment in comments:
                # long text tends to turn into noise
                if len(comment.body) < 240:
                    log.debug("comment score: {}".format(comment.score))
                    # We only want to learn things people like to hear.
                    if comment.score > 20:
                        # We only want to learn comments as an observer.
                        if comment.author != submission.author:
                            comment_body = comment.body
                            for dis_word in DISALLOWED_WORDS:
                                if dis_word in comment_body:
                                    # str.replace returns a new string, so rebind it
                                    comment_body = comment_body.replace(dis_word, "")
                            if LOG_LEARNED_COMMENTS:
                                log.info(
                                    "learning comment. score: {}; comment: {}".format(
                                        comment.score, comment_body.encode("utf8")))
                            # Tell the bot to learn this comment.
                            base_brain.learn(comment_body.encode("utf8"))
                            sub_brain.learn(comment_body.encode("utf8"))
        log.info("done learning")
    except Exception as e:
        # If any errors occur just print it to the console
        log.info(e, exc_info=True)
Example #7
	def download_photos(self):
		
		total_photos_from_neighborhood = 0
		total_size_downloaded_from_neighborhood = 0

		page = 1

		while (True):
			
			print "\n-> PAGE: %s" % (str(page))

			total_photos_in_page = 0
			total_size_downloaded_in_page = 0

			url_params = {'idciudad': Constants.CITY_ID_TO_DOWNLOAD, 'idzona': Constants.NEIGHBORHOOD_ID_TO_DOWNLOAD, 'pagina': page}
			request_response = requests.get(Constants.IMOVELWEB_API_QUERY_URL, params=url_params, headers=Constants.DEFAULT_IMOVELWEB_API_HEADER)
			json_response = request_response.json()
			
			ads = json_response['data']['avisos']

			for ad in ads:

				number_of_photos_from_ad = 0
				total_size_downloaded_from_ad = 0

				for attribute, value in ad.iteritems():
					
					if attribute=='fotos':

						for photo in value:
							photo_url = photo.get('url').replace("580x465", Constants.PHOTO_SIZE_TO_DOWNLOAD); # getting smaller images

							local_dir_saved_photo = Constants.LOCAL_DIR_SAVE_PHOTO + "/" +  str(ad.get('idpropiedad'))

							local_path_to_save_photo = local_dir_saved_photo + "/" + photo_url.rpartition('/')[2]

							if not os.path.exists(local_dir_saved_photo): 
								os.mkdir(local_dir_saved_photo) #creating aviso directory to save photo

							urllib.urlretrieve(photo_url, local_path_to_save_photo) #downloading photo
							size = format(utils.bytesto(os.path.getsize(local_path_to_save_photo), 'm'),'.4f') #size in megabytes of each photo

							number_of_photos_from_ad += 1
							total_size_downloaded_from_ad = total_size_downloaded_from_ad + float(size)

				print "[#AD] Photos from %s: %s | Size: %s\n" % (ad.get('idpropiedad'), str(number_of_photos_from_ad), str(total_size_downloaded_from_ad))

				total_photos_in_page = total_photos_in_page + number_of_photos_from_ad
				total_size_downloaded_in_page = total_size_downloaded_in_page + total_size_downloaded_from_ad

			print "[#PAGE] Photos from page %s: %s | Size: %s\n" % (page, str(total_photos_in_page), str(total_size_downloaded_in_page))

			total_photos_from_neighborhood = total_photos_from_neighborhood + total_photos_in_page
			total_size_downloaded_from_neighborhood = total_size_downloaded_from_neighborhood + total_size_downloaded_in_page

			# checking if it's the end of the listings
			if len(ads) < Constants.NUMBER_OF_ADS_RETURNED_API:
				break

			page += 1

		print "[#TOTAL] Photos from all pages: %s | Size: %s\n" % (str(total_photos_from_neighborhood), str(total_size_downloaded_from_neighborhood))
Example #8
from logger import log
from requests import get

try:
    ip = get('https://api.ipify.org').text
    print 'My public IP address is:', ip
except Exception as e:
    print "could not check external ip"

limit = 52428800
log.info('------------new bot run--------------')
log.info("user is " + str(reddit.api.user.me()))

if __name__ == '__main__':

    log.info('db size to start replying: ' + str(bytesto(limit, 'm')))
    while True:

        if os.path.isfile(MAIN_DB):
            size = os.path.getsize(MAIN_DB)
            log.info('db size: ' + str(bytesto(size, 'm')))
        else:
            size = 0

        if size < limit:  # learn faster early on
            log.info('fast learning')
            learn()
            try:
                log.info('new db size: ' +
                         str(bytesto(os.path.getsize(MAIN_DB), 'm')))
            except:
Example #9
def learn(subreddit=None):
    log.info("trying to learn")

    if os.path.isfile(MAIN_DB):
        size = os.path.getsize(MAIN_DB)
        log.info("db size: " + str(bytesto(size, "m")))
    else:
        size = 0

    if size > MAIN_DB_MAX_SIZE:  # stop learning once the DB hits its size limit
        log.info("DB size has reached limit: {}".format(
            bytesto(MAIN_DB_MAX_SIZE, "m")))
        return

    try:
        if subreddit:
            log.info("learning from: " + subreddit)
            sub = reddit.api.subreddit(subreddit)
        else:  # no subreddit supplied, so we learn from a random one
            subok = False
            while not subok:
                sub = reddit.api.subreddit("random")
                # skip nsfw subs; bigger subs make it easier to blend in
                if not sub.over18 and sub.subscribers > 100000:
                    log.info("found: " + str(sub.display_name))
                    subok = True

        sub_db = "{}/{}.db".format(DB_DIR, str(sub.display_name))
        log.info("active db : {}".format(sub_db))
        sub_brain = Brain(sub_db)

        sub_hot = sub.hot()

        log.info("looping through submissions")

        # Loop through each submission
        for submission in sub_hot:
            log.info("checking submission")
            # Replace the "MoreReplies" with all of the submission replies
            submission.comments.replace_more(limit=0)

            # Get a list of all the comments flattened
            comments = submission.comments.list()

            log.debug("looping through comments")
            # Loop through each comment from the submission
            for comment in comments:
                # long text tends to turn into noise
                if len(comment.body) < 240:
                    log.debug("comment score: {}".format(comment.score))
                    # We only want to learn things people like to hear.
                    if comment.score > 20:
                        # We only want to learn comments as an observer.
                        if comment.author != submission.author:
                            log.info(
                                "learning comment. score: {}; comment: {}".format(
                                    comment.score, comment.body.encode("utf8")))
                            # Tell the bot to learn this comment.
                            base_brain.learn(comment.body.encode("utf8"))
                            sub_brain.learn(comment.body.encode("utf8"))
        log.info("done learning")
    except Exception as e:
        # If any errors occur just print it to the console
        log.info(e, exc_info=True)
Example #10
def init():
    log.info("db size to start replying: " +
             str(bytesto(MAIN_DB_MIN_SIZE, "m")))
    reddit.shadow_check()
    # check if this is the first time running the bot
    set_user_info()
    check_first_run()
    set_db_size()
    while True:
        # learn faster early on
        if get_db_size() < MAIN_DB_MIN_SIZE and not COMMENTS_DISABLED:
            log.info("""
          THE BOT IS WORKING. IT WILL TAKE ABOUT 8 HOURS FOR IT TO LEARN AND START COMMENTING.
          """)
            log.info("fast learning")
            learn()
            try:
                log.info("new db size: " + str(bytesto(get_db_size(), "m")))
            except:
                pass
            set_db_size()
            countdown(2)

        # once we learn enough, start submissions and replies
        if get_db_size() > MAIN_DB_MIN_SIZE or COMMENTS_DISABLED:
            log.info("database size is big enough")

            if USE_SLEEP_SCHEDULE:
                while should_we_sleep():
                    log.info("zzzzzzzz :snore:")
                    time.sleep(60)

            for i, action in enumerate(reddit_bot):
                if action.rate_limit_unlock_epoch != 0:
                    if action.rate_limit_unlock_epoch > get_current_epoch():
                        log.info(
                            "{} hit RateLimit recently we need to wait {} seconds with this"
                            .format(
                                action.name,
                                action.rate_limit_unlock_epoch -
                                get_current_epoch(),
                            ))
                        continue
                    else:
                        # _replace returns a new namedtuple, so store it back
                        reddit_bot[i] = action._replace(rate_limit_unlock_epoch=0)
                else:
                    if prob(action.probability):
                        log.info("making a random {}".format(action.name))
                        try:
                            action.action()
                        except praw.exceptions.APIException as e:
                            secs_to_wait = get_seconds_to_wait(str(e))
                            reddit_bot[i] = action._replace(
                                rate_limit_unlock_epoch=(get_current_epoch() +
                                                         secs_to_wait))
                            log.info(
                                "{} hit RateLimit, need to sleep for {} seconds"
                                .format(action.name, secs_to_wait))
                        except Exception as e:
                            log.error("something weird happened, {}".format(e),
                                      exc_info=True)
            if prob(PROBABILITIES["LEARN"]):  # chance we'll learn more
                log.info("going to learn")
                learn()

            # Wait 10 minutes to comment and post because of reddit rate limits
            countdown(1)
        log.info("end main loop")
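One note on the action._replace(...) calls above: reddit_bot_action appears to be a collections.namedtuple (the field names below are inferred from how it is used), and _replace returns a new tuple rather than mutating in place, which is why the result is stored back into reddit_bot. A small illustration:

from collections import namedtuple

# Field names inferred from usage; illustrative only.
reddit_bot_action = namedtuple(
    'reddit_bot_action',
    ['name', 'action', 'probability', 'rate_limit_unlock_epoch'])

a = reddit_bot_action('reply', None, 0.2, 0)
a._replace(rate_limit_unlock_epoch=99)      # result discarded, a is unchanged
print(a.rate_limit_unlock_epoch)            # -> 0
a = a._replace(rate_limit_unlock_epoch=99)  # rebind to keep the change
print(a.rate_limit_unlock_epoch)            # -> 99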
Example #11
    log.error("could not check external ip")

RATE_LIMIT = 0
NEED_TO_WAIT = 0
log.info("------------new bot run--------------")
log.info("user is " + str(reddit.api.user.me()))

reddit_bot = [
    reddit_bot_action("reply", reddit.random_reply, PROBABILITIES["REPLY"], 0),
    reddit_bot_action("delete", reddit.delete_comments,
                      PROBABILITIES["DELETE"], 0),
]

if __name__ == "__main__":
    log.info("db size to start replying: " +
             str(bytesto(MAIN_DB_MIN_SIZE, "m")))
    while True:

        if os.path.isfile(MAIN_DB):
            size = os.path.getsize(MAIN_DB)
            log.info("db size: " + str(bytesto(size, "m")))
        else:
            size = 0

        if size < MAIN_DB_MIN_SIZE:  # learn faster early on
            log.info("fast learning")
            learn()
            try:
                log.info("new db size: " +
                         str(bytesto(os.path.getsize(MAIN_DB), "m")))
            except: