def __update_given_submissions(self, submissions):
    """Store each fetched submission, or refresh the score of a stored one.

    For every item the base36 ``id`` is decoded with ``base36decode``. Ids
    that ``self.db_manager`` does not know are inserted; known ids are
    re-read from the ``submissions`` table and written back only when the
    score differs. After each item a progress line is printed whenever
    ``new_rows_written`` is a non-zero multiple of 100.

    :param submissions: iterable of objects exposing a base36 ``id`` that
        ``Submission.from_reddit_api`` accepts.
    :raises AssertionError: if a known id does not match exactly one row.
    """
    db = self.db_manager
    for api_submission in submissions:
        row_id = base36decode(api_submission.id)
        is_stored = db.row_exists(row_id)
        fresh = Submission.from_reddit_api(api_submission)

        if is_stored:
            # Only the score is refreshed; every other stored column is kept.
            rows = db.query('SELECT * FROM submissions WHERE id = %d;' % row_id)
            stored = [Submission(row) for row in rows]
            assert len(stored) == 1
            current = stored[0]
            if current.score != fresh.score:
                current.score = fresh.score
                db.replace_submission(current)
        else:
            db.insert_submission(fresh)

        # Debug printing (single-argument print keeps the emitted text stable).
        written = db.new_rows_written
        if written != 0 and written % 100 == 0:
            print("%s %s %s %s %s" % (
                "Entries written so far: ", written,
                " [", db.rows_written, " total]"))
def main():
    """Dump every fully classified submission that has media to disk.

    Selects the rows whose gender, height, previous weight, current weight
    and media JSON are all present, serialises them with
    ``Submission.submission_list_to_json`` and writes the result to
    ``json_dump.json`` in the current working directory.
    """
    query = ('SELECT * FROM submissions WHERE '
             'gender IS NOT NULL and '
             'height_in IS NOT NULL and previous_weight_lbs IS NOT NULL and '
             'current_weight_lbs IS NOT NULL and media_json IS NOT NULL;')
    m = DatabaseManager(DATABASE_PATH)
    submissions = [Submission(x) for x in m.query(query)]
    json_dump_str = Submission.submission_list_to_json(submissions)

    # The context manager guarantees the dump is flushed and the handle is
    # released even if the write fails part-way.
    with open('json_dump.json', 'w') as f:
        f.write(json_dump_str)

    print("%s %s %s" % ("Wrote ", len(submissions), "to disk."))
def __update_given_submission(self, submission):
    """Insert one fetched submission, or refresh the score of its stored row.

    The base36 ``submission.id`` is decoded with ``base36decode``. When
    ``self.db_manager`` has no row for that id the submission is inserted.
    Otherwise the stored row is loaded and written back only if its score
    differs from the freshly fetched one.

    :param submission: object exposing a base36 ``id`` that
        ``Submission.from_reddit_api`` accepts.
    :raises AssertionError: if a known id does not match exactly one row.
    """
    db = self.db_manager
    row_id = base36decode(submission.id)
    is_stored = db.row_exists(row_id)
    fresh = Submission.from_reddit_api(submission)

    if not is_stored:
        db.insert_submission(fresh)
        return

    rows = db.query('SELECT * FROM submissions WHERE id = %d;' % row_id)
    stored = [Submission(row) for row in rows]
    assert len(stored) == 1

    current = stored[0]
    if current.score != fresh.score:
        # Keep every stored column except the score.
        current.score = fresh.score
        db.replace_submission(current)
def main():
    """Load Imgur information for every submission that has no media JSON yet.

    Each matching submission is printed, passed to
    ``Imgur.load_imgur_information_for_submission`` and, when that call
    succeeds, written back to the database. A submission whose lookup fails
    is reported and left untouched.
    """
    query = 'SELECT * FROM submissions WHERE media_json IS NULL;'
    m = DatabaseManager(DATABASE_PATH)
    submissions = [Submission(x) for x in m.query(query)]

    for submission in submissions:
        # Single-argument print keeps the emitted text stable.
        print("%s %s" % ("ID: ", submission.id))
        print("%s %s" % ("Title: ", submission.title))
        print("%s %s" % ("Selftext: ", submission.self_text))
        print("%s %s" % ("URL: ", submission.url))

        try:
            Imgur.load_imgur_information_for_submission(submission)
        except Exception as error:
            # Best effort: skip this row but say why. KeyboardInterrupt and
            # SystemExit are not caught, so the run can still be aborted.
            print("Imgur lookup failed, skipping submission: %r" % (error,))
            continue

        print("------------------------------------------------------")
        print("%s %s" % ("Media JSON: ", submission.media_json))
        print("--------------------------------------------------------------------------------")
        m.replace_submission(submission)
def main():
    """Interactively clear the stored analysis of a single submission.

    Prompts for a submission id, looks the row up, prints it, then resets
    ``previous_weight_lbs``, ``current_weight_lbs``, ``age`` and
    ``height_in`` to ``None`` and writes the row back.

    :raises ValueError: if the entered id is empty or not alphanumeric.
    :raises AssertionError: if the id does not match exactly one row.
    """
    submission_id = raw_input("Enter submission id to clear analysis: ").strip()

    # The id is spliced into the SQL text below, so anything other than
    # letters and digits (quotes, semicolons, ...) is rejected up front.
    if not submission_id.isalnum():
        raise ValueError("Invalid submission id: %r" % (submission_id,))

    query = 'SELECT * FROM submissions WHERE id = "%s";' % submission_id
    m = DatabaseManager(DATABASE_PATH)
    submissions = [Submission(x) for x in m.query(query)]
    assert len(submissions) == 1

    for submission in submissions:
        # Single-argument print keeps the emitted text stable.
        print("%s %s" % ("Title: ", submission.title))
        print("%s %s" % ("Selftext: ", submission.self_text))
        print("%s %s" % ("URL: ", submission.url))
        print("%s %s" % ("Media JSON: ", submission.media_json))
        print("--------------------------------------------------------------------------------")

        submission.previous_weight_lbs = None
        submission.current_weight_lbs = None
        submission.age = None
        submission.height_in = None
        m.replace_submission(submission)
def __update_given_submissions(self, submissions):
    """Store each fetched submission, or refresh the score of a stored one.

    Submissions whose ``id`` is unknown to ``self.image_manager`` are
    inserted. For known ids the stored row is loaded and written back only
    when its score differs from the freshly fetched one.

    :param submissions: iterable of objects exposing an ``id`` that
        ``Submission.from_reddit_api`` accepts.
    :raises AssertionError: if a known id does not match exactly one row.
    """
    # Look up and write back through the same manager that answers the
    # existence check, so all three operations hit one store.
    # NOTE(review): assumes image_manager also offers query() and
    # replace_submission() -- confirm against its class.
    manager = self.image_manager

    for submission in submissions:
        if not manager.row_exists(submission.id):
            manager.insert_submission(Submission.from_reddit_api(submission))
            continue

        # The submission already exists: update its score if it changed.
        new_submission = Submission.from_reddit_api(submission)
        query = 'SELECT * FROM submissions WHERE id = "%s";' % submission.id
        existing_submissions = [Submission(x) for x in manager.query(query)]
        assert len(existing_submissions) == 1

        existing_submission = existing_submissions[0]
        if existing_submission.score != new_submission.score:
            existing_submission.score = new_submission.score
            manager.replace_submission(existing_submission)
def main():
    """Print the stored classification of every unreviewed, fully classified row.

    Selects submissions that are neither manually marked nor manually
    verified and whose gender, age, height, previous weight and current
    weight are all present, then prints them. Nothing is written back.
    """
    query = ('SELECT * FROM submissions WHERE manually_marked = 0 and '
             'manually_verified = 0 and gender IS NOT NULL and age IS NOT NULL and '
             'height_in IS NOT NULL and previous_weight_lbs IS NOT NULL and '
             'current_weight_lbs IS NOT NULL;')
    manager = DatabaseManager(DATABASE_PATH)
    rows = [Submission(record) for record in manager.query(query)]

    for entry in rows:
        print(entry.id)
        print("%s %s" % ("TITLE:", entry.title))
        print("CLASSIFICATION: ")

        # Label/value pairs, rendered on one line in this order.
        labelled = (
            ("gender: ", entry.gender),
            ("age: ", entry.age),
            ("height_in: ", entry.height_in),
            ("previous_weight: ", entry.previous_weight_lbs),
            ("current_weight: ", entry.current_weight_lbs),
        )
        print(" ".join("%s %s" % pair for pair in labelled))
        print("----------------------------------------------------------------------------------------")
def run_test():
    """Run the analyzer on one hard-coded submission and print the result.

    Loads submission ``1hncxw``, feeds its title and self text to
    ``RedditAnalyzer`` and prints the regular classification, the
    low-confidence classification and the potential weights.

    :raises AssertionError: if the id does not match exactly one row.
    """
    submission_id = "1hncxw"
    query = 'SELECT * FROM submissions WHERE id="%s"' % submission_id
    manager = DatabaseManager(DATABASE_PATH)

    matches = [Submission(record) for record in manager.query(query)]
    assert len(matches) == 1
    target = matches[0]

    # Titles look like: M/28/5'7" Day 1, goal is to look as great as I feel!
    analyzer = RedditAnalyzer(target.title, target.self_text)

    print("%s %s" % ("Title: ", target.title))
    print("%s %s" % ("Self text: ", target.self_text))
    print("")

    classification = "CLASSIFICATION: " + analyzer.get_debug_str()
    print(t.bold(t.red(classification)))

    low_confidence = "LOW CONFIDENCE CLASSIFICATION: " + analyzer.get_lc_debug_str()
    print(t.bold(t.red(low_confidence)))

    weights = ','.join(str(weight) for weight in analyzer.potential_weights)
    print(t.bold(t.green("Potential weights:" + weights)))
def main():
    """Wipe the classification of every unreviewed submission that has media.

    For each row that is neither manually marked nor manually verified and
    has media JSON, gender, age, current weight, previous weight and height
    are reset to ``None`` and the row is written back.
    """
    query = 'SELECT * FROM submissions WHERE manually_marked = 0 and manually_verified = 0 and media_json NOT NULL;'
    manager = DatabaseManager(DATABASE_PATH)
    rows = [Submission(record) for record in manager.query(query)]

    # Fields cleared on every selected row, in assignment order.
    cleared_fields = (
        "gender",
        "age",
        "current_weight_lbs",
        "previous_weight_lbs",
        "height_in",
    )
    for entry in rows:
        for field in cleared_fields:
            setattr(entry, field, None)
        manager.replace_submission(entry)
def update_all_reddits(self, smallest_id, largest_id):
    """Download the submissions in ``[smallest_id, largest_id)`` that are not stored yet.

    Ids are walked in ascending order. Ids missing from the database are
    queued in batches of up to ``ENTRIES_TO_FETCH`` and requested through
    reddit's ``by_id`` endpoint. Every returned submission is stored via
    ``__update_given_submission``. Every queued id that was not returned
    and is still missing is recorded as a non-existent submission.

    :param smallest_id: first integer (decoded base36) id to consider.
    :param largest_id: integer id at which to stop (exclusive).
    """
    # See http://www.reddit.com/dev/api -- "t3_" is the prefix for a link, e.g.
    # http://www.reddit.com/by_id/t3_zcd40,t3_zcd41,t3_zcd42,t3_zcd43/.json
    print("%s %s %s %s" % (
        "Attempting to download all reddit submissions between id: ",
        smallest_id, " and ", largest_id))

    batches_started = 0
    entries_written = 0
    entries_non_existent = 0

    while smallest_id < largest_id:
        # Progress line on the first batch and every 50th batch after it.
        if batches_started % 50 == 0:
            print("%s %s %s %s %s %s" % (
                "Entries written: ", entries_written,
                " [Non-existent: ", entries_non_existent,
                "] - on id: ", smallest_id))
        batches_started += 1

        # Queue up ids which the database does not currently contain.
        submissions_to_fetch_int = set()
        while smallest_id < largest_id and len(submissions_to_fetch_int) < ENTRIES_TO_FETCH:
            if not self.db_manager.row_exists(smallest_id):
                submissions_to_fetch_int.add(smallest_id)
            smallest_id += 1

        if not submissions_to_fetch_int:
            # Nothing was queued, so there is nothing to ask reddit for;
            # a request without ids would be meaningless.
            break

        # Create the URL for the query.
        fullnames = ["t3_" + base36encode(s) for s in sorted(submissions_to_fetch_int)]
        url = "http://www.reddit.com/by_id/" + ','.join(fullnames)

        # Query for the submissions.
        submissions = None
        try:
            response = self.r.request_json(url, params={'limit': 100}, data=None,
                                           as_objects=True, retry_on_error=True)
            submissions = response['data']['children']
        except Exception as error:
            # KeyboardInterrupt/SystemExit are deliberately not caught, so
            # aborting mid-request can no longer fall through to the
            # "mark as non-existent" step below.
            # NOTE(review): a failed request still marks the whole batch as
            # non-existent -- confirm that is intended for transient errors.
            print("%s %s" % ("Error when trying to fetch url: ", url))
            print("Reason: %r" % (error,))

        submissions_fetched_int = set()
        if submissions:
            for submission in submissions:
                self.__update_given_submission(submission)
                entries_written += 1
                submissions_fetched_int.add(base36decode(submission.id))

        # Mark nonexistent entries: queued ids that reddit did not return.
        submissions_not_fetched = submissions_to_fetch_int.difference(submissions_fetched_int)
        for submission_id in submissions_not_fetched:
            if not self.db_manager.row_exists(submission_id):
                non_existent_entry = Submission.non_existent_submission(submission_id)
                self.db_manager.insert_submission(non_existent_entry)
                entries_non_existent += 1
def main():
    """Interactively collect weights for submissions that state gender and height only.

    Walks the unreviewed submissions with media, highest score first. For
    each one where ``RedditAnalyzer`` finds a gender and a height but no
    current weight, the submission is shown and the operator is prompted.

    * Entering ``XXX`` as previous weight flags the row as marked and
      verified without weight data, and saves it.
    * Leaving the current weight empty saves nothing for that row.
    * Otherwise the weights plus the analyzer's gender, height and age are
      stored and the row is flagged as marked and verified.

    A weight that is not a whole number is reported and the row is left
    unsaved, so it shows up again on the next run.
    """
    query = 'SELECT * FROM submissions WHERE manually_marked = 0 and manually_verified = 0 and media_json NOT NULL ORDER BY score DESC;'
    m = DatabaseManager(DATABASE_PATH)
    submissions = [Submission(x) for x in m.query(query)]

    previous_stats = ""
    global_start_time = time.time()
    entries_processed = 0

    for submission in submissions:
        r = RedditAnalyzer(submission.title, submission.self_text)
        if not (r.has_gender() and r.has_height() and not r.has_current_weight()):
            continue

        local_start_time = time.time()
        entries_processed += 1

        # Single-argument print keeps the emitted text stable.
        print("%s %s" % ("ID: ", submission.id))
        print("%s %s" % ("Title: ", t.bold(submission.title)))
        print("%s %s" % ("Self text: ", submission.self_text))
        print("%s %s" % ("URL: ", submission.url))
        print("%s %s" % ("Score: ", submission.score))
        print(t.bold(t.red("CLASSIFICATION: " + r.get_debug_str())))
        print(t.bold(t.red("LOW CONFIDENCE CLASSIFICATION: " + r.get_lc_debug_str())))
        print(t.bold(t.green(previous_stats)))
        print("NOTE: If current weight is skipped, nothing will be saved.")

        previous_weight = raw_input('Enter previous weight: ')
        if previous_weight == "XXX":
            # This is a bad entry without adequate weight data: flag it so
            # it is not offered again.
            submission.manually_marked = 1
            submission.manually_verified = 1
            m.replace_submission(submission)
            continue

        current_weight = raw_input('Enter current weight: ')
        if previous_weight:
            print("%s %s" % ("Entered previous weight of: ", previous_weight))
        if current_weight:
            print("%s %s" % ("Entered current weight of: ", current_weight))

        if current_weight:
            # Convert both values before touching the submission, so a typo
            # cannot leave it half-updated or crash the session.
            try:
                current_lbs = int(current_weight)
                previous_lbs = int(previous_weight) if previous_weight else None
            except ValueError:
                print("Weights must be whole numbers; nothing was saved for this entry.")
                continue

            submission.current_weight_lbs = current_lbs
            if previous_lbs is not None:
                submission.previous_weight_lbs = previous_lbs

            # Either the regular or the low confidence value is set (the
            # filter above required gender and height), so the fallbacks
            # below always yield a value.
            if r.gender_is_female is None:
                r.gender_is_female = r.lc_gender_is_female
            if not r.height_in:
                r.height_in = r.lc_height_in
            submission.gender = r.gender_is_female
            submission.height_in = r.height_in
            if r.age:
                submission.age = r.age

            submission.manually_marked = 1
            submission.manually_verified = 1
            assert verify_submission_meets_criteria(submission)
            m.replace_submission(submission)

        # Timing statistics, shown now and again above the next entry.
        local_end_time = time.time()
        elapsed = local_end_time - local_start_time
        entries_per_second_so_far = entries_processed / (local_end_time - global_start_time)
        previous_stats = "\n"
        previous_stats += "Entry took " + str(round(elapsed, 2)) + " seconds.\n"
        previous_stats += "Rate: " + str(round((3600 / elapsed), 2)) + " entries / hr.\n"
        previous_stats += "Ongoing Rate: " + str(round(entries_per_second_so_far * 3600, 2)) + " entries / hr."
        print(previous_stats)
        print("")
        print("---------------------------------------------------------------------")
def analyze_all_progress_pics():
    """Auto-classify unreviewed submissions with media and print a summary.

    Every selected submission is run through ``RedditAnalyzer``. When
    gender, height and current weight are all available, the submission's
    gender, height, current weight and previous weight are filled in
    (regular value first, low-confidence value as fallback) and the row is
    saved. Submissions with gender and height but no current weight are
    printed for inspection. Counters are printed at the end.
    """
    # Only submissions that have media but no manual classification matter.
    query = 'SELECT * FROM submissions WHERE manually_marked = 0 and manually_verified = 0 and media_json NOT NULL;'
    manager = DatabaseManager(DATABASE_PATH)
    rows = [Submission(record) for record in manager.query(query)]

    # (submission attribute, analyzer attribute, low-confidence fallback)
    field_sources = (
        ("gender", "gender_is_female", "lc_gender_is_female"),
        ("height_in", "height_in", "lc_height_in"),
        ("current_weight_lbs", "current_weight", "lc_current_weight"),
        ("previous_weight_lbs", "previous_weight", "lc_previous_weight"),
    )

    total = len(rows)
    classifications = 0
    weight_and_height = 0
    atleast_height = 0

    for entry in rows:
        # Titles look like: M/28/5'7" Day 1, goal is to look as great as I feel!
        r = RedditAnalyzer(entry.title, entry.self_text)

        # This condition is what restricts the pass to progress pictures.
        if r.has_gender() and r.has_height() and r.has_current_weight():
            for target, primary, fallback in field_sources:
                # Prefer the regular value; the low-confidence attribute is
                # read only when the regular one is missing.
                value = getattr(r, primary)
                if value is None:
                    value = getattr(r, fallback)
                setattr(entry, target, value)
            manager.replace_submission(entry)
            classifications += 1

        if r.has_current_weight() and r.has_height() and r.get_debug_str():
            weight_and_height += 1
        if r.has_height():
            atleast_height += 1

        # Show the entries that still lack a current weight.
        if r.has_gender() and r.has_height() and not r.has_current_weight():
            print("%s %s" % ("Title: ", entry.title))
            print("%s %s" % ("Self text: ", entry.self_text))
            print("")
            print(t.bold(t.red("CLASSIFICATION: " + r.get_debug_str())))
            print(t.bold(t.red("LOW CONFIDENCE CLASSIFICATION: " + r.get_lc_debug_str())))
            weights = ','.join(str(weight) for weight in r.potential_weights)
            print(t.bold(t.green("Potential weights:" + weights)))
            print("---------------------------------------------------------------------")

    print("stats:")
    print(HITS_stats)
    print("%s %s %s %s" % (
        "classifications: ", classifications, " out of a total: ", total))
    print("%s %s" % ("Only weight and height: ", weight_and_height))
    print("%s %s" % ("Atleast height: ", atleast_height))