def fetch_submissions(start_epoch, end_epoch, subreddits, csv_writer):
    r = praw.Reddit(user_agent=REDDIT_USER_AGENT)
    seconds_to_increment = SECONDS_IN_A_DAY * FLAGS.step_days
    total_records_fetched = 0
    for subreddit in subreddits:
        for i in xrange(start_epoch, end_epoch, seconds_to_increment):
            segment_start = i
            segment_end = segment_start + min(seconds_to_increment,
                                              end_epoch - i)
            query = 'timestamp:%d..%d' % (segment_start, segment_end)
            results = list(r.search(query, subreddit=subreddit, sort='new',
                                    limit=None, syntax='cloudsearch'))
            if len(results) > RESULTS_CORRECTNESS_CHECK:
                print ("WARNING: received %i results. This is dangerously "
                       "close to the max number of allowed results (1000)."
                       % len(results))
            for result in results:
                # Store the score as a string so every field in the row can
                # be written to the CSV uniformly.
                submission = Submission(result.id, result.title,
                                        result.selftext, result.url,
                                        result.permalink,
                                        unicode(result.score), subreddit)
                csv_writer.writerow(submission_to_unicode(submission))
            total_records_fetched += len(results)
            segment_start_string = epoch_to_date_string(segment_start)
            segment_end_string = epoch_to_date_string(segment_end)
            print ("[%s] - from %s to %s fetched %i results. "
                   "Total submissions saved: %i"
                   % (subreddit, segment_start_string, segment_end_string,
                      len(results), total_records_fetched))
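# fetch_submissions above relies on a few helpers defined elsewhere in this
# repo. The sketches below are assumptions rather than the repo's actual
# implementations: Submission's field order is inferred from the constructor
# call above, submission_to_unicode is assumed to UTF-8-encode each field for
# Python 2's byte-oriented csv writer, and epoch_to_date_string's exact date
# format is a guess.
import collections
import time

Submission = collections.namedtuple(
    'Submission',
    ['id', 'title', 'selftext', 'url', 'permalink', 'score', 'subreddit'])


def submission_to_unicode(submission):
    # Python 2's csv module writes byte strings, so encode every field as
    # UTF-8 before handing the row to the writer.
    return [s.encode('utf-8') for s in submission]


def epoch_to_date_string(epoch):
    # Render a Unix timestamp as a human-readable UTC date.
    return time.strftime('%Y-%m-%d', time.gmtime(epoch))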
def run():
    # Test apparatus:
    # Open a file, read the titles, try to extract content, and output a CSV
    # with the extracted content.
    # TODO: Move this logic into a well-designed class for general-purpose
    # .yaml-based extractors.
    with open('rules.yaml', 'r') as rules_file:
        rules = yaml.load(rules_file)
    variables = rules['variables']
    feature_extractors_single = get_feature_extractor(
        'feature_extractors_single', rules, variables)
    feature_extractors_list = get_feature_extractor(
        'feature_extractors_list', rules, variables)
    feature_extractors_boolean = get_feature_extractor(
        'feature_extractors_boolean', rules, variables)

    input_file = open(FLAGS.input, 'r')
    output_file = open(FLAGS.output, 'w')
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(ProcessedSubmission._fields)
    csv_reader = csv.reader(input_file)
    csv_reader.next()  # Skip the header row.
    number_completed_rows = 0
    for row in csv_reader:
        submission = submission_from_csv_row(row)
        all_extracted_variables = {}
        text_to_process = {
            'title': submission.title,
            'selftext': submission.selftext,
            'url': submission.url,
        }
        debug_mode = False
        if FLAGS.debug_id == submission.id:
            print "Debug printing for --debug_id=%s" % FLAGS.debug_id
            debug_mode = True
        for location, text in text_to_process.iteritems():
            # Decode so we can match unicode characters in the text we are
            # parsing. (Still on Python 2 until gflags supports Python 3.)
            text = text.decode('utf-8')
            # Extract single-valued features.
            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_single):
                order = extractor_dict['order']
                if location not in extractor_dict['locations']:
                    continue
                pattern = extractor_dict['pattern'].decode('utf-8')
                if debug_mode:
                    print "extractor [%i, %s]" % (order, extractor_name)
                    print "  text:   ", text
                    print "  pattern:", pattern
                m = re.search(pattern, text, re.UNICODE)
                if m is not None:
                    extracted_variables = m.groupdict()
                    if debug_mode:
                        print "  extractor returned:", extracted_variables
                    # Existing values in all_extracted_variables take priority
                    # over newly extracted ones. This lets us run extractors
                    # from most to least confident without a less confident
                    # extractor overwriting an earlier result.
                    extracted_variables.update(all_extracted_variables)
                    all_extracted_variables = extracted_variables
                elif debug_mode:
                    print "  extractor returned no matches."
            if debug_mode:
                print "----------------------------------------------"

            # Extract list-valued features.
            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_list):
                if location not in extractor_dict['locations']:
                    continue
                pattern = extractor_dict['pattern'].decode('utf-8')
                matches = re.findall(pattern, text, re.UNICODE)
                assert isinstance(matches, list)
                if extractor_name not in all_extracted_variables:
                    all_extracted_variables[extractor_name] = set()
                for match in matches:
                    all_extracted_variables[extractor_name].add(match)

            # Set boolean features.
            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_boolean):
                if location not in extractor_dict['locations']:
                    continue
                # By default, every boolean feature is False.
                if extractor_name not in all_extracted_variables:
                    all_extracted_variables[extractor_name] = False
                pattern = extractor_dict['pattern'].decode('utf-8')
                m = re.search(pattern, text, re.UNICODE)
                if m is not None:
                    all_extracted_variables[extractor_name] = True

        (complete, height_in, start_weight_lbs, end_weight_lbs,
         gender_is_female, age, imgur_images,
         imgur_albums) = process_extracted_variables(all_extracted_variables,
                                                     debug_mode)
        if debug_mode:
            print all_extracted_variables
        if complete:
            number_completed_rows += 1
        if FLAGS.output_only_complete and not complete:
            continue
        processed_submission = ProcessedSubmission(
            complete, height_in, start_weight_lbs, end_weight_lbs,
            gender_is_female, age, imgur_images, imgur_albums, *submission)
        # Output to CSV.
        csv_writer.writerow(submission_to_unicode(processed_submission))
    input_file.close()
    output_file.close()
    print "number_completed_rows: ", number_completed_rows
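# run() above expects rules.yaml to define a 'variables' map plus one section
# per extractor kind; each section maps an extractor name to a dict with
# 'order', 'locations', and 'pattern' keys. That structure is inferred from
# the lookups in run(). The extractor names and regexes below are
# hypothetical, and how 'variables' fragments are spliced into patterns is
# left to get_feature_extractor, which is not shown in this section:
#
#   variables:
#     NUM: '\d+(?:\.\d+)?'
#   feature_extractors_single:
#     start_weight_single:
#       order: 1
#       locations: [title, selftext]
#       pattern: '(?P<start_weight_lbs>\d{2,3}) ?lbs?'
#   feature_extractors_list:
#     imgur_images:
#       order: 1
#       locations: [selftext, url]
#       pattern: 'imgur\.com/(\w+)'
#   feature_extractors_boolean:
#     gender_is_female:
#       order: 1
#       locations: [title]
#       pattern: '\bF\b'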
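# feature_extractors_in_order is also defined elsewhere in the repo. A
# plausible sketch, assuming each extractor section is a dict of
# name -> extractor dict and that lower 'order' values mean higher
# confidence:
def feature_extractors_in_order(extractors):
    # Yield (name, extractor_dict) pairs sorted by the 'order' key, so the
    # priority-preserving merge in run() sees the most confident extractors
    # first.
    return sorted(extractors.iteritems(), key=lambda item: item[1]['order'])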