# Standard-library and third-party imports used by the functions below. The
# Django models (SampledRedditThread, SampledStackOverflowPost,
# StackOverflowAnswer) and the project helpers batch_qs, plot_bar and
# give_author_processor are assumed to be imported elsewhere in this module.
import csv
import operator
import os
import time
from collections import defaultdict

import praw
from django.db.models import Max


def so_special(treatment_feature, extra_filter):
    """
    Build treatment and control pageview lists for Stack Overflow answers,
    keeping at most one answer per parent question in each group.
    Note: ``treatment_feature`` is not used in the body.
    """
    if extra_filter:
        qs1 = SampledStackOverflowPost.objects.filter(
            has_wiki_link=True, sample_num__in=[0, 1, 2],
            has_c_wiki_link=True).order_by('uid')
        qs2 = SampledStackOverflowPost.objects.filter(
            has_wiki_link=True, sample_num__in=[0, 1, 2],
            has_c_wiki_link=False).order_by('uid')
    else:
        qs1 = SampledStackOverflowPost.objects.filter(
            sample_num=0, has_wiki_link=True).order_by('uid')
        qs2 = SampledStackOverflowPost.objects.filter(
            sample_num=0, has_wiki_link=False).order_by('uid')
    # sets give constant-time membership checks on the question ids
    treat_question_ids = set()
    control_question_ids = set()
    start_time = time.time()
    count = defaultdict(int)
    treat = []
    control = []
    for start, end, total, batch in batch_qs(qs1, batch_size=10000):
        print('qs1', start, end, total, time.time() - start_time)
        for obj in batch:
            ans = StackOverflowAnswer.objects.using('secondary').get(
                id=obj.uid)
            question_id = ans.parent_id
            if question_id not in treat_question_ids:
                treat.append(obj.num_pageviews)
                count['treatment_total'] += obj.num_pageviews
                count['treatment_count'] += 1
                treat_question_ids.add(question_id)
            else:
                # another answer to this question already counts as treatment
                count['dropped_treatment_total'] += obj.num_pageviews
                count['dropped_treatment_count'] += 1
    for start, end, total, batch in batch_qs(qs2, batch_size=10000):
        print('qs2', start, end, total, time.time() - start_time)
        for obj in batch:
            ans = StackOverflowAnswer.objects.using('secondary').get(
                id=obj.uid)
            question_id = ans.parent_id
            if question_id in treat_question_ids:
                # the question already contributed to the treatment group
                count['dropped_control_total'] += obj.num_pageviews
                count['dropped_control_count'] += 1
                continue
            if question_id not in control_question_ids:
                control.append(obj.num_pageviews)
                count['control_total'] += obj.num_pageviews
                count['control_count'] += 1
                control_question_ids.add(question_id)
            else:
                count['dropped_control_total'] += obj.num_pageviews
                count['dropped_control_count'] += 1
    print(count)
    return treat, control
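# `batch_qs` is a project helper that is not defined in this file. The sketch
# below is an assumption inferred from how it is called here: it slices an
# ordered queryset into chunks and yields (start, end, total, batch) tuples.
# The name, default batch size, and exact behaviour are guesses, not the
# project's actual implementation.
def batch_qs_sketch(qs, total=None, batch_size=1000):
    """Yield (start, end, total, batch) slices of an ordered queryset."""
    if total is None:
        total = qs.count()
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        yield start, end, total, qs[start:end]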
def tags_frequency_distribution(qs):
    """
    Takes a qs and figures out the distribution of tags across its posts.

    This is more complicated than the generic frequency distribution because
    each post can carry as many tags as its author likes.
    """
    num_threads = qs.count()
    title = 'Identifying tag distribution for {} threads'.format(num_threads)
    print(title)
    tag_to_count = defaultdict(int)
    qs = qs.order_by('uid')
    for start, end, total, batch in batch_qs(qs, num_threads, 1000):
        print('Processing threads {} to {} of {}'.format(start, end, total))
        for thread in batch:
            # tags_string is pipe-delimited, one entry per tag
            tags = thread.tags_string.split('|')
            for tag in tags:
                tag_to_count[tag] += 1
    sorted_tag_to_count = sorted(tag_to_count.items(),
                                 key=operator.itemgetter(1), reverse=True)
    rows = []
    # report the 25 most common tags and their share of all threads
    for i, val_tup in enumerate(sorted_tag_to_count[:25]):
        val = val_tup[0]
        count = tag_to_count[val]
        percent = count / num_threads * 100
        print(i, val_tup, percent)
        rows.append([i, val_tup, percent])
    # the report file is named after the title string
    with open(title, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)
def bulk_save():
    """Runs through all the rows and re-saves to trigger computation"""
    reddit = SampledRedditThread.objects.all().order_by('uid')
    stack = SampledStackOverflowPost.objects.all().order_by('uid')
    start_time = time.time()
    for start, end, total, batch in batch_qs(reddit, batch_size=10000):
        print('reddit', start, end, total, time.time() - start_time)
        for item in batch:
            item.save()
    # reset the timer for the Stack Overflow pass
    start_time = time.time()
    for start, end, total, batch in batch_qs(stack, batch_size=10000):
        print('stack', start, end, total, time.time() - start_time)
        for item in batch:
            item.save()
def get_method_outputs(qs, method_name):
    """Call the named zero-argument model method on each object and return the results"""
    vals = []
    qs = qs.order_by('uid')
    for _, _, _, batch in batch_qs(qs):
        for item in batch:
            vals.append(getattr(item, method_name)())
    return vals
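# Hypothetical usage sketch for get_method_outputs. The queryset filter and the
# 'wiki_link_section' method name are illustrative assumptions, not attributes
# confirmed by this module.
def example_method_outputs():
    """Collect one model method's result across all wiki-linked SO posts."""
    qs = SampledStackOverflowPost.objects.filter(has_wiki_link=True)
    return get_method_outputs(qs, 'wiki_link_section')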
def save_posts(sample_num=None):
    """
    Re-compute the rows that have Wikipedia links by re-saving them.

    `sample_num` may be None (defaults to samples 0, 1 and 2) or a
    comma-separated string such as '0,1,2'.
    """
    print('saving posts... (slow)')
    if sample_num is None:
        sample_num = [0, 1, 2]
    else:
        sample_num = [int(x) for x in sample_num.split(',')]
    reddit = SampledRedditThread.objects.filter(
        has_wiki_link=True, sample_num__in=sample_num).order_by('uid')
    for start, end, total, batch in batch_qs(reddit):
        print('reddit', start, end, total)
        for item in batch:
            item.save()
    stack = SampledStackOverflowPost.objects.filter(
        has_wiki_link=True, sample_num__in=sample_num).order_by('uid')
    for start, end, total, batch in batch_qs(stack):
        print('stack', start, end, total)
        for item in batch:
            item.save()
def extract_vals_and_method_results(qs, field_names):
    """Extract either stored values or method results from a Django queryset.

    The queryset should already be ordered, since batch_qs slices it.
    """
    rows = []
    for _, _, _, batch in batch_qs(qs, batch_size=1000):
        for obj in batch:
            row = []
            for field_name in field_names:
                try:
                    # callable attribute: a zero-argument model method
                    val = getattr(obj, field_name)()
                except TypeError:
                    # plain field value
                    val = getattr(obj, field_name)
                row.append(val)
            rows.append(row)
    return rows
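# Hypothetical usage sketch for extract_vals_and_method_results. The field
# names 'score' and 'num_comments' are illustrative guesses; only 'uid' is
# known from the rest of this module.
def example_extract_rows():
    """Pull a few fields from wiki-linked reddit threads into csv-ready rows."""
    qs = SampledRedditThread.objects.filter(
        has_wiki_link=True).order_by('uid')
    return extract_vals_and_method_results(qs, ['uid', 'score', 'num_comments'])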
def frequency_distribution(qs, field, qs_name, extractor=None):
    """
    Takes a qs and figures out how often each value of `field` occurs,
    e.g. which base urls the links go to.
    """
    num_threads = qs.count()
    title = 'Frequency Distribution of {} in subset "{}" ({} threads)'.format(
        field, qs_name, num_threads)
    filename = "{}_{}_{}.csv".format(field, qs_name, num_threads)
    print(title)
    val_to_count = defaultdict(int)
    qs = qs.order_by('uid')
    start_time = time.time()
    for start, end, total, batch in batch_qs(qs, num_threads, 10000):
        stamp = time.time()
        for thread in batch:
            vals = [getattr(thread, field)]
            if extractor is not None:
                # an extractor maps the raw field value to a list of values
                vals = extractor(vals[0])
            for val in vals:
                val_to_count[val] += 1
        print('Finished threads {} to {} of {}. Took {}'.format(
            start, end, total, time.time() - stamp))
    print('Running time: {}'.format(time.time() - start_time))
    print(len(val_to_count.keys()))
    sorted_val_to_count = sorted(val_to_count.items(),
                                 key=operator.itemgetter(1), reverse=True)
    plot_bar(sorted_val_to_count[:20], title, filename)
    rows = []
    for i, val_tup in enumerate(sorted_val_to_count):
        count = val_to_count[val_tup[0]]
        percent = count / num_threads * 100
        print(i, val_tup, percent)
        rows.append([i, val_tup, percent])
    with open('csv_files/' + filename, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)
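# Hypothetical usage sketch for frequency_distribution with an extractor. The
# 'url' field and the subset label are illustrative assumptions, not fields
# confirmed by this module; an extractor just has to turn the stored value into
# a list of values to count.
def example_url_domain_distribution():
    """Count how often each hostname shows up among wiki-linked reddit threads."""
    from urllib.parse import urlparse

    def domains(url_val):
        # treat the stored value as a single url and count its hostname
        return [urlparse(url_val).netloc]

    frequency_distribution(
        SampledRedditThread.objects.filter(has_wiki_link=True),
        'url', 'reddit_wiki_links', extractor=domains)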
def mark_top_answers():
    """Marks answers whose score ties the highest score on their question."""
    qs = SampledStackOverflowPost.objects.all().order_by('uid')
    for start, end, total, batch in batch_qs(qs, batch_size=10000):
        print(start, end, total)
        for answer in batch:
            try:
                question_id = StackOverflowAnswer.objects.using(
                    'secondary').filter(
                        id=answer.uid).values('parent_id')[0]['parent_id']
                other_answers = StackOverflowAnswer.objects.using(
                    'secondary').filter(parent_id=question_id)
                max_score = other_answers.aggregate(Max('score'))['score__max']
                print(max_score)
                if answer.score == max_score:
                    print('marking top answer as true woohoo!')
                    answer.is_top = True
                    answer.save()
            except Exception as err:
                # most likely the parent question is missing from the secondary DB
                print(err)
                print('MISSING QUESTION UH OH')
def main(do_all=False):
    """Driver: fetch reddit author info for sampled threads and store it."""
    reddit = praw.Reddit(client_id=os.environ["CLIENT_ID"],
                         client_secret=os.environ["CLIENT_SECRET"],
                         user_agent=os.environ["UA"])
    processor = give_author_processor(reddit)
    if do_all:
        print('Reprocessing all reddit authors')
        qs = SampledRedditThread.objects.all()
    else:
        print('will only process new samples')
        qs = SampledRedditThread.objects.filter(user_info_processed=False)
    qs = qs.order_by('uid')
    while qs.exists():
        start_time = time.time()
        for start, end, total, batch in batch_qs(qs):
            print(start, end, total, time.time() - start_time)
            for thread in batch:
                author_dict = processor(thread.author)
                for key, val in author_dict.items():
                    setattr(thread, key, val)
                thread.user_info_processed = True
                thread.save()
        if do_all:
            # the unfiltered queryset never empties, so one full pass is enough
            break