def answers_with_max_views(limit=20):
    """Print the `limit` most viewed answers together with their view counts."""
    top_answers = Answer.select().order_by(Answer.views.desc()).limit(limit)
    print('Top %d Most viewed answers - ' % limit)
    for ans in top_answers:
        url = ANSWER_URL.format(ans.question, ans.writer_uname).encode("UTF-8")
        print("{0} ({1} views)".format(url, ans.views))
def answers_with_no_upvotes(limit=20):
    """Print the most viewed answers that have zero upvotes."""
    zero_upvote = (Answer
                   .select()
                   .where(Answer.upvotes == 0)
                   .order_by(Answer.views.desc())
                   .limit(limit))
    print("Answers max views but no upvotes - ")
    for ans in zero_upvote:
        link = ANSWER_URL.format(ans.question, ans.writer_uname).encode("UTF-8")
        print("{0} ({1} views)".format(link, ans.views))
def answers_with_max_views(limit=20):
    """Show the top `limit` answers ranked by view count."""
    ranked = Answer.select().order_by(Answer.views.desc()).limit(limit)
    print("Top %d Most viewed answers - " % limit)
    for entry in ranked:
        answer_url = ANSWER_URL.format(entry.question,
                                       entry.writer_uname).encode("UTF-8")
        print("{0} ({1} views)".format(answer_url, entry.views))
def answers_with_no_upvotes(limit=20):
    """List high-view answers that nobody has upvoted."""
    query = Answer.select().where(Answer.upvotes == 0)
    query = query.order_by(Answer.views.desc()).limit(limit)
    print('Answers max views but no upvotes - ')
    for ans in query:
        target = ANSWER_URL.format(ans.question, ans.writer_uname).encode("UTF-8")
        print("{0} ({1} views)".format(target, ans.views))
def answers_with_max_upvotes(limit=20):
    """Print statistics for the `limit` most upvoted answers and plot
    views vs. upvotes to view_upvote.png.

    Fixes vs. original:
      * guards against ZeroDivisionError when an answer has 0 upvotes,
        when the query returns no rows, or when total views is 0;
      * averages are computed over the number of answers actually
        returned, not the requested `limit`.
    """
    answers = Answer.select().order_by(Answer.upvotes.desc()).limit(limit)
    print('Top %d Most upvoted answers - ' % limit)
    total_views = 0
    total_upvotes = 0
    writer = {}          # writer_uname -> number of answers in the top list
    count = 0            # rows actually returned (may be < limit)
    for answer in answers:
        u = ANSWER_URL.format(answer.question, answer.writer_uname).encode("UTF-8")
        # views-per-upvote ratio; an answer with 0 upvotes crashed the original
        ratio = answer.views / answer.upvotes if answer.upvotes else 0
        print("{0} ({1}, {2}, {3})".format(u, answer.upvotes, answer.views, ratio))
        total_upvotes += answer.upvotes
        total_views += answer.views
        writer[answer.writer_uname] = writer.get(answer.writer_uname, 0) + 1
        count += 1
    if count == 0:
        # Nothing to report or plot.
        print("No answers found")
        return
    print("Total Views = {0}".format(total_views))
    print("Total Upvotes = {0}".format(total_upvotes))
    print("Average Views = {0}".format(total_views / count))
    print("Average Upvotes = {0}".format(total_upvotes / count))
    avg_up = (float(total_upvotes) / float(total_views)) * 100 if total_views else 0.0
    print("On an average %.2f viewers upvoted the answer" % avg_up)
    # Writer Stat
    with open('top_writers_2016.json', 'r') as fstream:
        writer_list = json.load(fstream)
    notw = 0
    for w in writer_list:
        if w['uname'] in writer:
            notw += 1
    print("{0} People on this list are Top Writers(2016)".format(notw))
    sorted_writer = sorted(writer.items(), key=operator.itemgetter(1), reverse=True)
    print("Total number of unique writers is {0}".format(len(sorted_writer)))
    total_followers = 0
    total_answers = 0
    for tup in sorted_writer:
        profile = Profile.get(Profile.uname == tup[0])
        total_followers += int(profile.followers)
        total_answers += int(profile.total_answers)
    # count > 0 guarantees at least one writer, so no division by zero here.
    print("Average number of followers of each {0}".format(
        total_followers / len(sorted_writer)))
    print("Average number of answers written by each is {}".format(
        total_answers / len(sorted_writer)))
    # Plotting Graph
    figure = mpplt.figure(figsize=(10, 10))
    plt = figure.add_subplot(1, 1, 1)
    plt.set_title("Views vs Upvote")
    plt.plot([answer.views for answer in answers],
             [answer.upvotes for answer in answers], '.', color='green')
    plt.set_xlabel('Views')
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    plt.set_xlim([0, 1500000])
    plt.set_ylim([10000, 25000])
    plt.set_ylabel('Upvotes')
    figure.savefig('view_upvote.png', facecolor='white', edgecolor='black')
def parse_profile(profile):
    """Fetch a writer's profile page, store their answers and stats, and
    cache the raw HTML on disk."""

    def _num(tag):
        # Counters are rendered with thousands separators, e.g. "1,234".
        return int(tag.string.replace(',', ''))

    resp = get_page(PROFILE_URL.format(profile.uname))
    if resp == 404:
        # Page is gone upstream -> drop the local record and stop.
        delete_profile(profile)
        return
    doc = BeautifulSoup(resp, 'html.parser', parse_only=PROFILE_STRAIN)

    # Persist every answer linked from the profile.
    answer_list = doc.find_all('div', class_=ANSWER_LIST_CLASS)
    for item in answer_list:
        question_tag = item.find('a', class_=QUESTION_LINK_CLASS)
        Answer.create_or_get(question=question_tag['href'], writer=profile)

    # Refresh the cached statistics from the feed counters.
    stats = doc.find_all('span', class_=FEED_NUM_CLASS)
    profile.name = doc.find('div', class_='ProfileNameAndSig').h1.span.string
    profile.total_answers = _num(stats[0])
    profile.questions = _num(stats[1])
    profile.posts = _num(stats[2])
    profile.followers = _num(stats[3])
    profile.following = _num(stats[4])
    profile.edits = _num(stats[5])
    element = doc.find('a', class_=ANSWER_VIEW_CLASS)
    if element is None:
        profile.views = 0
    else:
        profile.views = _num(element.find_all('span')[2])
    profile.last_parsed = datetime.datetime.now()
    profile.save()

    # Keep a copy of the raw HTML alongside the database.
    with open(os.path.join(PROFILE_FOLDER, profile.uname + '.html'), 'w+') as fstream:
        fstream.write(resp)
    try:
        sys.stdout.write(u'\rDone Parsing = %s %d          ' % (profile.uname,
                                                                len(answer_list)))
        sys.stdout.flush()
    except UnicodeEncodeError:
        # Best-effort progress line; some usernames aren't console-encodable.
        pass
def parse_profile(profile):
    """Download a writer's profile page, record their answers, and refresh
    their statistics.

    Side effects: creates Answer rows, updates and saves `profile`, writes
    the raw HTML to PROFILE_FOLDER/<uname>.html, and prints progress to
    stdout.
    """
    # Fetching profile
    resp = get_page(PROFILE_URL.format(profile.uname))
    if resp == 404:
        # get_page signals a missing page with the integer 404; the profile
        # no longer exists upstream, so remove it from the database.
        delete_profile(profile)
        return
    doc = BeautifulSoup(resp, 'html.parser', parse_only=PROFILE_STRAIN)
    # Get Answer List and save in database
    answer_list = doc.find_all('div', class_=ANSWER_LIST_CLASS)
    for answer in answer_list:
        question_tag = answer.find('a', class_=QUESTION_LINK_CLASS)
        Answer.create_or_get(question=question_tag['href'], writer=profile)
    # Update the profile object with data parsed.
    # NOTE(review): assumes the six feed counters appear in this fixed order
    # (answers, questions, posts, followers, following, edits) — an
    # IndexError here means the page layout changed.
    stats = doc.find_all('span', class_=FEED_NUM_CLASS)
    profile.name = doc.find('div', class_='ProfileNameAndSig').h1.span.string
    profile.total_answers = int(stats[0].string.replace(',', ''))
    profile.questions = int(stats[1].string.replace(',', ''))
    profile.posts = int(stats[2].string.replace(',', ''))
    profile.followers = int(stats[3].string.replace(',', ''))
    profile.following = int(stats[4].string.replace(',', ''))
    profile.edits = int(stats[5].string.replace(',', ''))
    element = doc.find('a', class_=ANSWER_VIEW_CLASS)
    if element is None:
        # No "views" link on the page; treat as zero views.
        profile.views = 0
    else:
        profile.views = int(element.find_all('span')[2].string.replace(',', ''))
    profile.last_parsed = datetime.datetime.now()
    profile.save()
    # Saving the HTML code of the profile
    filename = profile.uname + '.html'
    with open(os.path.join(PROFILE_FOLDER, filename), 'w+') as fstream:
        fstream.write(resp)
    try:
        sys.stdout.write(u'\rDone Parsing = %s %d          ' % (profile.uname,
                                                                len(answer_list)))
        sys.stdout.flush()
    except UnicodeEncodeError:
        # Some usernames contain characters the console encoding can't
        # represent; the progress line is best-effort only.
        pass
def add_answerquestion(id_ques):
    """Create an Answer for question `id_ques` from the JSON request body
    and return it serialized."""
    content = request.json['answer_content']
    # New answers always start with zero votes.
    record = Answer(id_ques, content, 0)
    db.session.add(record)
    db.session.commit()
    return answer_schema.jsonify(record)
def answers_with_max_upvotes(limit=20):
    """Summarise the `limit` most upvoted answers, report writer statistics,
    and plot views against upvotes to view_upvote.png."""
    answers = Answer.select().order_by(Answer.upvotes.desc()).limit(limit)
    print("Top %d Most upvoted answers - " % limit)
    total_views = 0
    total_upvotes = 0
    writer = {}  # writer_uname -> answers appearing in this top list
    for ans in answers:
        link = ANSWER_URL.format(ans.question, ans.writer_uname).encode("UTF-8")
        print("{0} ({1}, {2}, {3})".format(link, ans.upvotes, ans.views,
                                           ans.views / ans.upvotes))
        total_upvotes += ans.upvotes
        total_views += ans.views
        writer[ans.writer_uname] = writer.get(ans.writer_uname, 0) + 1
    print("Total Views = {0}".format(total_views))
    print("Total Upvotes = {0}".format(total_upvotes))
    print("Average Views = {0}".format(total_views / limit))
    print("Average Upvotes = {0}".format(total_upvotes / limit))
    avg_up = (float(total_upvotes) / float(total_views)) * 100
    print("On an average %.2f viewers upvoted the answer" % avg_up)
    # How many of these writers made the Top Writers 2016 list?
    with open("top_writers_2016.json", "r") as fstream:
        writer_list = json.load(fstream)
    notw = sum(1 for w in writer_list if w["uname"] in writer)
    print("{0} People on this list are Top Writers(2016)".format(notw))
    sorted_writer = sorted(writer.items(), key=operator.itemgetter(1), reverse=True)
    print("Total number of unique writers is {0}".format(len(sorted_writer)))
    total_followers = 0
    total_answers = 0
    for uname, _count in sorted_writer:
        prof = Profile.get(Profile.uname == uname)
        total_followers += int(prof.followers)
        total_answers += int(prof.total_answers)
    print("Average number of followers of each {0}".format(
        total_followers / len(sorted_writer)))
    print("Average number of answers written by each is {}".format(
        total_answers / len(sorted_writer)))
    # Scatter plot of views against upvotes.
    fig = mpplt.figure(figsize=(10, 10))
    axes = fig.add_subplot(1, 1, 1)
    axes.set_title("Views vs Upvote")
    xs = [a.views for a in answers]
    ys = [a.upvotes for a in answers]
    axes.plot(xs, ys, ".", color="green")
    axes.set_xlabel("Views")
    axes.ticklabel_format(style="sci", axis="x", scilimits=(0, 0))
    axes.ticklabel_format(style="sci", axis="y", scilimits=(0, 0))
    axes.set_xlim([0, 1500000])
    axes.set_ylim([10000, 25000])
    axes.set_ylabel("Upvotes")
    fig.savefig("view_upvote.png", facecolor="white", edgecolor="black")
def addAnswer(self, id, questionId, responseId, content):
    """Persist a new Answer row in the SQLite database.

    Bug fix: the original called session.add(newQuestion) — an undefined
    name — instead of adding the Answer it had just built, so every call
    raised NameError before committing. The session is now also closed in
    a finally block so it is released even when the insert fails.
    """
    engine = create_engine('sqlite:///database.db')
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    newAnswer = Answer(id=id, questionId=questionId,
                       responseId=responseId, content=content)
    try:
        session.add(newAnswer)  # was: session.add(newQuestion) -> NameError
        session.commit()
    finally:
        session.close()
def parse_profile(profile): # Fetching profile resp = get_page(PROFILE_URL.format(profile.uname)) doc = BeautifulSoup(resp, 'html.parser') # Get Answer List and save in database answer_list = doc.find_all('div', class_=ANSWER_LIST_CLASS) for answer in answer_list: question_tag = answer.find('a', class_=QUESTION_LINK_CLASS) Answer.create_or_get(question=question_tag['href'], writer=profile) # Update the profile object with data parsed stats = doc.find_all('span', class_=FEED_NUM_CLASS) if len(stats) < 5: with open('error.html', 'w') as fstream: fstream.write(resp) print "can't find - ", profile.uname, stats profile.total_answers = int(stats[0].string.replace(',', '')) profile.questions = int(stats[1].string.replace(',', '')) profile.followers = int(stats[3].string.replace(',', '')) profile.following = int(stats[4].string.replace(',', '')) element = doc.find('a', class_=ANSWER_VIEW_CLASS) if element is None: profile.views = 0 else: profile.views = int(element.find_all('span')[2].string.replace(',', '')) profile.last_parsed = datetime.datetime.now() profile.save() # Saving the HTML code of the profile filename = profile.uname + '.html' with open(os.path.join(PROFILE_FOLDER, filename), 'w+') as fstream: fstream.write(resp) sys.stdout.write(u'\rDone Parsing = %s %d ' % (profile.uname, len(answer_list))) sys.stdout.flush()
def parse_submit_answer(info_dict):
    """Create a Submit row plus one Answer row per (answer, question) pair,
    then commit everything in a single transaction."""
    submit_obj = Submit(
        survey_id=info_dict['survey_id'],
        submit_ip=info_dict['submit_ip'],
        username=info_dict['username'],
        submit_date=info_dict['submit_date'],
    )
    db.session.add(submit_obj)
    # Flush so the autogenerated index is available before the commit.
    db.session.flush()
    submit_id = submit_obj.index
    # One Answer row per question, pairing answers with questions in order.
    for response, question in zip(info_dict['answers'], info_dict['questions']):
        db.session.add(Answer(
            question_id=question['index'],
            submit_id=submit_id,
            survey_id=info_dict['survey_id'],
            type=question['type'],
            content=json.dumps(response),
        ))
    db.session.commit()
Profile.last_parsed <= old_time).limit(max_crawl - total_parsing) total_parsing += len(old_profiles) print "Number of Profiles to Crawl - ", len(old_profiles) for profile in old_profiles: try: parse_profile(profile) except Exception, err: #pylint-diasble: print err traceback.print_exc(file=sys.stdout) pass if total_parsing >= max_crawl: break if not args.no_answer: # Parse Old Answers old_time = datetime.datetime.now() - datetime.timedelta(days=7) old_answers = Answer.select().where( Answer.last_parsed <= old_time).limit(max_crawl - total_parsing) total_parsing += len(old_answers) print "Number of Answers to Crawl - ", len(old_answers) for answer in old_answers: try: parse_answer(answer) except Exception, err: print err traceback.print_exc(file=sys.stdout) pass print '\n'
old_time = datetime.datetime.now() - datetime.timedelta(days=7) old_profiles = Profile.select().where( Profile.last_parsed <= old_time).limit(max_crawl - total_parsing) total_parsing += len(old_profiles) print "Number of Profiles to Crawl - ", len(old_profiles) for profile in old_profiles: try: parse_profile(profile) except Exception, err: #pylint-diasble: print err traceback.print_exc(file=sys.stdout) pass if total_parsing >= max_crawl: break if not args.no_answer: # Parse Old Answers old_time = datetime.datetime.now() - datetime.timedelta(days=7) old_answers = Answer.select().where( Answer.last_parsed <= old_time).limit(max_crawl - total_parsing) total_parsing += len(old_answers) print "Number of Answers to Crawl - ", len(old_answers) for answer in old_answers: try: parse_answer(answer) except Exception, err: print err traceback.print_exc(file=sys.stdout) pass print '\n'
def best_quora_answers(limit):
    """Rank answers with >= 1000 upvotes by a composite quality index and
    plot the top `limit` as a two-sided bar chart (quality_answers.png).

    The quality index per answer combines:
      * the writer's average views per answer relative to this answer's views,
      * the answer's upvotes relative to the mean upvotes of the pool,
      * an upvote/view conversion term (scaled by 50).
    """
    answers = Answer.select().where(Answer.upvotes >= 1000)
    # NOTE(review): raises ZeroDivisionError if no answer has >= 1000
    # upvotes; also assumes re-iterating `answers` yields a stable order,
    # since answers[item[0]] below indexes by enumeration position.
    avg_upvotes = sum([answer.upvotes for answer in answers])
    avg_upvotes /= float(len(answers))
    print("Average Upvotes = %d" % avg_upvotes)
    print("%d answers to evaluate = " % len(answers))
    view_upvote_ratio = []
    # Each object = [answer_id, view / upvote ratio]
    for idx, answer in enumerate(answers):
        writer = answer.writer
        # Writer's average views per answer.
        view_avg = float(writer.views) / writer.total_answers
        quality_index = 0
        quality_index = view_avg / answer.views
        quality_index += answer.upvotes / avg_upvotes
        quality_index += float(answer.upvotes * 50) / float(answer.views)
        view_upvote_ratio.append([idx, quality_index])
    # Best quality first.
    view_upvote_ratio.sort(key=lambda x: x[1], reverse=True)
    limit = min(limit, len(view_upvote_ratio))
    print ("Printing %d answers" % limit)
    print("Top %d answers on Quora are :" % limit)
    view_upvote_ratio = view_upvote_ratio[:limit]
    total_qi = 0
    total_views = 0
    total_upvotes = 0
    writer = {}          # writer_uname -> number of answers in the top list
    upvote_count = []    # upvotes per top answer, in rank order
    for item in view_upvote_ratio:
        answer = answers[item[0]]
        u = ANSWER_URL.format(answer.question, answer.writer_uname)
        print(u"%s (%.2f, %d, %d)" % (u, item[1], answer.views, answer.upvotes))
        total_qi += item[1]
        total_views += answer.views
        upvote_count.append(answer.upvotes)
        writer[answer.writer_uname] = writer.get(answer.writer_uname, 0) + 1
    # How many of these writers are on the Top Writers 2016 list?
    with open('top_writers_2016.json', 'r') as fstream:
        writer_list = json.load(fstream)
    notw = 0
    for w in writer_list:
        if w['uname'] in writer:
            notw += 1
    print("{0} People on this list are Top Writers(2016)".format(notw))
    sorted_writer = sorted(writer.items(), key=operator.itemgetter(1),
                           reverse=True)
    print sorted_writer[:10]
    print("Total number of unique writers is {0}".format(len(sorted_writer)))
    avg_qi = total_qi / limit
    total_upvotes = sum(upvote_count)
    avg_upvote = total_upvotes / limit
    avg_view = total_views / limit
    print("Average of Quality Index = %.2f" % avg_qi)
    print("Average number of views = %d" % avg_view)
    print("Average number of upvotes = %d" % avg_upvote)
    fig = plt.figure(figsize=(21, 14))
    plt.title("Quality Answers on Quora (Feb 2016)")
    # Scaling quality indices into [0, 1] for the upper bars.
    max_q = max([item[1] for item in view_upvote_ratio])
    min_q = min([item[1] for item in view_upvote_ratio])
    diff_y = max_q - min_q
    ydata = []
    for i in view_upvote_ratio:
        ydata.append((i[1] - min_q) / diff_y)
    xticks = [str(i) for i in range(1, limit + 1)]
    plt.xlabel('Answer Rank')
    plt.ylabel('Quality Index')
    # Scaling Upvote Count into [0, 1] for the (negated) lower bars.
    max_up = max(upvote_count)
    min_up = min(upvote_count)
    diff = max_up - min_up
    for idx, up in enumerate(upvote_count):
        upvote_count[idx] = float(up - min_up) / diff
    # Build symmetric y ticks: positive side labelled with quality values,
    # negative side with upvote counts.
    yvals = []
    yticks = []
    for i in range(1, 10):
        yvals.append(i * 0.1)
        yticks.append("%.2f" % (min_q + (i * 0.1) * diff_y))
        yvals.append(i * -0.1)
        yticks.append(min_up + (i * 0.1) * diff)
    plt.bar(range(limit), ydata, align='center', color='y')
    plt.bar(range(limit), [-up for up in upvote_count], align='center',
            color='g')
    plt.xticks(range(limit), xticks)
    plt.yticks(yvals, yticks)
    # NOTE(review): one hlines call uses colors=, the other color= —
    # matplotlib accepts both, kept as-is.
    plt.hlines(sum(ydata) / float(limit), -0.5, limit - 0.5,
               label='Average Quality Index', colors='b')
    plt.hlines(-sum(upvote_count) / float(limit), -0.5, limit - 0.5,
               label='Average Upvote Count', color='r')
    plt.xlim([-0.5, limit - 0.5])
    #fig.tight_layout()
    plt.ylim([-1.1, 1.1])
    plt.legend()
    plt.tight_layout()
    plt.savefig('quality_answers.png', facecolor='white', edgecolor='black')
def answers_with_max_upvotes(limit=20):
    """Print stats for the `limit` most upvoted answers (Python 2 variant),
    report writer statistics, and plot views vs. upvotes."""
    answers = Answer.select().order_by(Answer.upvotes.desc()).limit(limit)
    print('Top %d Most upvoted answers - ' % limit)
    total_views = 0
    total_upvotes = 0
    writer = {}      # writer_uname -> number of answers in the top list
    max_views = 0    # running maximum of views seen so far
    for answer in answers:
        u = ANSWER_URL.format(answer.question, answer.writer_uname).encode("UTF-8")
        # NOTE(review): answer.views / answer.upvotes raises
        # ZeroDivisionError if an answer in the result has 0 upvotes.
        print("{0} ({1}, {2}, {3})".format(u, answer.upvotes, answer.views,
                                           answer.views / answer.upvotes))
        total_upvotes += answer.upvotes
        total_views += answer.views
        writer[answer.writer_uname] = writer.get(answer.writer_uname, 0) + 1
        if answer.views > max_views :
            # Announce each new views record as it is encountered.
            max_views = answer.views
            print "Max - ", u, answer.views
    print("Total Views = {0}".format(total_views))
    print("Total Upvotes = {0}".format(total_upvotes))
    # NOTE(review): averages divide by `limit` even when fewer rows were
    # returned — confirm that is intentional.
    print("Average Views = {0}".format(total_views / limit))
    print("Average Upvotes = {0}".format(total_upvotes / limit))
    avg_up = (float(total_upvotes) / float(total_views)) * 100
    print("On an average %.2f viewers upvoted the answer" % avg_up)
    # Writer Stat
    with open('top_writers_2016.json', 'r') as fstream:
        writer_list = json.load(fstream)
    notw = 0
    for w in writer_list:
        if w['uname'] in writer:
            notw += 1
    print("{0} People on this list are Top Writers(2016)".format(notw))
    sorted_writer = sorted(writer.items(), key=operator.itemgetter(1),
                           reverse=True)
    print sorted_writer[:10]
    print("Total number of unique writers is {0}".format(len(sorted_writer)))
    total_followers = 0
    total_answers = 0
    for tup in sorted_writer:
        profile = Profile.get(Profile.uname == tup[0])
        total_followers += int(profile.followers)
        total_answers += int(profile.total_answers)
    print("Average number of followers of each {0}".format(
        total_followers / len(sorted_writer)))
    print("Average number of answers written by each is {}".format(
        total_answers / len(sorted_writer)))
    # Plotting Graph
    figure = plt.figure(figsize=(10, 10))
    splt = figure.add_subplot(1, 1, 1)
    splt.set_title("Views vs Upvote")
    splt.plot([answer.views for answer in answers],
              [answer.upvotes for answer in answers], '.',
              color='green')
    splt.set_xlabel('Views')
    splt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
    splt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
    splt.set_xlim([0, 1500000])
    splt.set_ylim([10000, 25000])
    splt.set_ylabel('Upvotes')
    figure.tight_layout()
    figure.savefig('view_upvote.png', facecolor='white', edgecolor='black')
def insert_data(evaluations):
    """Parse scraped evaluation elements and persist Course, Question,
    and Answer rows.

    Fixes vs. original:
      * the course session is closed AFTER add_course() uses it — the
        original called course_session.close() first, then handed the
        already-closed session to add_course;
      * removed a stray parser.get_school(...) call whose result was
        discarded.
    """
    for evaluation in evaluations:
        evaluation_data = {}
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # course info located in first row, first td
        row = evaluation.xpath('.//tr/td')[0]
        evaluation_data["source"] = parser.get_data_source(evaluation)
        evaluation_data["instructor"] = parser.get_instructor_name(row)
        evaluation_data["course_name"] = parser.get_course_name(row)
        evaluation_data["semester_full"] = parser.get_full_semester(row)
        evaluation_data["semester"] = parser.get_semester(
            evaluation_data["semester_full"])
        evaluation_data["year"] = parser.get_year(
            evaluation_data["semester_full"])
        evaluation_data["reg_index"] = parser.get_reg_index(row)
        evaluation_data["course_code"] = parser.get_course_code(row)
        evaluation_data['school'] = parser.get_school(
            evaluation_data["course_code"])
        evaluation_data['department'] = parser.get_department(
            evaluation_data["course_code"])
        evaluation_data['course'] = parser.get_course(
            evaluation_data["course_code"])
        evaluation_data['section'] = parser.get_section(
            evaluation_data["course_code"])
        evaluation_data["enrollments"] = parser.get_num_enrollments(row)
        evaluation_data["responses"] = parser.get_num_responses(row)
        evals = parser.get_eval_rows(evaluation)

        course_session = session()
        course = Course(course_name=evaluation_data["course_name"],
                        year=evaluation_data["year"],
                        semester=evaluation_data["semester"],
                        school=evaluation_data['school'],
                        department=evaluation_data['department'],
                        course=evaluation_data['course'],
                        section=evaluation_data['section'],
                        regindex=evaluation_data["reg_index"],
                        source=evaluation_data['source'],
                        enrollments=evaluation_data['enrollments'],
                        responses=evaluation_data['responses'],
                        instructor=evaluation_data["instructor"],
                        created_at=now)
        course_id = add_course(course_session, course)
        # Close only after add_course() has used the session.
        course_session.close()
        if course_id == -1:
            # add_course signals a duplicate/failed insert with -1.
            print("Skipping course [{}] - {} - {}".format(
                course_id, evaluation_data["course_name"],
                evaluation_data["instructor"]))
        else:
            print("Added course [{}] - {} - {}".format(
                course_id, evaluation_data["course_name"],
                evaluation_data["instructor"]))
        evaluation_data["questions"] = evals
        # NOTE(review): the dumps/loads round-trip deep-copies
        # evaluation_data into plain JSON types; kept as-is.
        evals_json = json.dumps(evaluation_data)
        evals_json = json.loads(evals_json)
        for e in evals_json['questions']:
            q_text = evals_json['questions'][e]['question_text']
            q_type = evals_json['questions'][e]['question_type']
            responses = evals_json['questions'][e]['response']
            question_session = session()
            question = Question(question_text=bytearray(q_text, 'utf-8'),
                                question_type=q_type,
                                created_at=now)
            q_id = add_question(question_session, question)
            question_session.close()
            answer_session = session()
            # Map the (sorted) response buckets onto the five rating
            # columns plus the blank count.
            rate_1, rate_2, rate_3, rate_4, rate_5, blank = 0, 0, 0, 0, 0, 0
            responses = OrderedDict(sorted(responses.items()))
            for indx, inc in enumerate(responses):
                if indx == 0:
                    rate_1 = responses[inc]
                elif indx == 1:
                    rate_2 = responses[inc]
                elif indx == 2:
                    rate_3 = responses[inc]
                elif indx == 3:
                    rate_4 = responses[inc]
                elif indx == 4:
                    rate_5 = responses[inc]
                elif indx == 5:
                    blank = responses[inc]
            answer = Answer(course_id=course_id, question_id=q_id,
                            rating_1=rate_1, rating_2=rate_2, rating_3=rate_3,
                            rating_4=rate_4, rating_5=rate_5, blank=blank,
                            created_at=now)
            add_answer(answer_session, answer)
            answer_session.close()