def parse_answer(answer):
    # Fetch the answer page and parse only the relevant fragment
    resp = get_page(ANSWER_URL.format(answer.question, answer.writer_uname))
    doc = BeautifulSoup(resp, 'html.parser', parse_only=ANSWER_STRAIN)

    # Record the credible users who have upvoted this answer
    users = doc.find('div', class_=CREDIBILITY_CLASS).find_all('a', class_='user')
    for user in users:
        Profile.create_or_get(uname=user['href'].split('/')[2],
                              name=user.string)

    # Update answer stats
    answer.views = int(
        doc.find('div', class_=VIEW_ROW_CLASS).strong.string.replace(',', ''))
    answer.upvotes = int(
        doc.find('a', class_=UPVOTE_ROW_CLASS).strong.string.replace(',', ''))
    answer.last_parsed = datetime.datetime.now()
    answer.save()

    # Saving the HTML of the answer is disabled: storing answers is not feasible.
    # filename = str(answer.id) + '.html'
    # with open(os.path.join(ANSWERS_FOLDER, filename), 'w+') as fstream:
    #     fstream.write(resp)

    sys.stdout.write('\rDone Parsing Answer id %d (%d)' % (answer.id, len(users)))
    sys.stdout.flush()
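# get_page() is called above but defined elsewhere in this project. A minimal
# sketch of what it might look like, assuming the `requests` library; the
# retry count, delay, and timeout below are illustrative, not the project's
# actual values.
import time

import requests


def get_page(url, retries=3, delay=2):
    """Fetch `url` and return its HTML body, retrying on transient errors."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()  # treat 4xx/5xx responses as failures
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries; let the caller see the error
            time.sleep(delay)  # brief back-off before the next attempt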
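# The scraping constants referenced above are defined elsewhere in this
# project. Hypothetical examples of their shape, assuming Quora-style answer
# URLs and that ANSWER_STRAIN is a bs4 SoupStrainer limiting parsing to the
# answer markup; the CSS class names below are placeholders, not the site's
# real selectors.
from bs4 import SoupStrainer

ANSWER_URL = 'https://www.quora.com/{0}/answer/{1}'  # question slug, writer uname
CREDIBILITY_CLASS = 'credible_upvoters'  # placeholder class name
VIEW_ROW_CLASS = 'views_row'             # placeholder class name
UPVOTE_ROW_CLASS = 'upvote_row'          # placeholder class name
ANSWER_STRAIN = SoupStrainer('div', class_='answer_page')  # placeholder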
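# Profile (and a similar Answer model) are peewee models defined elsewhere.
# A sketch of the Profile fields this script relies on, assuming peewee 2.x,
# whose Model.create_or_get() returns an (instance, created) tuple; the
# database file name and the field defaults are assumptions.
import os

import peewee

db = peewee.SqliteDatabase('crawler.db')  # hypothetical database file


class Profile(peewee.Model):
    uname = peewee.CharField(unique=True)  # username, the natural key
    name = peewee.CharField()
    # A very old default makes freshly added profiles immediately "stale",
    # so the week-old filter in the crawl loop picks them up right away.
    last_parsed = peewee.DateTimeField(default=datetime.datetime.min)

    class Meta:
        database = db


# create_directory() is also assumed; likely just an idempotent mkdir
# (exist_ok requires Python 3):
def create_directory(path):
    os.makedirs(path, exist_ok=True)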
                    help='Number of maximum requests to make')
parser.add_argument('--no_profile', action='store_true',
                    help='Do not crawl profiles')
parser.add_argument('--no_answer', action='store_true',
                    help='Do not crawl answers')
args = parser.parse_args()

# Fill the database with Top Writers 2016 and other writers
with open('top_writers_2016.json', 'r') as fstream:
    writer_list = json.load(fstream)
with open('other_writers.json', 'r') as fstream:
    writer_list += json.load(fstream)

create_directory(ANSWERS_FOLDER)
create_directory(PROFILE_FOLDER)

for writer in writer_list:
    new = Profile.create_or_get(uname=writer['uname'],
                                name=writer['name'])[1]
    if new:
        print(u'New Profile %s Created' % writer['uname'])
# print('Number of Writers Added =', len(writer_list))

# Start crawling
total_parsing = 0
max_crawl = args.max_crawl
while total_parsing < max_crawl:
    if not args.no_profile:
        # Re-parse profiles that were last crawled more than a week ago
        old_time = datetime.datetime.now() - datetime.timedelta(days=7)
        old_profiles = Profile.select().where(
            Profile.last_parsed <= old_time).limit(max_crawl - total_parsing)
        total_parsing += len(old_profiles)
        print('Number of Profiles to Crawl -', len(old_profiles))
        for profile in old_profiles: