def process_users(self, users_ids):
    """Fetch Discourse profile data for each user id and store new People rows.

    users_ids: iterable of Discourse usernames; None is a no-op.
    Users already stored or blacklisted are skipped; users whose JSON
    cannot be retrieved/parsed are blacklisted so they are not retried.
    """
    if users_ids is None:
        return
    for user_id in users_ids:
        if user_id in self.users_blacklist:
            continue
        # Skip users already present in the database.
        user = self.session.query(People).filter(
            People.username == user_id).first()
        if user is not None:
            continue
        url = self.url + "/users/" + user_id + ".json"
        logging.info("Getting user " + user_id)
        logging.info(url)
        # NOTE(review): verify=False disables TLS certificate validation.
        stream = requests.get(url, verify=False)
        try:
            parser = JSONParser(unicode(stream.text))
            parser.parse()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            logging.error("Can't get " + user_id + " data")
            self.users_blacklist.append(user_id)
            continue
        user = parser.data['user']
        dbuser = People()
        dbuser.username = user['username']
        dbuser.reputation = user['trust_level']
        dbuser.avatar = user['uploaded_avatar_id']
        dbuser.last_seen_at = user['last_posted_at']
        dbuser.joined_at = user['created_at']
        dbuser.identifier = user['id']
        self.session.add(dbuser)
        self.total_users += 1
        self.session.commit()
    return
def categories(self):
    """Return the list of category dicts exposed by the Discourse API."""
    response = requests.get(self.url + "/categories.json", verify=False)
    logging.info(response.url)
    json_parser = JSONParser(unicode(response.text))
    json_parser.parse()
    return json_parser.data['category_list']['categories']
def process_comments(self, dbpost_ids, kind='question'):
    """Download and store all comments for a ';'-joined list of post ids.

    dbpost_ids: id string accepted by the Stack API (e.g. "1;2;3").
    kind: 'question' or 'answer' -- selects the API endpoint and which
    identifier column is filled in the Comments row.
    Raises ValueError if the API response has no 'has_more' field.
    """
    if kind == 'question':
        base_url = self.url + '/2.2/questions/'
    if kind == 'answer':
        base_url = self.url + '/2.2/answers/'
    base_url += str(dbpost_ids) + '/comments?'
    base_url += 'order=desc&sort=creation&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    logging.debug("Getting comments for " + str(dbpost_ids))
    has_more = True
    page = 1
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.comments
            has_more = False
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        if 'has_more' not in parser.data:
            logging.error("No has_more in JSON response. Exiting.")
            logging.error(parser.data)
            # BUG FIX: a bare `raise` without an active exception fails with
            # a misleading error; raise an explicit one instead.
            raise ValueError("Missing 'has_more' in comments API response")
        has_more = parser.data['has_more']
        page += 1
        if 'items' in parser.data:
            data = parser.data['items']
        else:
            logging.error("No items in comments")
            logging.error(parser.data)
            return
        for comment in data:
            dbcomment = Comments()
            # question or answer identifier
            if kind == "question":
                dbcomment.question_identifier = comment['post_id']
            if kind == "answer":
                dbcomment.answer_identifier = comment['post_id']
            if 'body' in comment.keys():
                dbcomment.body = comment['body']
            if 'user_id' in comment['owner']:
                dbcomment.user_identifier = comment['owner']['user_id']
                if dbcomment.user_identifier not in self.user_ids_comments:
                    self.user_ids_comments.append(dbcomment.user_identifier)
            cdate = datetime.datetime.fromtimestamp(
                int(comment['creation_date']))
            dbcomment.submitted_on = cdate.strftime('%Y-%m-%d %H:%M:%S')
            self.session.add(dbcomment)
            self.total_comments += 1
        # One commit per API page.
        self.session.commit()
def process_answers(self, dbquestion_ids):
    """Get all answers for the ';'-joined list of question ids.

    Stores each answer, records its author (deduplicated) for later user
    processing, and finally downloads the comments for every answer
    retrieved, in pagesize-sized batches.
    """
    has_more = True
    page = 1
    base_url = self.url + '/2.2/questions/' + str(
        dbquestion_ids) + '/answers?'
    base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    logging.debug("Getting answers for dbquestion ids" + str(dbquestion_ids))
    dbanswers_ids = []
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            has_more = False
            data = StackSampleData.answers
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        has_more = parser.data['has_more']
        page += 1
        data = parser.data['items']
        for answer in data:
            dbanswer = Answers()
            dbanswer.identifier = answer['answer_id']
            dbanswers_ids.append(dbanswer.identifier)
            if 'user_id' in answer['owner']:
                dbanswer.user_identifier = answer['owner']['user_id']
                if dbanswer.user_identifier not in self.user_ids_answers:
                    self.user_ids_answers.append(dbanswer.user_identifier)
            dbanswer.question_identifier = answer['question_id']
            create_date = datetime.datetime.fromtimestamp(
                int(answer['creation_date']))
            dbanswer.submitted_on = create_date.strftime('%Y-%m-%d %H:%M:%S')
            dbanswer.votes = answer['score']
            self.session.add(dbanswer)
            self.total_answers += 1
            # BUG FIX: removed a second, unconditional append of
            # dbanswer.user_identifier that defeated the dedup check above
            # and filled user_ids_answers with duplicates.
        self.session.commit()
    # Time to get comments for all answers, pagesize ids per API call.
    while len(dbanswers_ids) > 0:
        ids = []
        for i in range(self.pagesize):
            if len(dbanswers_ids) > 0:
                val = dbanswers_ids.pop()
                if val is not None:
                    ids.append(val)
                else:
                    logging.info("Found None Answer")
        ids = ";".join([str(x) for x in ids])
        self.process_comments(ids, 'answer')
def process_comments(self, dbpost_ids, kind='question'):
    """Download and store all comments for a ';'-joined list of post ids.

    dbpost_ids: id string accepted by the Stack API (e.g. "1;2;3").
    kind: 'question' or 'answer' -- selects the API endpoint and which
    identifier column is filled in the Comments row.
    Raises ValueError if the API response has no 'has_more' field.
    """
    if kind == 'question':
        base_url = self.url + '/2.2/questions/'
    if kind == 'answer':
        base_url = self.url + '/2.2/answers/'
    base_url += str(dbpost_ids) + '/comments?'
    base_url += 'order=desc&sort=creation&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    logging.debug("Getting comments for " + str(dbpost_ids))
    has_more = True
    page = 1
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.comments
            has_more = False
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        if 'has_more' not in parser.data:
            logging.error("No has_more in JSON response. Exiting.")
            logging.error(parser.data)
            # BUG FIX: a bare `raise` without an active exception fails with
            # a misleading error; raise an explicit one instead.
            raise ValueError("Missing 'has_more' in comments API response")
        has_more = parser.data['has_more']
        page += 1
        if 'items' in parser.data:
            data = parser.data['items']
        else:
            logging.error("No items in comments")
            logging.error(parser.data)
            return
        for comment in data:
            dbcomment = Comments()
            # question or answer identifier
            if kind == "question":
                dbcomment.question_identifier = comment['post_id']
            if kind == "answer":
                dbcomment.answer_identifier = comment['post_id']
            if 'body' in comment.keys():
                dbcomment.body = comment['body']
            if 'user_id' in comment['owner']:
                dbcomment.user_identifier = comment['owner']['user_id']
                if dbcomment.user_identifier not in self.user_ids_comments:
                    self.user_ids_comments.append(dbcomment.user_identifier)
            cdate = datetime.datetime.fromtimestamp(
                int(comment['creation_date']))
            dbcomment.submitted_on = cdate.strftime('%Y-%m-%d %H:%M:%S')
            self.session.add(dbcomment)
            self.total_comments += 1
        # One commit per API page.
        self.session.commit()
def process_answers(self, dbquestion_ids):
    """Get all answers for the ';'-joined list of question ids.

    Stores each answer, records its author (deduplicated) for later user
    processing, and finally downloads the comments for every answer
    retrieved, in pagesize-sized batches.
    """
    has_more = True
    page = 1
    base_url = self.url + '/2.2/questions/' + str(
        dbquestion_ids) + '/answers?'
    base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    logging.debug("Getting answers for dbquestion ids" + str(dbquestion_ids))
    dbanswers_ids = []
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            has_more = False
            data = StackSampleData.answers
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        has_more = parser.data['has_more']
        page += 1
        data = parser.data['items']
        for answer in data:
            dbanswer = Answers()
            dbanswer.identifier = answer['answer_id']
            dbanswers_ids.append(dbanswer.identifier)
            if 'user_id' in answer['owner']:
                dbanswer.user_identifier = answer['owner']['user_id']
                if dbanswer.user_identifier not in self.user_ids_answers:
                    self.user_ids_answers.append(dbanswer.user_identifier)
            dbanswer.question_identifier = answer['question_id']
            create_date = datetime.datetime.fromtimestamp(
                int(answer['creation_date']))
            dbanswer.submitted_on = create_date.strftime('%Y-%m-%d %H:%M:%S')
            dbanswer.votes = answer['score']
            self.session.add(dbanswer)
            self.total_answers += 1
            # BUG FIX: removed a second, unconditional append of
            # dbanswer.user_identifier that defeated the dedup check above
            # and filled user_ids_answers with duplicates.
        self.session.commit()
    # Time to get comments for all answers, pagesize ids per API call.
    while len(dbanswers_ids) > 0:
        ids = []
        for i in range(self.pagesize):
            if len(dbanswers_ids) > 0:
                val = dbanswers_ids.pop()
                if val is not None:
                    ids.append(val)
                else:
                    logging.info("Found None Answer")
        ids = ";".join([str(x) for x in ids])
        self.process_comments(ids, 'answer')
def process_users(self, users_ids):
    """Fetch Stack user profiles for a ';'-joined id list and store them.

    users_ids: string of at most self.pagesize user ids joined by ';';
    None is a no-op. Raises ValueError when more ids are passed than fit
    in one API call, or when the API response has no 'has_more' field.
    """
    if users_ids is None:
        return
    if len(users_ids.split(";")) > self.pagesize:
        logging.error("Max ids overcome in process_users " + users_ids)
        raise ValueError("Too many user ids for one API call: " + users_ids)
    base_url = self.url + '/2.2/users/' + str(users_ids) + '?'
    base_url += 'order=desc&sort=reputation&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    has_more = True
    page = 1
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.users
            has_more = False
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        if 'has_more' not in parser.data:
            logging.error("No has_more in JSON response")
            logging.error(parser.data)
            # BUG FIX: a bare `raise` without an active exception fails
            # with a misleading error; raise an explicit one instead.
            raise ValueError("Missing 'has_more' in users API response")
        has_more = parser.data['has_more']
        # BUG FIX: page was never incremented, so page 1 was requested
        # forever whenever the API reported more pages.
        page += 1
        data = parser.data['items']
        for user in data:
            dbuser = People()
            dbuser.username = user['display_name']
            dbuser.reputation = user['reputation']
            if 'profile_image' in user:
                dbuser.avatar = user['profile_image']
            dbuser.last_seen_at = datetime.datetime.fromtimestamp(
                int(user['last_access_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbuser.joined_at = datetime.datetime.fromtimestamp(
                int(user['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbuser.identifier = user['user_id']
            self.session.add(dbuser)
        self.session.commit()
    return
def get_search_tags(self):
    """Return the names of all Stack tags matching self.tags (inname search)."""
    logging.info("Getting all tags based on: " + self.tags)
    url = (self.url + "/2.2/tags?key=" + self.api_key + "&" +
           "order=desc&sort=popular&site=stackoverflow" +
           "&inname=" + str(self.tags))
    raw = StackSampleData.tags if self.debug else self._get_api_data(url)
    parser = JSONParser(unicode(raw))
    parser.parse()
    found_tags = [tag['name'] for tag in parser.data['items']]
    logging.info(found_tags)
    return found_tags
def get_search_tags(self):
    """Collect the names of the Stack tags that match self.tags."""
    logging.info("Getting all tags based on: " + self.tags)
    query = self.url + "/2.2/tags?key=" + self.api_key + "&"
    query += "order=desc&sort=popular&site=stackoverflow"
    query += "&inname=" + str(self.tags)
    if self.debug:
        payload = StackSampleData.tags
    else:
        payload = self._get_api_data(query)
    json_parser = JSONParser(unicode(payload))
    json_parser.parse()
    matching = []
    for entry in json_parser.data['items']:
        matching.append(entry['name'])
    logging.info(matching)
    return matching
def process_users(self, users_ids):
    """Fetch Stack user profiles for a ';'-joined id list and store them.

    users_ids: string of at most self.pagesize user ids joined by ';';
    None is a no-op. Raises ValueError when more ids are passed than fit
    in one API call, or when the API response has no 'has_more' field.
    """
    if users_ids is None:
        return
    if len(users_ids.split(";")) > self.pagesize:
        logging.error("Max ids overcome in process_users " + users_ids)
        raise ValueError("Too many user ids for one API call: " + users_ids)
    base_url = self.url + '/2.2/users/' + str(users_ids) + '?'
    base_url += 'order=desc&sort=reputation&site=stackoverflow&key=' + self.api_key
    base_url += '&' + 'pagesize=' + str(self.pagesize)
    has_more = True
    page = 1
    while has_more:
        url = base_url + "&page=" + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.users
            has_more = False
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        if 'has_more' not in parser.data:
            logging.error("No has_more in JSON response")
            logging.error(parser.data)
            # BUG FIX: a bare `raise` without an active exception fails
            # with a misleading error; raise an explicit one instead.
            raise ValueError("Missing 'has_more' in users API response")
        has_more = parser.data['has_more']
        # BUG FIX: page was never incremented, so page 1 was requested
        # forever whenever the API reported more pages.
        page += 1
        data = parser.data['items']
        for user in data:
            dbuser = People()
            dbuser.username = user['display_name']
            dbuser.reputation = user['reputation']
            if 'profile_image' in user:
                dbuser.avatar = user['profile_image']
            dbuser.last_seen_at = datetime.datetime.fromtimestamp(
                int(user['last_access_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbuser.joined_at = datetime.datetime.fromtimestamp(
                int(user['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbuser.identifier = user['user_id']
            self.session.add(dbuser)
        self.session.commit()
    return
def process_users(self, users_ids):
    """Fetch Discourse profile data for each user id and store new People rows.

    users_ids: iterable of Discourse usernames; None is a no-op.
    Users already stored or blacklisted are skipped; users whose JSON
    cannot be retrieved/parsed are blacklisted so they are not retried.
    """
    if users_ids is None:
        return
    for user_id in users_ids:
        if user_id in self.users_blacklist:
            continue
        # Skip users already present in the database.
        user = self.session.query(People).filter(
            People.username == user_id).first()
        if user is not None:
            continue
        url = self.url + "/users/" + user_id + ".json"
        logging.info("Getting user " + user_id)
        logging.info(url)
        # NOTE(review): verify=False disables TLS certificate validation.
        stream = requests.get(url, verify=False)
        try:
            parser = JSONParser(unicode(stream.text))
            parser.parse()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            logging.error("Can't get " + user_id + " data")
            self.users_blacklist.append(user_id)
            continue
        user = parser.data['user']
        dbuser = People()
        dbuser.username = user['username']
        dbuser.reputation = user['trust_level']
        dbuser.avatar = user['uploaded_avatar_id']
        dbuser.last_seen_at = user['last_posted_at']
        dbuser.joined_at = user['created_at']
        dbuser.identifier = user['id']
        self.session.add(dbuser)
        self.total_users += 1
        self.session.commit()
    return
def process_answers(self, question_slug):
    """Get all answers (posts) for the Discourse question with this slug.

    Downloads the topic JSON, stores every post as an Answers row and
    processes their authors. The topic payload only carries the first
    posts; any remaining post ids from the stream are fetched with a
    second posts.json request.
    """
    def process_answer(answer):
        # Map one Discourse post dict onto an Answers row.
        dbanswer = Answers()
        dbanswer.identifier = answer['id']
        dbanswer.user_identifier = answer['user_id']
        if answer['username'] not in self.user_ids_answers:
            self.user_ids_answers.append(answer['username'])
        dbanswer.question_identifier = question_id
        dbanswer.submitted_on = answer['updated_at']
        dbanswer.votes = answer['score']
        dbanswer.body = answer['cooked']
        self.session.add(dbanswer)
        self.total_answers += 1

    url = self.url + "/t/" + question_slug + ".json"
    logging.info("Getting answers for " + question_slug)
    logging.info(url)
    stream = requests.get(url, verify=False)
    parser = JSONParser(unicode(stream.text))
    try:
        parser.parse()
    except Exception:
        # Narrowed from a bare `except:`; log the payload instead of
        # printing it to stdout.
        logging.error("Cant parse answers for question " + question_slug)
        logging.error(unicode(stream.text))
        return
    question_id = parser.data['id']
    for answer in parser.data['post_stream']['posts']:
        process_answer(answer)
    self.session.commit()
    self.process_users(self.user_ids_answers)

    # If there are more answers than one topic query returns, retrieve
    # the rest explicitly by post id.
    discourse_max_answers_query = 20  # typo fixed; was a magic 20 below too
    if len(parser.data['post_stream']['stream']) > discourse_max_answers_query:
        pending = parser.data['post_stream']['stream']
        # Drop the post ids already delivered in the first response.
        for i in range(0, discourse_max_answers_query):
            pending.pop(0)
        url = self.url + "/t/" + str(question_id) + "/posts.json?"
        for answer_id in pending:
            url += "post_ids%5B%5D=" + str(answer_id) + "&"
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        try:
            parser.parse()
        except Exception:
            logging.error("Cant parse additional answers for question " +
                          question_slug)
            logging.error(url)
            logging.error(unicode(stream.text))
            return
        for answer in parser.data['post_stream']['posts']:
            process_answer(answer)
        self.session.commit()
        self.process_users(self.user_ids_answers)
def process_questions(self, category):
    """Get and store all questions (topics) of a Discourse category.

    Walks the paginated topic list, stores new/changed questions, and
    processes their answers, tags and posting users.
    """
    logging.debug("Processing questions for " + category)

    def update_users(users):
        # Remember every topic poster so their profiles are fetched later.
        for user in users:
            if user['username'] not in self.user_ids_questions:
                self.user_ids_questions.append(user['username'])

    def process_question(question):
        # Map one Discourse topic dict onto a Questions row.
        dbquestion = Questions()
        dbquestion.author_identifier = question['posters'][0]['user_id']
        dbquestion.answer_count = question['reply_count']
        dbquestion.question_identifier = question['id']
        dbquestion.view_count = question['views']
        if question['last_posted_at'] is not None:
            dbquestion.last_activity_at = question['last_posted_at']
        else:
            dbquestion.last_activity_at = question['created_at']
        dbquestion.title = question['title']
        dbquestion.url = question['slug']
        dbquestion.added_at = question['created_at']
        dbquestion.score = question['like_count']
        dbquestion.body = None
        if 'excerpt' in question:
            dbquestion.body = question['excerpt']
        # Additional fields in Discourse not stored: liked, pinned_globally,
        # visible, highest_post_number, unseen, posts_count, bumped_at,
        # bookmarked, archived, archetype, has_summary, pinned, image_url,
        # closed, unpinned, bumped, fancy_title
        if self.question_new_or_changed(dbquestion):
            # Question is new or changed
            self.session.add(dbquestion)
            self.session.commit()
            self.process_answers(question['slug'])
            self.process_dbquestiontags(dbquestion.question_identifier,
                                        category)
            # BUG FIX: removed a dead `update_users = False` assignment that
            # shadowed the update_users() helper and had no effect.
            self.total_questions += 1

    url = self.url + "/c/" + category + ".json"
    stream = requests.get(url, verify=False)
    logging.info(url)  # was a stray debug `print`
    parser = JSONParser(unicode(stream.text))
    parser.parse()
    for question in parser.data['topic_list']['topics']:
        process_question(question)
    update_users(parser.data['users'])
    self.process_users(self.user_ids_questions)
    # Follow pagination until no more_topics_url is provided.
    while 'more_topics_url' in parser.data['topic_list']:
        url = self.url + parser.data['topic_list']['more_topics_url']
        logging.info(url)
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        parser.parse()
        for question in parser.data['topic_list']['topics']:
            process_question(question)
        if 'users' in parser.data:
            update_users(parser.data['users'])
            self.process_users(self.user_ids_questions)
        else:
            logging.info("Questions without users")
            logging.info(parser.data)
    return
def process_answers(self, question_slug):
    """Get all answers (posts) for the Discourse question with this slug.

    Downloads the topic JSON, stores every post as an Answers row and
    processes their authors. The topic payload only carries the first
    posts; any remaining post ids from the stream are fetched with a
    second posts.json request.
    """
    def process_answer(answer):
        # Map one Discourse post dict onto an Answers row.
        dbanswer = Answers()
        dbanswer.identifier = answer['id']
        dbanswer.user_identifier = answer['user_id']
        if answer['username'] not in self.user_ids_answers:
            self.user_ids_answers.append(answer['username'])
        dbanswer.question_identifier = question_id
        dbanswer.submitted_on = answer['updated_at']
        dbanswer.votes = answer['score']
        dbanswer.body = answer['cooked']
        self.session.add(dbanswer)
        self.total_answers += 1

    url = self.url + "/t/" + question_slug + ".json"
    logging.info("Getting answers for " + question_slug)
    logging.info(url)
    stream = requests.get(url, verify=False)
    parser = JSONParser(unicode(stream.text))
    try:
        parser.parse()
    except Exception:
        # Narrowed from a bare `except:`; log the payload instead of
        # printing it to stdout.
        logging.error("Cant parse answers for question " + question_slug)
        logging.error(unicode(stream.text))
        return
    question_id = parser.data['id']
    for answer in parser.data['post_stream']['posts']:
        process_answer(answer)
    self.session.commit()
    self.process_users(self.user_ids_answers)

    # If there are more answers than one topic query returns, retrieve
    # the rest explicitly by post id.
    discourse_max_answers_query = 20  # typo fixed; was a magic 20 below too
    if len(parser.data['post_stream']['stream']) > discourse_max_answers_query:
        pending = parser.data['post_stream']['stream']
        # Drop the post ids already delivered in the first response.
        for i in range(0, discourse_max_answers_query):
            pending.pop(0)
        url = self.url + "/t/" + str(question_id) + "/posts.json?"
        for answer_id in pending:
            url += "post_ids%5B%5D=" + str(answer_id) + "&"
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        try:
            parser.parse()
        except Exception:
            logging.error("Cant parse additional answers for question " +
                          question_slug)
            logging.error(url)
            logging.error(unicode(stream.text))
            return
        for answer in parser.data['post_stream']['posts']:
            process_answer(answer)
        self.session.commit()
        self.process_users(self.user_ids_answers)
def process_questions(self, tag):
    """Get and store all Stack questions labelled with this tag.

    First asks the API for the total number of questions (used only for
    progress reporting), then walks the paginated question list. For
    each page the answers and comments of the stored questions are
    fetched afterwards.
    """
    logging.debug("Processing questions for " + tag)
    has_more = True
    base_url = self.url + '/2.2/questions?'
    base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key + '&'
    base_url += 'tagged=' + tag
    # get total number of questions
    url_total = base_url + '&' + 'pagesize=1&filter=total'
    data = self._get_api_data(url_total)
    # Hack: total not provided in API as a JSON object
    data = json.loads(data)
    total = data['total']
    logging.info('Total number of questions to download: ' + str(total))
    page = 1
    done = 0
    while has_more:
        questions_ids = []  # used to get answers and comments
        url = base_url + '&' + 'pagesize=' + str(
            self.pagesize) + '&' + 'page=' + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.questions
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        data = parser.data['items']
        has_more = parser.data['has_more']
        if self.debug:
            has_more = False
        page += 1
        for question in data:
            # Question fields: is_answered, view_count, tags,
            # last_activity_date, answer_count, creation_date, score,
            # link, accepted_answer_id, owner, title, question_id
            dbquestion = Questions()
            if 'user_id' in question['owner']:
                dbquestion.author_identifier = question['owner']['user_id']
            dbquestion.answer_count = question['answer_count']
            dbquestion.question_identifier = question['question_id']
            dbquestion.view_count = question['view_count']
            if question['last_activity_date'] is not None:
                dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                    int(question['last_activity_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
            else:
                dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                    int(question['creation_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
            dbquestion.title = question['title']
            dbquestion.url = question['link']
            dbquestion.added_at = datetime.datetime.fromtimestamp(
                int(question['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbquestion.score = question['score']
            # Missing fields in Stack
            dbquestion.last_activity_by = None
            dbquestion.body = None  # TODO: we need to get it
            # Additional fields in Stack: is_answered, accepted_answer_id
            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_dbquestiontags(dbquestion.question_identifier,
                                            question['tags'])
                # NOTE(review): only new/changed questions get their
                # answers/comments refreshed below -- confirm intent.
                questions_ids.append(question['question_id'])
                if dbquestion.author_identifier:
                    if dbquestion.author_identifier not in self.user_ids_questions:
                        self.user_ids_questions.append(
                            dbquestion.author_identifier)
            self.total_questions += 1
            done += 1
            if self.total_questions % 10 == 0:
                logging.info("Done: " + str(done) + "/" + str(total))
        # Per-page progress summary.
        logging.info("Done: " + str(done) + "/" + str(total))
        ids = ";".join([str(x) for x in questions_ids])
        if len(ids) > 0:
            # Get all answers for the pagesize questions updated
            self.process_answers(ids)
            # Get all comments for the pagesize questions updated
            self.process_comments(ids)
    return
def process_questions(self, category):
    """Get and store all questions (topics) of a Discourse category.

    Walks the paginated topic list, stores new/changed questions, and
    processes their answers, tags and posting users.
    """
    logging.debug("Processing questions for " + category)

    def update_users(users):
        # Remember every topic poster so their profiles are fetched later.
        for user in users:
            if user['username'] not in self.user_ids_questions:
                self.user_ids_questions.append(user['username'])

    def process_question(question):
        # Map one Discourse topic dict onto a Questions row.
        dbquestion = Questions()
        dbquestion.author_identifier = question['posters'][0]['user_id']
        dbquestion.answer_count = question['reply_count']
        dbquestion.question_identifier = question['id']
        dbquestion.view_count = question['views']
        if question['last_posted_at'] is not None:
            dbquestion.last_activity_at = question['last_posted_at']
        else:
            dbquestion.last_activity_at = question['created_at']
        dbquestion.title = question['title']
        dbquestion.url = question['slug']
        dbquestion.added_at = question['created_at']
        dbquestion.score = question['like_count']
        dbquestion.body = None
        if 'excerpt' in question:
            dbquestion.body = question['excerpt']
        # Additional fields in Discourse not stored: liked, pinned_globally,
        # visible, highest_post_number, unseen, posts_count, bumped_at,
        # bookmarked, archived, archetype, has_summary, pinned, image_url,
        # closed, unpinned, bumped, fancy_title
        if self.question_new_or_changed(dbquestion):
            # Question is new or changed
            self.session.add(dbquestion)
            self.session.commit()
            self.process_answers(question['slug'])
            self.process_dbquestiontags(dbquestion.question_identifier,
                                        category)
            # BUG FIX: removed a dead `update_users = False` assignment that
            # shadowed the update_users() helper and had no effect.
            self.total_questions += 1

    url = self.url + "/c/" + category + ".json"
    stream = requests.get(url, verify=False)
    logging.info(url)  # was a stray debug `print`
    parser = JSONParser(unicode(stream.text))
    parser.parse()
    for question in parser.data['topic_list']['topics']:
        process_question(question)
    update_users(parser.data['users'])
    self.process_users(self.user_ids_questions)
    # Follow pagination until no more_topics_url is provided.
    while 'more_topics_url' in parser.data['topic_list']:
        url = self.url + parser.data['topic_list']['more_topics_url']
        logging.info(url)
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        parser.parse()
        for question in parser.data['topic_list']['topics']:
            process_question(question)
        if 'users' in parser.data:
            update_users(parser.data['users'])
            self.process_users(self.user_ids_questions)
        else:
            logging.info("Questions without users")
            logging.info(parser.data)
    return
def process_questions(self, tag):
    """Get and store all Stack questions labelled with this tag.

    First asks the API for the total number of questions (used only for
    progress reporting), then walks the paginated question list. For
    each page the answers and comments of the stored questions are
    fetched afterwards.
    """
    logging.debug("Processing questions for " + tag)
    has_more = True
    base_url = self.url + '/2.2/questions?'
    base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key + '&'
    base_url += 'tagged=' + tag
    # get total number of questions
    url_total = base_url + '&' + 'pagesize=1&filter=total'
    data = self._get_api_data(url_total)
    # Hack: total not provided in API as a JSON object
    data = json.loads(data)
    total = data['total']
    logging.info('Total number of questions to download: ' + str(total))
    page = 1
    done = 0
    while has_more:
        questions_ids = []  # used to get answers and comments
        url = base_url + '&' + 'pagesize=' + str(
            self.pagesize) + '&' + 'page=' + str(page)
        if not self.debug:
            data = self._get_api_data(url)
        else:
            data = StackSampleData.questions
        parser = JSONParser(unicode(data))
        parser.parse()
        # Expected keys: has_more, items, quota_max, quota_remaining
        data = parser.data['items']
        has_more = parser.data['has_more']
        if self.debug:
            has_more = False
        page += 1
        for question in data:
            # Question fields: is_answered, view_count, tags,
            # last_activity_date, answer_count, creation_date, score,
            # link, accepted_answer_id, owner, title, question_id
            dbquestion = Questions()
            if 'user_id' in question['owner']:
                dbquestion.author_identifier = question['owner']['user_id']
            dbquestion.answer_count = question['answer_count']
            dbquestion.question_identifier = question['question_id']
            dbquestion.view_count = question['view_count']
            if question['last_activity_date'] is not None:
                dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                    int(question['last_activity_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
            else:
                dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                    int(question['creation_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
            dbquestion.title = question['title']
            dbquestion.url = question['link']
            dbquestion.added_at = datetime.datetime.fromtimestamp(
                int(question['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
            dbquestion.score = question['score']
            # Missing fields in Stack
            dbquestion.last_activity_by = None
            dbquestion.body = None  # TODO: we need to get it
            # Additional fields in Stack: is_answered, accepted_answer_id
            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_dbquestiontags(dbquestion.question_identifier,
                                            question['tags'])
                # NOTE(review): only new/changed questions get their
                # answers/comments refreshed below -- confirm intent.
                questions_ids.append(question['question_id'])
                if dbquestion.author_identifier:
                    if dbquestion.author_identifier not in self.user_ids_questions:
                        self.user_ids_questions.append(
                            dbquestion.author_identifier)
            self.total_questions += 1
            done += 1
            if self.total_questions % 10 == 0:
                logging.info("Done: " + str(done) + "/" + str(total))
        # Per-page progress summary.
        logging.info("Done: " + str(done) + "/" + str(total))
        ids = ";".join([str(x) for x in questions_ids])
        if len(ids) > 0:
            # Get all answers for the pagesize questions updated
            self.process_answers(ids)
            # Get all comments for the pagesize questions updated
            self.process_comments(ids)
    return