def crawl_plat_first_letter(shuju_date="2020-01-062020-01-12"):
    """Platform transaction data: backfill Product.first_letter.

    Source: https://shuju.wdzj.com/platdata-1.html

    :param shuju_date: date-range string accepted by the remote endpoint.
    :raises CrawlFailed: when the HTTP status is not 200.
    """
    url = "https://shuju.wdzj.com/plat-data-custom.html"
    form_data = {
        "type": 0,
        "shujuDate": shuju_date
    }
    response = requests.post(url, data=form_data)
    if response.status_code != 200:
        print("crawl failed. (status is not 200)")
        raise CrawlFailed('crawl failed')
    plats_data = response.json()
    # One session for the whole batch (was one per iteration), always closed.
    session = DBSession()
    try:
        for plat_data in plats_data:
            plat_id = plat_data.get('platId')
            wdzj_id = plat_data.get('wdzjPlatId')
            first_letter = plat_data.get('firstLetter')
            if wdzj_id != 0:
                product = session.query(Product).filter_by(plat_id=plat_id).first()
                # Guard: the platform may not exist in products yet
                # (previously crashed with AttributeError on None).
                if product is not None:
                    product.first_letter = first_letter
                    session.commit()
    finally:
        session.close()
class BaseHandler(RequestHandler, ValidationMixin):
    """Base Tornado handler: one DB session per request plus JSON helpers."""

    def prepare(self):
        # Open a session at request start; released in on_finish.
        self.db = DBSession()

    def on_finish(self):
        self.db.close()

    def get_current_user(self):
        # Authentication not implemented; every request is anonymous.
        pass

    @property
    def host(self):
        """Return scheme://host for building absolute URLs.

        Locally the LAN address of en0 is used so devices on the same
        network can reach the dev server; otherwise the request's host.
        """
        if options.env == config.ENV_LOCAL:
            host = netifaces.ifaddresses(
                'en0')[2][0]['addr'] + ":" + options.port
        else:
            host = self.request.host
        return self.request.protocol + "://" + host

    def load_json(self):
        """Load JSON from the request body and store them in
        self.request.arguments, like Tornado does by default for POSTed
        form parameters.

        If JSON cannot be decoded, raises an HTTPError with status 400.
        """
        try:
            self.request.arguments = json.loads(self.request.body)
            logging.info("request arguments:{}".format(self.request.arguments))
        except (ValueError, TypeError):
            # Was a bare except: narrow to the decode failures json.loads raises.
            raise HTTPError(400, "Problems parsing JSON")
def create_blog_api():
    """Create a blog post from the POSTed form (admin only).

    Returns 'ok' on success; non-admin or anonymous users are redirected
    to the login page.
    """
    user = g.get('user', None)
    if user is None or not user.admin:
        return redirect(url_for('login'))
    name = request.form['name'].encode('utf8')
    summary = request.form['summary'].encode('utf8')
    content = request.form['content'].encode('utf8')
    user_id = request.form['user_id'].encode('utf8')
    user_name = request.form['user_name'].encode('utf8')
    user_image = request.form['user_image'].encode('utf8')
    sess = DBSession()
    try:
        blog = Blogs(user_id=user_id, user_name=user_name,
                     user_image=user_image, name=name,
                     summary=summary, content=content)
        sess.add(blog)
        sess.commit()
    finally:
        # Previously leaked the session if commit raised.
        sess.close()
    return 'ok'
def _cleanup(self):
    """Delete Data rows older than TIMEOUT_THRESHOLD seconds."""
    now = time()
    session = DBSession()
    try:
        # NOTE(review): relies on SQLAlchemy rendering `now - Data.timestamp`
        # as a SQL expression -- confirm the column type supports arithmetic.
        session.query(Data).filter(
            now - Data.timestamp > TIMEOUT_THRESHOLD).delete()
        session.commit()
    finally:
        # Previously leaked the session if the delete/commit raised.
        session.close()
def crawl_plat_overview(first_letter):
    """Crawl the platform-archive overview page for one platform.

    Scrapes registered capital (and paid-in capital), bank depository and
    bid-guarantee fields from https://www.wdzj.com/dangan/<first_letter>/
    and stores them as one PlatOverview row.

    :param first_letter: path segment identifying the platform's archive page.
    """
    # url = "https://www.wdzj.com/dangan/pjs/"
    url = "https://www.wdzj.com/dangan/{first_letter}/".format(
        first_letter=first_letter
    )
    print("crawl plat {}".format(first_letter))
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print('crawl failed: code: {}, url: {}'.format(response.status_code, url))
        return
        # raise CrawlFailed('crawl failed!')
    encode_response(response)
    html = etree.HTML(response.text)
    try:
        plat_name = html.xpath("//div[@class='title']/h1|h2")[0].text
        print("plat name: {}".format(plat_name))
        # Registered capital (paid-in capital) / bank depository / bid guarantee
        # box = html.xpath("//div[@class='zzfwbox'] | //div[@class='bgbox-bt zzfwbox']")
        try:
            zczj = html.xpath("//div[@class='zzfwbox']/dl[1]/dd[1]//div[@class='r']")[0].text.strip().split()
        except IndexError:
            # Fallback for the alternate page layout (different wrapper class).
            zczj = html.xpath("// div[ @class ='bgbox-bt zzfwbox'] // dl[1] / dd[1] // div[@ class ='r']")[0].text.strip().split()
            # // div[ @class ='bgbox-bt zzfwbox'] // dl[1] / dd[1] // div[@ class ='r']
        if len(zczj) == 2:
            # Two tokens: "<registered> (paid-in:<value>)" -- split them apart.
            zczj_value, sjzj = zczj
            sjzj_value = sjzj.strip('()').split('：')[1]
        else:
            # NOTE(review): here zczj is still the token *list*, so zczj_value
            # is a list rather than a string -- confirm downstream handling.
            zczj_value = zczj
            sjzj_value = '-'
        try:
            yhcg_value = html.xpath("//div[@class='zzfwbox']/dl[1]/dd[2]//div[@class='r']")[0].text.strip()
        except IndexError:
            yhcg_value = html.xpath("//div[ @class ='bgbox-bt zzfwbox']//dl[1]/dd[2]//div[@class='r']")[0].text.strip()
        try:
            tbbz_value = html.xpath("//div[@class='zzfwbox']/dl[2]/dd[3]//div[@class='r']")[0].text.strip()
        except IndexError:
            tbbz_value = html.xpath("//div[ @class ='bgbox-bt zzfwbox']//dl[2]/dd[3]//div[@class='r']")[0].text.strip()
        plat_overview = dict(
            plat_name=plat_name,
            zhucezijin=zczj_value,
            shijiaozijin=sjzj_value,
            yinhangcunguan=yhcg_value,
            toubiaobaozhang=tbbz_value
        )
    except AttributeError as ex:
        print('crawl failed: ex: {}, url: {}'.format(str(ex), url))
        raise
    except IndexError as ex:
        print('crawl failed: ex: {}, url: {}'.format(str(ex), url))
        raise
    new_plat_overview = PlatOverview(**plat_overview)
    session = DBSession()
    session.add(new_plat_overview)
    session.commit()
    session.close()
def emailExists(email):
    """Return True when a User row with *email* exists."""
    mySession = DBSession()
    try:
        # `is not None` idiom; session always closed (was leaked on error).
        return mySession.query(User).filter_by(user_email=email).first() is not None
    finally:
        mySession.close()
def getUserPassword(user):
    """Return the decoded password for *user*, or "" when unknown."""
    mySession = DBSession()
    try:
        result = mySession.query(User).filter_by(user_name=user).first()
        return decodeData(result.user_password) if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def otherUserHasEmail(user, email):
    """Return True when some *other* user already owns *email*."""
    mySession = DBSession()
    try:
        result = mySession.query(User).filter(
            User.user_name != user).filter_by(user_email=email).first()
        return result is not None
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def userExists(user):
    """Return True when a User row with *user* as user_name exists."""
    mySession = DBSession()
    try:
        return mySession.query(User).filter_by(user_name=user).first() is not None
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def crawl_all_plats_info():
    """Crawl detail info for every known platform (one crawl per Product)."""
    session = DBSession()
    try:
        for product in session.query(Product).all():
            crawl_plat_info(product.wdzj_id)
    finally:
        # Previously leaked the session if any crawl raised.
        session.close()
def getSectorList():
    """Return all sectors as [{"code": str, "name": str}, ...]."""
    mySession = DBSession()
    try:
        return [{"code": str(row.sector_cod), "name": row.sector_name}
                for row in mySession.query(Lkpsector).all()]
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def getUserWithKey(key):
    """Return the user_name owning API key *key*, or "" when not found."""
    mySession = DBSession()
    try:
        result = mySession.query(User).filter_by(user_apikey=key).first()
        return result.user_name if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def crawl_all_plats_overview():
    """Crawl the overview page for every known platform."""
    session = DBSession()
    try:
        for product in session.query(Product).all():
            crawl_plat_overview(product.first_letter)
    finally:
        # Previously leaked the session if any crawl raised.
        session.close()
def getCountryName(cnty_cod):
    """Return the country name for code *cnty_cod*, or "" when unknown."""
    mySession = DBSession()
    try:
        result = mySession.query(Lkpcountry).filter_by(cnty_cod=cnty_cod).first()
        return result.cnty_name if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def getSectorName(sector_cod):
    """Return the sector name for code *sector_cod*, or "" when unknown."""
    mySession = DBSession()
    try:
        result = mySession.query(Lkpsector).filter_by(sector_cod=sector_cod).first()
        return result.sector_name if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def getAPIKey(self):
    """Return the API key of the user named by self.login, or ""."""
    mySession = DBSession()
    try:
        result = mySession.query(userModel).filter_by(user_name=self.login).first()
        return result.user_apikey if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def getUserData(user):
    """Return a User value object for an *active* account, or None.

    The password slot is deliberately left empty.
    """
    mySession = DBSession()
    try:
        row = mySession.query(userModel).filter_by(
            user_name=user).filter_by(user_active=1).first()
        if row is None:
            return None
        return User(row.user_name, "", row.user_fullname, row.user_organization,
                    row.user_email, row.user_cnty, row.user_sector, row.user_about)
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def getAPILog(requestID):
    """Return the log_id for the API request UUID, or "" when unknown."""
    mySession = DBSession()
    try:
        result = mySession.query(Apilog).filter_by(log_uuid=requestID).first()
        return result.log_id if result is not None else ""
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
def get_patient_id(mac_address):
    """Return JSON with the id of the patient matching *mac_address*
    (compared upper-case), or JSON null when no patient matches.
    """
    session = DBSession()
    try:
        patient = session.query(Patient).filter_by(
            mac_address=mac_address.upper()).first()
        # Local renamed from `id` to avoid shadowing the builtin.
        patient_id = patient.id if patient is not None else None
    finally:
        # Previously leaked the session if the query raised.
        session.close()
    return jsonify(patient_id)
def save_data(self, data):
    """Persist *data* (a dict of Data columns) with the current timestamp.

    The caller's dict is copied, never mutated.
    """
    record = data.copy()
    record['timestamp'] = time()
    session = DBSession()
    try:
        session.add(Data(**record))
        session.commit()
    finally:
        # Previously leaked the session if commit raised.
        session.close()
def getAPIInfo(logID):
    """Return {"log_datetime", "log_ip"} for log row *logID*, {} if absent."""
    res = {}
    mySession = DBSession()
    try:
        result = mySession.query(Apilog).filter_by(log_id=logID).first()
        if result is not None:
            res["log_datetime"] = result.log_datetime
            res["log_ip"] = result.log_ip
    finally:
        # Previously leaked the session if the query raised.
        mySession.close()
    return res
def update_patient(id):
    """Apply the JSON request body as attribute updates on patient *id*.

    Raises (propagates) if no patient with *id* exists (`.one()`).
    """
    content = request.get_json(silent=True)
    session = DBSession()
    try:
        patient = session.query(Patient).filter_by(id=id).one()
        # .items() works on both py2/py3 (was py2-only .iteritems());
        # tolerate an empty/absent JSON body instead of crashing on None.
        for key, value in (content or {}).items():
            setattr(patient, key, value)
        session.add(patient)  # session.add returns None; don't bind it
        session.commit()
    finally:
        # Previously leaked the session if the query/commit raised.
        session.close()
    return jsonify()
def fetch_action(cls, id):
    """Return the Action with *id* as a dict; {} when missing or on DB error."""
    try:
        session = DBSession()
        try:
            action = session.query(Action).filter(Action.id == id).first()
        finally:
            session.close()
    except IntegrityError as error:
        logging.error(error)
        return {}
    if action is None:
        # Previously crashed with AttributeError on a missing id.
        return {}
    return action.to_dict()
def addToLog(log_user, log_type, log_message):
    """Persist one activity-log row; failures are rolled back silently
    (best-effort logging, as before)."""
    db = DBSession()
    entry = Activitylog(log_user, log_type, log_message)
    try:
        transaction.begin()
        db.add(entry)  # Add the new log to MySQL
        transaction.commit()
    except:
        transaction.abort()
    db.close()
def before_req():
    """Populate flask.g.user from the 'huusession' cookie, best effort.

    When the cookie is missing/empty nothing is set; when lookup fails
    g.user is set to None.
    """
    cookie = request.cookies.get('huusession')
    if not cookie:
        return
    user_id = cookie.split('-')[0]
    if not user_id:
        return
    try:
        db = DBSession()
        g.user = db.query(Users).filter(Users.id == user_id).one()
        db.close()
    except:
        g.user = None
def getCountryList():
    """Return all countries as [{"code": ..., "name": ...}] dicts.

    Names are transcoded from cp1252; rows whose name cannot be converted
    fall back to the name "Unknown".  (Python 2 code: uses `unicode`.)
    """
    countries = []
    mySession = DBSession()
    results = mySession.query(Lkpcountry).all()
    for result in results:
        try:
            # NOTE(review): assumes cnty_name is a cp1252-encoded byte string;
            # the outer unicode() re-decodes the utf-8 bytes -- confirm the
            # DB charset before touching this.
            name = unicode(result.cnty_name.decode("cp1252").encode("utf-8"))
            countries.append({"code": result.cnty_cod, "name": name})
        except:
            countries.append({"code": result.cnty_cod, "name": "Unknown"})
    mySession.close()
    return countries
def getStats(currUser=None):
    """Return dashboard counters: total users plus feed/model counts.

    Both branches of the original currUser check produced the same zeros,
    so the dead branch is collapsed; the counts remain placeholders.
    """
    data = {}
    mySession = DBSession()
    try:
        data["totUsers"] = mySession.query(User).count()
    finally:
        # Previously leaked the session if count() raised.
        mySession.close()
    # TODO: feed/model counting not implemented yet (always 0 for any user).
    data["totFeeds"] = 0
    data["totModels"] = 0
    return data
def changeUserPassword(user, password):
    """Update the stored (encoded) password for *user*.

    Returns True on success, False when the transaction fails.
    """
    db = DBSession()
    succeeded = True
    try:
        transaction.begin()
        db.query(User).filter_by(user_name=user).update(
            {"user_password": encodeData(password)})
        transaction.commit()
    except:
        transaction.abort()
        succeeded = False
    db.close()
    return succeeded
def update_action_exist(cls, id, url):
    """Return True when another Action (different id) already uses *url*.

    Returns False when none exists or on a DB integrity error.
    """
    try:
        session = DBSession()
        try:
            actions = session.query(Action).filter(Action.url == url).filter(
                Action.id != id).all()
        finally:
            # Previously leaked the session if the query raised.
            session.close()
        if actions:
            return True
    except IntegrityError as error:
        logging.error(error)
        return False
    return False
def addAPILog(ipaddress, user, requestID, inputData):
    """Persist one API-log row.

    Returns True on success, False when the transaction fails.
    """
    db = DBSession()
    row = Apilog(ipaddress, user, requestID, inputData)
    succeeded = True
    try:
        transaction.begin()
        db.add(row)
        transaction.commit()
    except:
        transaction.abort()
        succeeded = False
    db.close()
    return succeeded
def start_shuffle():
    """Run the Secret-Santa draw for every group whose shuffle date arrived.

    For each active, not-yet-shuffled group: empty and single-member groups
    are marked done with an apology message; otherwise members are shuffled
    into a gift ring (each member sends to the next, last sends to first)
    and every participant is told their recipient.
    """
    random.seed()
    session = DBSession()
    groups = session.query(Group).filter(
        Group.date_shuffle <= datetime.datetime.now().date(),
        Group.shuffle_done == False,
        Group.active == True).all()
    for group in groups:
        if len(group.members) == 0:
            # No participants: mark done and notify the owner.
            group.shuffle_done = True
            session.commit()
            send_message(
                group.owner.telegramid,
                'К сожалению, в вашей группе "{}" нет ни одного участника. Некому высылать подарки'
                .format(group.name))
        elif len(group.members) == 1:
            # A single participant cannot exchange gifts.
            group.shuffle_done = True
            session.commit()
            send_message(
                group.owner.telegramid,
                'К сожалению в вашей группе "{}" всего один учатник. Некому высылать подарки.'
                .format(group.name))
            send_message(
                group.members[0].user.telegramid,
                'Вы единственный участник группы {}. Подарите себе что-нибудь приятное'
                .format(group.name))
        elif len(group.members) > 1:
            # Shuffle a copy and chain members into a ring via send_to.
            member_list = group.members[:]
            random.shuffle(member_list)
            member_list[-1].send_to = member_list[0].user
            for i in range(len(member_list) - 1):
                member_list[i].send_to = member_list[i + 1].user
            group.shuffle_done = True
            # Commit before messaging so the assignment is durable even if
            # a send_message call fails.
            session.commit()
            for member in group.members:
                # Look up the recipient's Member row for address/suggestions.
                to_member = session.query(Member).filter(
                    Member.group == group,
                    Member.user == member.send_to).first()
                text = '''Распределение получателей для группу {} завершено!\nВы Санта для {}. Пожелания к подарку: {} подарок высылать по следующему адресу: {} {}. На имя {}'''.format(
                    group.name, to_member.user.name, to_member.suggestions,
                    to_member.user.index, to_member.user.address,
                    to_member.user.fio)
                send_message(member.user.telegramid, text)
            send_message(
                group.owner.telegramid,
                'Распределение получателей для группу {} завершено! Участников: {}.\ Всем участникам разосланы их получатели.'.format(
                    group.name, len(group.members)))
    session.close()
    save_to_log('system', comment_text="Shuffle done")
def add(cls, name, url):
    """Insert a new Action(name, url).

    Returns False when the commit violates a DB constraint, True otherwise.
    """
    ret = True
    session = DBSession()
    record = Action(name, url)
    session.add(record)
    try:
        session.commit()
        logging.info('add action success<name=%s, url=%s>' % (name, url))
    except IntegrityError as error:
        logging.error(str(error))
        ret = False
    finally:
        # Previously leaked the session when commit raised IntegrityError.
        session.close()
    return ret
def delete_blog_api():
    """Delete the blog given by ?id= (admin only).

    Returns 'ok' on success; non-admin or anonymous users are redirected
    to the login page.
    """
    user = g.get('user', None)
    if user is None or not user.admin:
        return redirect(url_for('login'))
    blog_id = request.args.get('id', '')
    sess = DBSession()
    try:
        # .delete() returns a row count; the old bound variable was unused.
        sess.query(Blogs).filter(Blogs.id == blog_id).delete()
        sess.commit()
    finally:
        # Previously leaked the session if delete/commit raised.
        sess.close()
    return 'ok'
def create_patient():
    """Create a Patient (optionally with mac_address from the JSON body).

    Returns the new patient's id as JSON.
    """
    content = request.get_json(silent=True)
    session = DBSession()
    # Keep attributes readable after close so patient.id survives.
    session.expire_on_commit = False
    patient = Patient()
    if content and ("mac_address" in content):
        patient.mac_address = content["mac_address"].upper()
    try:
        session.add(patient)  # add() returns None; don't bind it
        session.commit()
        patient_id = patient.id  # assigned by the DB on commit
    finally:
        # Previously leaked the session if commit raised.
        session.close()
    return jsonify(patient_id)
def updateProfile(user,data):
    """Update the profile columns of *user* from the *data* dict.

    Returns True on success, False when the transaction fails.
    (Python 2 code: `except Exception, e` / print statement.)
    """
    mySession = DBSession()
    try:
        transaction.begin()
        mySession.query(User).filter_by(user_name = user).update({"user_fullname": data["user_fullname"],"user_organization": data["user_organization"], "user_email": data["user_email"],"user_cnty": data["user_cnty"],"user_sector": data["user_sector"], "user_about": data["user_about"]})
        transaction.commit()
        mySession.close()
        return True
    except Exception, e:
        # Best effort: log the failure and roll back.
        print str(e)
        transaction.abort()
        mySession.close()
        return False
def addUser(userData):
    """Create a new User row from the *userData* dict.

    The password is stored encoded and a fresh UUID4 becomes the API key.
    Returns (True, "") on success or (False, error_message) on failure.
    (Python 2 code: `except Exception, e`.)
    """
    mySession = DBSession()
    newUser = User(userData["user_name"],userData["user_fullname"],encodeData(userData["user_password"]),userData["user_organization"],userData["user_email"], str(uuid.uuid4()),userData["user_cnty"],userData["user_sector"],"")
    try:
        transaction.begin()
        mySession.add(newUser) #Add the new user to MySQL
        transaction.commit()
        mySession.close()
        return True,""
    except Exception, e:
        transaction.abort()
        mySession.close()
        return False,str(e)
def vote_article(self, artId):
    """Vote for article *artId* via the remote API and record the outcome."""
    payload = {
        "userId": self.userId,
        "accessToken": self.accessToken,
        "artId": artId
    }
    # NOTE(review): verify=False disables TLS certificate checking --
    # confirm this is intentional for this endpoint.
    r = requests.post(VOTE_ARTICLE_API, payload, verify=False)
    result = r.json()
    vote = VoteRecord(artId=artId, result=result['res'],
                      message=result['resMsg'])
    session = DBSession()
    try:
        session.add(vote)
        session.commit()
    finally:
        # Previously leaked the session if commit raised.
        session.close()
def save_to_log(from_who='user', message_type=None, message=None,
                comment_text='', msg_text=''):
    """Save an entry to the log.  Pass *from_who* carefully.

    from_who -- 'bot', 'user' or 'system': who the entry is from; any other
                value is stored as 'need_help' with the original appended to
                the comment.
    message  -- the incoming message object (used for user id / text).
    comment_text -- extra free text.
    msg_text -- message text; use this to store the bot's reply to a user
                message.

    Examples:
        save_to_log('user', message) -- store a user message.
        save_to_log('system', comment_text=err_text) -- store a system
            message, e.g. an error.
        save_to_log('bot', message=message_from_user, msg_text=bot_msg_text)
            -- store a bot reply to a user message.
    """
    if from_who not in ('bot', 'user', 'system'):
        comment_text += ' ' + from_who
        from_who = 'need_help'
    operation = None
    tid = None
    session = DBSession()
    if message:
        tid = message.from_user.id
        if from_who == 'user':
            # Capture the user's text or contact card as the log text.
            if message.content_type == 'text':
                msg_text = message.text
            if message.content_type == 'contact':
                msg_text = str(message.contact)
        # Current conversational state of this telegram user, if any.
        operation = session.query(Operation).filter_by(telegramid=tid).first()
    # NOTE(review): indentation reconstructed -- a fresh Operation() supplies
    # default state fields when no message/operation is available; confirm
    # against the original layout.
    if operation is None:
        operation = Operation()
    log = Log(datetime=datetime.datetime.now(), from_who=from_who,
              user_id=tid, msg_text=msg_text, msg_type=message_type,
              operation=operation.current_operation,
              status=operation.operation_status,
              additional_info=operation.additional_info_db,
              # Name of the calling function, for traceability.
              function=inspect.stack()[1][3],
              comment=comment_text)
    session.add(log)
    session.commit()
    session.close()
def delete(cls, id):
    """Delete the Action with *id*.

    Returns False when the action is missing or the delete violates a
    constraint, True otherwise.
    """
    ret = True
    session = DBSession()
    try:
        actions = session.query(Action).filter(Action.id == id).all()
        if actions:
            try:
                session.delete(actions[0])
                session.commit()
            except IntegrityError as error:
                logging.error(error)
                ret = False
        else:
            logging.error('Action not found')
            ret = False
    finally:
        # Previously the session leaked on every non-success path.
        session.close()
    return ret
def crawl_problem_plats():
    """Problem platforms: https://shuju.wdzj.com/problem-1.html

    Fetches the full problem-platform listing, upserts each platform into
    `products` and stores a ProblemPlat row per entry.
    """
    url = "https://shuju.wdzj.com/problem-list-all.html"
    params = {"year": ""}
    response = requests.get(url, params=params, headers=HEADERS)
    json_data = response.json()
    problem_plats = json_data.get('problemList')
    # One session for the whole batch (was one per iteration).
    session = DBSession()
    try:
        for problem_plat in problem_plats:
            plat_id = problem_plat.get('platId')
            wdzj_id = problem_plat.get('wdzjPlatId')
            plat_name = problem_plat.get('platName')
            if wdzj_id != 0:
                # Bound parameters instead of str.format: plat_name comes
                # from the network and was injectable.  (Wrap the SQL in
                # sqlalchemy.text() if running on SQLAlchemy >= 1.4.)
                session.execute(
                    """
                    INSERT INTO products (plat_id, wdzj_id, name)
                    SELECT :plat_id, :wdzj_id, :plat_name
                    WHERE NOT EXISTS
                        (SELECT * FROM products WHERE plat_id = :plat_id);
                    """,
                    {"plat_id": plat_id, "wdzj_id": wdzj_id,
                     "plat_name": plat_name}
                )
            new_problem_plat = ProblemPlat(
                plat_id=plat_id,
                wdzj_id=wdzj_id,
                plat_name=plat_name,
                area=problem_plat.get('area'),              # region
                oneline_time=problem_plat.get('onlineTime'),  # launch date
                problem_date=problem_plat.get('problemTime'),  # incident date
                event_type=problem_plat.get('type'),        # incident type
                people_num=problem_plat.get('peopleNumber'),
                status1=problem_plat.get('status1'),        # reserved
                status2=problem_plat.get('status2')         # reserved
            )
            session.add(new_problem_plat)
            session.commit()
    finally:
        session.close()
def authenticate_api():
    """Check email/password from the POSTed form.

    On success returns a response carrying the 'huusession' cookie;
    otherwise returns 'emailError' or 'passwordError'.
    """
    email = request.form['email'].encode('utf8')
    passwd = request.form['passwd'].encode('utf8')
    sess = DBSession()
    try:
        user = sess.query(Users).filter(Users.email == email).one()
    except Exception:
        return 'emailError'
    finally:
        # Previously the session leaked on the emailError path.
        sess.close()
    s = '%s:%s' % (email, passwd)
    # NOTE(review): unsalted SHA-1 password hashing is weak -- consider
    # migrating to bcrypt/scrypt.
    passwd = hashlib.sha1(s.encode('utf8')).hexdigest()
    if passwd == user.passwd:
        cookie_str = '%s-%s-%s' % (user.id, user.email, user.passwd)
        L = [user.id, hashlib.sha1(cookie_str.encode('utf8')).hexdigest()]
        resp = make_response()
        resp.set_cookie('huusession', '-'.join(L))
        return resp
    else:
        return 'passwordError'
def queryAllDesc(table_class, offset=None, limit=None):
    """Return rows of *table_class* newest-first as a list of dicts.

    When offset/limit are given the window is applied after ordering by
    create_at descending.  Password fields are masked before serializing.
    """
    sess = DBSession()
    try:
        query = sess.query(table_class).order_by(table_class.create_at.desc())
        # Duplicated branches collapsed: apply the window only when requested.
        if offset is not None or limit is not None:
            query = query.offset(offset).limit(limit)
        rows = query.all()
    finally:
        # Previously leaked the session if the query raised.
        sess.close()
    for row in rows:
        # NOTE(review): assumes table_class has a passwd column -- confirm
        # for every table this is called with.
        row.passwd = '******'
    return [row.to_dict() for row in rows]
def getUserInfo(userid):
    """Return profile fields plus country/sector names for *userid* as a dict.

    Returns {} when the user is unknown.
    """
    mySession = DBSession()
    # Bound parameter (pyformat, as used by the MySQL DBAPI) instead of
    # string concatenation: userid was injectable.
    sql = "SELECT user_fullname, user_organization, user_email, user_about,lkpcountry.cnty_name,lkpsector.sector_name FROM " \
          "user,lkpcountry,lkpsector WHERE user_cnty = lkpcountry.cnty_cod AND user_sector = lkpsector.sector_cod AND user_name = %(userid)s"
    connection = mySession.connection()
    try:
        results = connection.execute(sql, {"userid": userid})
        userInfo = {}
        for result in results:
            userInfo["user_fullname"] = result[0]
            userInfo["user_organization"] = result[1]
            userInfo["user_email"] = result[2]
            userInfo["user_about"] = result[3]
            userInfo["cnty_name"] = result[4]
            userInfo["sector_name"] = result[5]
    finally:
        # Previously leaked connection+session if execute raised.
        connection.close()
        mySession.close()
    return userInfo
def create_comments_api():
    """Create a comment from the POSTed form; requires a logged-in user.

    Returns 'ok' on success or 'notlogin' for anonymous callers.
    """
    user = g.get('user', None)
    if user is None:
        return 'notlogin'
    blog_id = request.form['blog_id']
    user_id = request.form['user_id']
    user_name = request.form['user_name']
    user_image = request.form['user_image']
    content = request.form['content']
    sess = DBSession()
    try:
        sess.add(
            Comments(blog_id=blog_id, user_id=user_id, user_name=user_name,
                     user_image=user_image, content=content))
        sess.commit()
    finally:
        # Previously leaked the session if commit raised.
        sess.close()
    return 'ok'
def crawl_plat_detail(plat_id):
    """Crawl the platform index/detail page and store it as a PlatDetail row.

    Page: https://www.wdzj.com/zhishu/detail-<plat_id>.html
    Label/value pairs scraped from the page are keyed by the pinyin of the
    Chinese label before being passed to PlatDetail.

    :raises CrawlFailed: when the HTTP status is not 200.
    """
    url = "https://www.wdzj.com/zhishu/detail-{plat_id}.html".format(
        plat_id=plat_id
    )
    print("crawl plat {}".format(plat_id))
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print('crawl failed: code: {}, url: {}'.format(response.status_code, url))
        raise CrawlFailed('crawl failed!')
        # raise CrawlFailed('crawl failed!')
    encode_response(response)
    html = BeautifulSoup(response.text, features='lxml')
    x_html = etree.HTML(response.text)
    try:
        plat_name = x_html.xpath("//div[@class='title']/h1|h2")[0].text
        # Strip nested tags so each div's .text is only the bare label/value.
        for div in html.select('.fr .xlist li div'):
            for child in div.children:
                if isinstance(child, Tag):
                    child.extract()
        texts = list(reversed([div.text.strip() for div in html.select('.fr .xlist li div')]))
    except AttributeError as ex:
        print('crawl failed: ex: {}, url: {}'.format(str(ex), url))
        raise
    except IndexError as ex:
        print('crawl failed: ex: {}, url: {}'.format(str(ex), url))
        raise
    # The reversed flat list alternates key/value, so pair adjacent items.
    results = dict(zip(texts[0::2], texts[1::2]))
    trans_results = {}
    # Convert the Chinese labels into pinyin keys.
    for k, v in results.items():
        trans_results[''.join(lazy_pinyin(k))] = v
    trans_results['plat_id'] = plat_id
    trans_results['plat_name'] = plat_name
    new_detail = PlatDetail(**trans_results)
    session = DBSession()
    session.add(new_detail)
    session.commit()
    session.close()
def edit_blog():
    """Render the edit page for the blog given by ?id= (admin only).

    Redirects to manage_blogs when no id is supplied and to login for
    non-admin or anonymous users.
    """
    blog_id = request.args.get('id', '')
    if not blog_id:
        return redirect(url_for('manage_blogs'))
    try:
        sess = DBSession()
        try:
            blog = sess.query(Blogs).filter(Blogs.id == blog_id).one()
        finally:
            # Previously leaked the session when the query raised.
            sess.close()
    except Exception:
        blog = None
    user = g.get('user', None)
    if user is None or not user.admin:
        return redirect(url_for('login'))
    return render_template('manage_blogs_edit.html', user=user, blog=blog)
def edit(cls, id, name=None, url=None):
    """Update an Action's name and/or url.

    Returns False when the action is missing or the commit violates a
    constraint, True otherwise.
    """
    ret = True
    session = DBSession()
    try:
        action = session.query(Action).filter(Action.id == id).first()
        if not action:
            logging.error('not fount action<id=%s>' % id)
            return False
        if name:
            action.name = name
        if url:
            # BUG FIX: was `action.offset = url`, which silently dropped
            # the url update (the success log below reads action.url).
            action.url = url
        try:
            session.commit()
            logging.info('edit action success<id=%s, name=%s, url=%s>' %
                         (action.id, action.name, action.url))
        except IntegrityError as error:
            logging.error(str(error))
            ret = False
    finally:
        # Previously the session leaked on the not-found and error paths.
        session.close()
    return ret
def get_blog(blog_id):
    """Render the blog page for *blog_id* together with its comments.

    Returns an error string when the blog cannot be loaded; a comment
    lookup failure just renders the page with comments=None.
    """
    try:
        sess = DBSession()
        try:
            blog = sess.query(Blogs).filter(Blogs.id == blog_id).one()
        finally:
            # Previously leaked the session when the query raised.
            sess.close()
    except Exception:
        # The old dead `blog = None` before this return is dropped.
        return 'can not find the blog'
    user = g.get('user', None)
    try:
        sess = DBSession()
        try:
            comments = sess.query(Comments).filter(
                Comments.blog_id == blog_id).all()
        finally:
            sess.close()
    except Exception:
        comments = None
    return render_template("blog.html", blog=blog, user=user,
                           comments=comments)
def crawl_products():
    """Fetch the platform search index and store each entry as a Product.

    :raises CrawlFailed: when the HTTP status is not 200.
    """
    url = "https://files.wdzjimages.com/shuju/product/search.json"
    print("crawl products...")
    response = requests.get(url)
    if response.status_code != 200:
        print("crawl failed. (status is not 200)")
        raise CrawlFailed('crawl failed')
    products = response.json()
    # One session for the whole batch (was one per product), always closed.
    session = DBSession()
    try:
        for product in products:
            new_product = Product(
                plat_id=product.get('platId'),
                name=product.get('platName'),
                old_name=product.get('oldPlatName'),
                pingyin=product.get('allPlatNamePin'),
                pin=product.get('autoPin')
            )
            session.add(new_product)
            session.commit()
    finally:
        session.close()
def crawl_problem_plats_first_letter():
    """Backfill Product.first_letter from the problem-platform listing.

    Source: https://shuju.wdzj.com/problem-1.html
    """
    url = "https://shuju.wdzj.com/problem-list-all.html"
    params = {"year": ""}
    response = requests.get(url, params=params, headers=HEADERS)
    json_data = response.json()
    problem_plats = json_data.get('problemList')
    # One session for the whole batch (was one per iteration), always closed.
    session = DBSession()
    try:
        for problem_plat in problem_plats:
            plat_id = problem_plat.get('platId')
            wdzj_id = problem_plat.get('wdzjPlatId')
            first_letter = problem_plat.get('firstLetter')
            if wdzj_id != 0:
                product = session.query(Product).filter_by(plat_id=plat_id).first()
                # Guard: the platform may be absent from products
                # (previously crashed with AttributeError on None).
                if product is not None:
                    product.first_letter = first_letter
                    session.commit()
    finally:
        session.close()
def getUserLog(user,limit = True):
    """Return recent activity-log rows decorated for the UI.

    Each item carries date/time/type/message plus alternating-row flag and
    a type-specific icon/colour.  With limit=True only the latest 20 rows
    are returned.
    """
    # NOTE(review): the `user` parameter is never used -- the WHERE clause
    # compares log_user against the literal '******', which looks like a
    # redaction artifact of the original source.  Confirm and parameterize
    # (bound parameter, not string concatenation) before relying on this.
    # The %% sequences are escaped % signs for the DBAPI's pyformat style.
    sql = "SELECT DATE_FORMAT(DATE(log_datetime), '%%W %%D %%M %%Y') as log_date,TIME(log_datetime) as log_time,log_type,log_message,log_datetime as date1,log_datetime as date2 FROM activitylog WHERE log_user = '******' ORDER BY date1 DESC,date2 ASC,log_id desc"
    if limit:
        sql = sql + " LIMIT 20"
    mySession = DBSession()
    connection = mySession.connection()
    activities = connection.execute(sql)
    items = []
    count = 1
    for activity in activities:
        # Alternate row shading: odd rows get alt=True.
        if count%2 == 0:
            alt = False
        else:
            alt= True
        count = count + 1
        # Map the activity type code to a UI icon/colour pair.
        if activity[2] == "PRF":
            color = "terques"
            icon = "fa-user"
        else:
            if activity[2] == "MOD":
                color = "purple"
                icon = "fa-gears"
            else:
                if activity[2] == "FED":
                    color = "blue"
                    icon = "fa-leaf"
                else:
                    if activity[2] == "API":
                        color = "green"
                        icon = "fa-bolt"
                    else:
                        color = "red"
                        icon = "fa-bullhorn"
        items.append({"date":activity[0],"time":activity[1],"type":activity[2],"message":activity[3],"alt":alt,"icon":icon,"color":color})
    connection.close()
    mySession.close()
    return items
def checkLogin(user, password):
    """Return True when *user* is active and *password* matches the stored one."""
    mySession = DBSession()
    try:
        result = mySession.query(userModel).filter_by(
            user_name=user).filter_by(user_active=1).first()
        if result is None:
            return False
        return decodeData(result.user_password) == password
    finally:
        # Single close point replaces the three per-branch closes and also
        # covers the previously-leaking exception path.
        mySession.close()
def crawl_question(url, cookie, scheduler):
    """Crawl one Zhihu question page: save the raw HTML, queue related
    questions on *scheduler*, and persist the question plus its answers.

    (Python 2 code: print statements, byte-string .encode calls.)
    """
    #crawl question page
    #like host: http://www.zhihu.com/question/123456
    print 'start to question from url: ', url
    question_res = requests.get(url, cookies=cookie)
    # Keep a raw copy of the page, named by the numeric question id.
    with open('pages/question' + re.search(r'\d+', url).group() + '.html', 'wb') as question_file:
        question_file.write(question_res.content)
    session = DBSession()
    if question_res.status_code == 200:
        question_dom = BeautifulSoup(question_res.content)
        #find more question
        if question_dom.find("div", id="zh-question-related-questions"):
            all_related_ques = question_dom.find("div", id="zh-question-related-questions").find_all("a", class_="question_link")
            for ques in all_related_ques:
                '''
                new_url = ques.get('href') if ques.get('href').startswith('http') \
                    else 'http://www.zhihu.com/' + ques.get('href')
                '''
                # Normalize relative hrefs before queueing them.
                if ques.get('href').startswith('http'):
                    new_url = ques.get('href')
                elif ques.get('href').startswith('/'):
                    new_url = 'http://www.zhihu.com' + ques.get('href')
                else:
                    new_url = 'http://www.zhihu.com/' + ques.get('href')
                scheduler.add(new_url)
        #crawl data
        q_id = re.search(r'\d+', url).group()
        q_title = question_dom.find("div", id="zh-question-title").find('h2').contents[0].encode('utf-8')
        q_detail = ''
        # The detail body lives in a textarea when editable, a div otherwise.
        if question_dom.find('div', id='zh-question-detail').find('textarea'):
            q_detail = question_dom.find('div', id='zh-question-detail').find('textarea').get_text().encode('utf-8')
        else:
            q_detail = question_dom.find('div', id='zh-question-detail').get_text().encode('utf-8')
        q_author = 'default'
        question = Question(q_id=q_id, title=q_title, author=q_author, content=q_detail)
        #crawl answer data
        #a_id author votes content last_modify
        answer_list = question_dom.find_all("div", class_="zm-item-answer")
        for answer in answer_list:
            a_id = answer.get('data-aid')
            author_h3 = answer.find("h3", class_="zm-item-answer-author-wrap")
            if author_h3.find_all('a') and len(author_h3.find_all('a'))>1:
                author = author_h3.find_all('a')[1].text.encode('utf-8')
            else:
                # anonymous user
                author = author_h3.text.encode('utf-8')
            votes_div = answer.find("div", class_="zm-votebar")
            try:
                votes_span = votes_div.find_all("span")
            except:
                print votes_div
                sys.exit(1)
            votes = votes_span[1].text if len(votes_span)>1 else 0
            '''
            content = answer.find("div", class_="zm-editable-content").get_text().encode('utf-8')
            '''
            content_div = answer.find("div", class_='zm-editable-content')
            if content_div is None:
                content = "None"
            else:
                content = content_div.get_text().encode('utf-8')
            #last modify date
            try:
                # NOTE(review): str.index raises ValueError when the char is
                # absent (caught by the bare except below), so the != -1
                # comparison is always true when reached.
                last_modify_date = answer.find("a", class_='answer-date-link').text.split()[1]
                if last_modify_date.index('：') != -1:
                    last_modify_date = datetime.now().strftime("%Y-%m-%d")
            except:
                # Fallback date when the answer has no parsable date.
                last_modify_date = "2015-06-17"
            last_modify = datetime.strptime(last_modify_date, "%Y-%m-%d")
            answer_info = Answer(a_id=a_id, author=author, votes=votes, content=content, last_modify=last_modify)
            session.add(answer_info)
            question.answers.append(answer_info)
        session.add(question)
        session.commit()
        session.close()
        #process_question(question)
    else:
        print "Error: ", str(question_res.status_code)
def update_links( rss_feed_url ):
    """Inserts new announcements into the database.

    Notes:
    * Publishing date is the date the property is listed for sale. It might
      be very old.
    * We insert all entries in the rss feed to the database. The url field
      is unique so duplicates are not allowed.
    * When querying new entries, keep in mind to query the date based on
      ( timestamp = 'today' & pubDate = 'close enough' ) so only new listed
      properties are queried.

    Parameters
    ----------
    rss_feed_url : str
    """
    feed = feedparser.parse(rss_feed_url)
    entries = feed['entries']
    num_new_links = 0
    print('Updating liks database ..')
    # Snapshot of known URLs for duplicate detection.
    session = DBSession()
    url_rs = session.query(Link.url)
    url_list = [url for (url,) in url_rs]
    session.close()
    session = DBSession()
    browser = logged_in_browser()
    for cnt, entry in enumerate(entries):
        link = entry['link']
        published_str = entry['published']
        print(cnt, ':', link)
        if link in url_list:
            print('duplicate url, passing..')
            continue
        published = date_parser.parse(published_str)
        # Truncate the publish timestamp to midnight.
        pubDate = datetime.fromordinal(published.toordinal())
        new_link = Link(url=link, date=pubDate)
        session.add(new_link)
        # Randomized 2-6s delay between page fetches (crawl politeness).
        time.sleep(random.choice(range(20, 60))/10)
        try:
            data = crawl_hemnet_page(new_link.url, browser=browser)
        except Exception as e:
            # NOTE(review): e.message is Python 2 only -- confirm dialect.
            print('Error crawling hemnet page.', e.message)
            continue
        new_apt = Apartment(**data)
        new_apt.link = new_link
        session.add(new_apt)
        try:
            session.commit()
            num_new_links += 1
        except IntegrityError as e:
            # Duplicate url slipped past the snapshot check.
            print(e.message)
            print(link)
            session.rollback()
        except Exception as e:
            print(e.message)
            session.rollback()
        finally:
            # NOTE(review): rebinds a fresh session every iteration without
            # closing the previous one, and the last session is never
            # closed -- looks like a session leak; confirm before changing.
            session = DBSession()
    print('Done!')
    print('%s new links added.' % num_new_links)