def save(self, url, author, title):
    """Persist a new Topic(url, author, title) unless the url already exists.

    On a duplicate url, logs a warning and increments ``self.existed_cnt``
    instead of inserting; otherwise creates and saves a new Topic row.
    """
    # Look the url up first so repeated crawls don't insert duplicates.
    entity = session.query(Topic).filter_by(url=url).first()
    if entity:
        # FIX: pass %-args lazily instead of pre-formatting with `%` —
        # the message is only built if the WARNING level is enabled.
        logger.warning('topic已存在,id:%s', entity.id)
        self.existed_cnt += 1
    else:
        t = Topic(url=url, author=author, title=title)
        t.save()
        logger.info(t)
def parse_topic_page(self, url, html):
    """Parse a topic detail page; persist a new Topic when the url is unknown.

    Extracts author, title and description from the V2EX-style page markup
    and saves them only if no Topic with this url is stored yet.
    """
    page = BeautifulSoup(html, 'lxml')
    topic_author = page.find('small', class_='gray').a.text
    # The <title> looks like "<topic title> - V2EX"; keep only the left part.
    topic_title = page.title.text.split(' - V2EX')[0]
    topic_description = page.find('div', class_='topic_content').text
    # Guard clause: nothing to do when this url was already crawled.
    if session.query(Topic).filter_by(url=url).first():
        return
    record = Topic(url=url, author=topic_author, title=topic_title,
                   description=topic_description)
    record.save()
    logger.info(record)
def _persist_detail_info(self, html, group_name, url):
    """Parse a topic detail page and persist it as a Topic row.

    Returns the topic dict on success, or ``None`` when the page is an
    anti-robot block page or no title can be extracted.
    """
    # The site serves a "机器人" (robot-check) page when rate-limited;
    # remember the url in the 403 set so it can be retried later.
    if "机器人".encode() in html:
        # FIX: logging.warn() is deprecated; use logging.warning().
        logging.warning("{} 403.html".format(url))
        self.cache.r_sadd("group:{}:403".format(group_name), url)
        return None

    topic = {}
    images = []
    # Try the large-title rule first, fall back to the small-title rule.
    title = self.parser(self.__rules["detail_title_lg"], html, True) or self.parser(
        self.__rules["detail_title_sm"], html, True)
    if title is None:
        return None
    topic["title"] = title.strip()
    topic["url"] = url
    topic["crawled_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
    topic["create_time"] = self.parser(self.__rules["create_time"], html, True)
    author = self.parser(self.__rules["detail_author"], html, True)
    topic["author"] = filter_emoji(author)
    content = "\n".join(self.parser(self.__rules["content"], html))
    # FIX: the original tested `content is not ""` — an identity comparison
    # with a literal (SyntaxWarning since 3.8, implementation-defined result).
    # Truthiness expresses the intent: fall back to the text rule when the
    # primary content rule extracted nothing.
    if content:
        topic["content"] = filter_emoji(content)
    else:
        content = "\n".join(self.parser(self.__rules["content_text"], html))
        topic["content"] = filter_emoji(content)
    images.extend(self.parser(self.__rules["images"], html))
    topic["images"] = ",".join(images) if images else ""
    # First run of digits in the url is taken as the topic id — assumes the
    # url always contains one (raises IndexError otherwise); TODO confirm.
    topic["topic_id"] = re.findall(r"(\d+)", url)[0]
    # phone = re.findall(r'(1[3|5|7|8|][0-9]{8})', content)
    # topic['phone'] = '' if not phone else phone[0]
    # sns = re.findall(r'(微信|qq|QQ)号?(:|:|\s)?(\s)?([\d\w_一二两三四五六七八九零]{5,})', content)
    # topic['sns'] = '' if not sns else sns[0]
    # area = re.findall(r'((\d{1,3})(多)?[平|㎡])', content)
    # topic['area'] = '' if not area else ''.join(area[0])
    # modle = re.findall(r'([\d一二两三四五六七八九][居室房]([123一二两三]厅)?([12一二两]厨)?([1234一二两三四]卫)?([12一二两]厨)?)', content)
    # topic['model'] = ''
    # Wrap the insert in a transaction.
    with mysql_db.atomic():
        Topic.create(**topic)
    return topic
def get(self, topic_id):
    """Return the Topic with *topic_id* as JSON, or abort with 404.

    The stored comma-joined ``images`` string is expanded back into a list
    before serialization.
    """
    try:
        topic = Topic.get(Topic.topic_id == topic_id)
        topic_dict = model_to_dict(topic)
        topic_dict["images"] = topic_dict["images"].split(",")
        return jsonify(topic_dict)
    except Exception:
        # FIX: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        # Exception keeps the original best-effort 404 behavior (peewee raises
        # Topic.DoesNotExist on a miss) without trapping interpreter exits.
        abort(404, message="Topic id {} is not found".format(topic_id))
def test_Topic(db_handle):
    """Exercise create / query / delete round-trips on the Topic model.

    Topic schema under test:
        id        Integer primary key
        name      String(256), nullable
        questions relationship("Question", back_populates="topic")
    """
    # -- creating a row --
    db_handle.session.add(Topic(name='nametest'))
    db_handle.session.commit()
    assert Topic.query.count() == 1

    # -- the stored name round-trips --
    stored = Topic.query.filter_by(name='nametest').first()
    assert stored.name == 'nametest'

    # -- deleting: insert a second row, remove it again --
    db_handle.session.add(Topic(name='addtest'))
    db_handle.session.commit()
    assert Topic.query.count() == 2

    doomed = Topic.query.filter_by(name='addtest').first()
    db_handle.session.delete(doomed)
    db_handle.session.commit()
    assert Topic.query.count() == 1

    # -- the original row survived the delete --
    assert Topic.query.first().name == 'nametest'
def scheduler(url=None):
    """Crawl one url (when given) or every (url, updated) pair from
    parse_list(); insert new Topic rows and refresh stale ones.

    Returns the literal string 'ok'.
    """
    if url:
        # Single-url mode: rebuild the target from the raw request path,
        # dropping its first 10 characters (presumably the route prefix —
        # TODO confirm against the route definition).
        targets = [('http://' + request.full_path[10:], datetime.now())]
    else:
        targets = parse_list()

    for url, updated in targets:
        topic = db.session.query(Topic).filter(Topic.url == url).first()
        if topic and topic.updated == updated:
            # Unchanged since the last crawl — skip the expensive page fetch.
            continue
        title, published, body = parse_page(url)
        if topic:
            topic.title = title
            topic.body = body
            topic.updated = updated
        else:
            db.session.add(Topic(url, title, published, updated, body))
        db.session.commit()
    return 'ok'
from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from flask import url_for from db import Base, Topic, Category, Article engine = create_engine('sqlite:///compscicatalog.db?check_same_thread=False') Base.metadata.bind = engine DBSession = sessionmaker(bind=engine) session = DBSession() AI = Topic(name='Artificial Intelligence', url='Artificial-Intelligence', image='AI.jpeg') session.add(AI) session.commit() supervised_learning = Category(name='Supervised Learning', url='Supervised-Learning', topic_id=1, image='supervised.png') session.add(supervised_learning) session.commit() neural_networks = Article(name='Neural Networks', category_id=1, content='Artificial neural networks (ANN) or ' + 'connectionist systems are computing ' + 'systems vaguely inspired by the biological ' + 'neural networks that constitute ' +
from db import db, Topic, Question, Answer, Comment, User, Quiz
from datetime import datetime

# Seed script: build the schema, then insert one sample row per model,
# committing and announcing each insert as it lands.
db.create_all()
print('Database created!')
print('Populating...')

## create Topic resource to db
sample_topic = Topic(name='Test_topic_name')
db.session.add(sample_topic)
db.session.commit()
print('Added Topic')

## create User resource to db
sample_user = User(username='******', email='test_email', pw_hash='test_pw_hash')
db.session.add(sample_user)
db.session.commit()
print('Added User')

## create Question resource to db
sample_question = Question(
    topic_id=1,  # must reference the Topic inserted above
    question_text='test_topic_id',
    image_src='test_img_link')
db.session.add(sample_question)
db.session.commit()
print('Added Question')

## create Answer resource to db