def get(self):
    http = decorator.http()
    # Call the service using the authorized Http object.
    request = service.files().list()
    response = request.execute(http=http)
    resultlist = []
    # Keep only PDF files that are not in the trash.
    for f in filter(lambda x: x['mimeType'] == 'application/pdf'
                    and not x['labels']['trashed'], response['items']):
        downloadUrl = f.get('downloadUrl')
        if downloadUrl:
            logging.info('Request file %s' % f['title'])
            resp, content = http.request(downloadUrl)
            if resp.status == 200:
                logging.info('Request successful')
                # Count the pages of the downloaded PDF.
                pdfobj = PdfFileReader(StringIO(content))
                f['numpages'] = pdfobj.getNumPages()
                resultlist.append(f)
            else:
                logging.error('An error occurred %s' % resp)
        else:
            logging.info('No download url for file %s' % f['title'])
    logging.info('User email: %s' % users.get_current_user().email())
    curUser = User.get_or_insert(users.get_current_user().email(),
                                 email=users.get_current_user().email())
    for f in resultlist:
        Doc.get_or_insert(f['id'], parent=curUser.key(), title=f['id'],
                          totalPages=f['numpages'], user=curUser)
    template_values = {'filelist': resultlist}
    template = jinja_environment.get_template('home.html')
    self.response.out.write(template.render(template_values))
def post(self):
    json_data = request.get_json(force=True)
    if not json_data:
        return {'message': 'No input data provided'}, 400
    try:
        doc_data = doc_schema.load(json_data)
    except ValidationError as error:
        return {'message': error.messages}, 422
    # A slug must be unique within a (car, part) pair.
    duplicate = Doc.query.filter_by(slug=doc_data['slug'],
                                    car_id=doc_data['car_id'],
                                    part_id=doc_data['part_id']).first()
    if duplicate:
        return {'message': 'A document with this identifier already exists'}, 409
    user_id = get_jwt_identity()
    doc = Doc(name=doc_data['name'],
              slug=doc_data['slug'],
              car_id=doc_data['car_id'],
              part_id=doc_data['part_id'],
              creator_id=user_id)
    db.session.add(doc)
    db.session.commit()
    doc_data = doc_schema.dump(doc)
    return {'status': 'success', 'data': doc_data}, 200
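# A quick client-side smoke test for the handler above, as a sketch: the
# /docs route, port, and token value are assumptions, not taken from the source.
import requests

payload = {'name': 'Engine manual', 'slug': 'engine-manual',
           'car_id': 1, 'part_id': 2}  # illustrative field values
resp = requests.post('http://localhost:5000/docs',  # hypothetical mount point
                     json=payload,
                     headers={'Authorization': 'Bearer <access token>'})
print(resp.status_code, resp.json())  # expect 200, then 409 on a repeat POST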
def get_docs(self):
    doc_list = list()
    content = self.get_list_content(self.seed_url)
    result = json.loads(content)
    for entity in result.get('entity') or []:
        for sub_entity in entity.get('sub_entity') or []:
            title = sub_entity.get("title", "")
            detail_url = sub_entity.get("action", dict()).get("url", "")
            snippet = sub_entity.get("snippet", "")
            covers = sub_entity.get("cover", [])
            cover = ""
            if len(covers) > 0:
                cover = covers[0].get('url', "")
            author = ""
            # datePublished is in milliseconds; default to "now" when missing.
            published_time = sub_entity.get(
                "datePublished", int(time.time()) * 1000) / 1000
            doc = Doc(self.source, title, detail_url, snippet, cover,
                      author, self.channel, published_time)
            CrawlerLogger.logger.info("get doc " + str(doc))
            doc_list.append(doc)
    return doc_list
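# For reference, a minimal payload that exercises every branch of get_docs
# above, reconstructed from its .get() calls; the field values are invented.
sample = {
    "entity": [{
        "sub_entity": [{
            "title": "Example title",
            "action": {"url": "http://example.com/detail"},
            "snippet": "Short summary",
            "cover": [{"url": "http://example.com/cover.jpg"}],
            "datePublished": 1466000000000,  # milliseconds
        }],
    }],
}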
def get_doc(self, docid):
    sql = 'select * from doc where docid=%s'
    sql_str = sql % docid  # interpolated copy, used for logging only
    CrawlerLogger.logger.info('get: {0}\n'.format(sql_str))
    MysqlLogger.logger.info('get: {0}\n'.format(sql_str))
    doc_row = self.db.get(sql, docid)
    if doc_row:
        return Doc.to_doc(doc_row)
    return None
def get_docs(self):
    doc_list = list()
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
        }
        r = requests.get(self.seed_url, verify=False, headers=headers)
        if r.status_code == 200:
            html_text = r.text
        else:
            CrawlerLogger.logger.info(
                "request seed_url error {0} status: {1}".format(
                    self.seed_url, r.status_code))
            return doc_list
    except Exception as e:
        CrawlerLogger.logger.info("request seed_url error {0} {1}".format(
            self.seed_url, e))
        return doc_list
    results = json.loads(html_text)
    index = 0
    for item in results:
        CrawlerLogger.logger.info('--------------------' + str(index))
        index += 1
        title = item.get('title', "")
        detail_url = "https://zhuanlan.zhihu.com" + item.get('url', "")
        snippet = item.get('content', "")
        dr = re.compile(r'<[^>]+>', re.S)  # strip HTML tags from the snippet
        snippet = dr.sub('', snippet)[:200]
        cover = item.get('titleImage', "")
        author = item.get('author', dict()).get("name", "")
        published_str = item.get('publishedTime', "")
        try:
            # e.g. 2016-05-22T21:09:03+08:00 -- drop the offset before parsing
            published_str = published_str.split('+')[0]
            published_time = int(
                time.mktime(
                    time.strptime(published_str, "%Y-%m-%dT%H:%M:%S")))
        except Exception as e:
            published_time = int(time.time())
            CrawlerLogger.logger.info(
                "get publish time error {0}, {1}".format(published_str, e))
        doc = Doc(self.source, title, detail_url, snippet, cover, author,
                  self.channel, published_time)
        CrawlerLogger.logger.info("get doc " + str(doc))
        doc_list.append(doc)
    return doc_list
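# Note on the timestamp handling above: split('+')[0] discards the +08:00
# offset, and time.mktime then interprets the remainder in the crawler's local
# timezone, so the stored epoch is exact only when the process runs in UTC+8.
# The step in isolation:
import time

published_str = "2016-05-22T21:09:03+08:00"
local_part = published_str.split('+')[0]  # "2016-05-22T21:09:03"
published_time = int(time.mktime(
    time.strptime(local_part, "%Y-%m-%dT%H:%M:%S")))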
def create_doc(url, title, publish_date, body, owner):
    """Create a document."""
    tz = pytz.timezone('America/Los_Angeles')
    created_at = datetime.now(tz)
    doc = Doc(url=url,
              title=title,
              publish_date=publish_date,
              body=body,
              owner=owner,
              created_at=created_at)
    db.session.add(doc)
    db.session.commit()
    return doc
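# A minimal call with illustrative arguments; some_user stands in for an
# existing User row and is an assumption, not a name from the source.
from datetime import date

doc = create_doc(url='https://example.com/post',
                 title='Example post',
                 publish_date=date(2020, 1, 15),
                 body='Full text of the article',
                 owner=some_user)  # some_user: hypothetical existing User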
def get_docs(self, source=None, channel=None, start=0, page_size=10):
    sql = 'SELECT * FROM doc'
    wheres = list()
    params = list()
    # Bind filter values as query parameters instead of concatenating them
    # into the SQL string, so caller input cannot inject SQL.
    if source:
        wheres.append("source=%s")
        params.append(source)
    if channel:
        wheres.append("channel=%s")
        params.append(channel)
    if len(wheres) > 0:
        sql += " WHERE " + " AND ".join(wheres)
    # LIMIT arguments are coerced to int, which is injection-safe.
    sql += " ORDER BY publishTime DESC LIMIT {0},{1}".format(
        int(start), int(page_size))
    CrawlerLogger.logger.info('get ' + sql)
    MysqlLogger.logger.info('get ' + sql)
    doc_rows = self.db.query(sql, *params)
    docs = Doc.to_docs(doc_rows)
    return docs
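# Hypothetical call sites, assuming the method lives on a store object and
# that 'zhihu'/'tech' are valid source and channel values (illustrative only).
first_page = store.get_docs(source='zhihu', channel='tech', start=0, page_size=20)
next_page = store.get_docs(source='zhihu', channel='tech', start=20, page_size=20)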
def get_doc_from_info(self, item, published_time):
    title = item.get('title', "")
    # content_url is relative; resolve it against the mp.weixin.qq.com host.
    detail_url_tmp = "http://mp.weixin.qq.com" + item.get('content_url', "")
    detail_content = self.get_detail_content(detail_url_tmp)
    if detail_content == "":
        return None
    detail_url = self.get_detail_url(detail_content)
    if detail_url == "":
        CrawlerLogger.logger.warn(
            "get detail url error {0}".format(detail_url_tmp))
        return None
    snippet = item.get('digest', "")
    cover = item.get('cover', "")
    author = item.get('author', "")
    doc = Doc(self.source, title, detail_url, snippet, cover, author,
              self.channel, published_time)
    CrawlerLogger.logger.info("get doc " + str(doc))
    return doc
def get_doc_from_content(self, content, detail_url):
    html = etree.HTML(content)
    title = util.get_text(html, '//title', True)  # True returns text; False returns the node
    snippet = ''
    cover = ""
    cover_list = re.findall(r'data-src="(http.+?)"', content, re.I)
    if cover_list:
        cover = cover_list[len(cover_list) // 2]  # take the middle image as the cover
    tmp_meta = html.xpath('//em[@class="rich_media_meta rich_media_meta_text"]')
    detail_url = detail_url + '#rd'
    CrawlerLogger.logger.info("detail_url: " + detail_url)
    if len(tmp_meta) > 1:
        author = tmp_meta[1].text
    else:
        author = util.get_text(html, '//em[@id="post-user"]', True)
    publish_time = tmp_meta[0].text
    time_struct = time.strptime(publish_time, "%Y-%m-%d")
    published_time = int(time.mktime(time_struct))
    doc = Doc(self.source, title, detail_url, snippet, cover, author,
              self.channel, published_time)
    CrawlerLogger.logger.info("get doc " + str(doc))
    return doc
def get_docs(self):
    doc_list = list()
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
        }
        r = requests.get(self.seed_url, headers=headers)
        if r.status_code == 200:
            html_text = r.text
        else:
            CrawlerLogger.logger.info(
                "request seed_url error {0} status: {1}".format(
                    self.seed_url, r.status_code))
            return doc_list
    except Exception as e:
        CrawlerLogger.logger.info("request seed_url error {0} {1}".format(
            self.seed_url, e))
        return doc_list
    html = etree.HTML(html_text)
    channel = util.get_text(html, '//div[@class="block archive"]/h3', True).strip()
    if channel and channel != "":
        CrawlerLogger.logger.info("get channel {0}".format(channel.encode('utf-8')))
        self.channel = channel
    result = html.xpath('//div[@class="block archive"]/div')
    CrawlerLogger.logger.info("result size: {0}".format(len(result)))
    index = 0
    for item in result[0:-1]:  # skip the trailing div
        CrawlerLogger.logger.info('--------------------' + str(index))
        index += 1
        title = util.get_text(item, './/h2/a/@title', False)
        detail_url = util.get_text(item, './/h2/a/@href', False)
        snippet = util.get_text(item, './/p', True)
        cover = util.get_text(item, './/div[@class="block-image"]/a/img/@src', False)
        author = util.get_text(item, './/span[@class="heading-author"]', True)
        published_str_raw = util.get_text(
            item, './/span[@class="heading-date"]', True)
        published_str = published_str_raw
        # Rewrite Chinese month names to numbers; the two-character months
        # come first so that u'十一月' is not corrupted by the u'一月' rule.
        published_str = published_str.replace(u'十一月', '11')
        published_str = published_str.replace(u'十二月', '12')
        published_str = published_str.replace(u'一月', '01')
        published_str = published_str.replace(u'二月', '02')
        published_str = published_str.replace(u'三月', '03')
        published_str = published_str.replace(u'四月', '04')
        published_str = published_str.replace(u'五月', '05')
        published_str = published_str.replace(u'六月', '06')
        published_str = published_str.replace(u'七月', '07')
        published_str = published_str.replace(u'八月', '08')
        published_str = published_str.replace(u'九月', '09')
        published_str = published_str.replace(u'十月', '10')
        try:
            published_time = int(
                time.mktime(time.strptime(published_str, "%m %d, %Y")))
        except Exception as e:
            published_time = int(time.time())
            CrawlerLogger.logger.info(
                "get publish time error {0}, {1}: {2}".format(
                    published_str_raw.encode('utf-8'), published_str, e))
        doc = Doc(self.source, title, detail_url, snippet, cover, author,
                  self.channel, published_time)
        CrawlerLogger.logger.info("get doc " + str(doc))
        doc_list.append(doc)
    return doc_list
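# The replacement chain above is order-sensitive: u'一月' and u'二月' are
# substrings of u'十一月' and u'十二月', so the two-character months must be
# rewritten first. A table-driven equivalent with a quick self-check
# (a sketch, not from the source):
def normalize_month(published_str):
    months = [(u'十一月', '11'), (u'十二月', '12'),
              (u'一月', '01'), (u'二月', '02'), (u'三月', '03'),
              (u'四月', '04'), (u'五月', '05'), (u'六月', '06'),
              (u'七月', '07'), (u'八月', '08'), (u'九月', '09'),
              (u'十月', '10')]
    # Reversing the order would turn u'十一月' into u'十01月'.
    for cn_month, num in months:
        published_str = published_str.replace(cn_month, num)
    return published_str

assert normalize_month(u'十一月 03, 2016') == u'11 03, 2016'
assert normalize_month(u'一月 15, 2017') == u'01 15, 2017'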
def get_latest_doc(self, query_html_text):
    """Get the newest article directly from the query page.

    :param query_html_text:
    :return:
    """
    html = etree.HTML(query_html_text)
    tmp_detail_url = util.get_text(
        html, '//div[@class="news-box"]/ul/li/dl/dd/a/@href', False)
    CrawlerLogger.logger.info("tmp_detail_url: " + tmp_detail_url)
    detail_content = self.get_detail_content(tmp_detail_url, pause=False)
    if detail_content == "":
        return None
    detail_url = MediaPressAdapter.get_detail_url(detail_content)
    CrawlerLogger.logger.info("detail_url: " + detail_url)
    if len(detail_url) == 0:
        CrawlerLogger.logger.warning("error detail_url: " + detail_url)
        CrawlerLogger.logger.warning("detail_content: " + detail_content)
        return None
    # The article metadata is embedded as JS variables in the page source.
    finds = re.findall(r'var msg_title = "(.*)"', detail_content)
    if len(finds) == 0:
        CrawlerLogger.logger.warning("get title str error" + detail_content)
        return None
    title = finds[0]
    CrawlerLogger.logger.info("title: " + title.encode('utf-8'))
    finds = re.findall(r'var msg_desc = "(.*)"', detail_content)
    if len(finds) == 0:
        CrawlerLogger.logger.warning("get snippet str error" + detail_content)
        return None
    snippet = finds[0]
    CrawlerLogger.logger.info("snippet: " + snippet.encode('utf-8'))
    finds = re.findall(r'var msg_cdn_url = "(.*)"', detail_content)
    if len(finds) == 0:
        CrawlerLogger.logger.warning("get cover str error" + detail_content)
        return None
    cover = finds[0]
    CrawlerLogger.logger.info("cover: " + cover)
    finds = re.findall(r'var ct = "(.*)"', detail_content)
    if len(finds) == 0:
        CrawlerLogger.logger.warning("get publish time str error" + detail_content)
        return None
    published_time = int(finds[0])
    CrawlerLogger.logger.info("published_time: " + str(published_time))
    content_html = etree.HTML(detail_content)
    author = util.get_text(
        content_html,
        '//em[@class="rich_media_meta rich_media_meta_text"][last()]',
        True)
    CrawlerLogger.logger.info("author: " + author)
    doc = Doc(self.source, title, detail_url, snippet, cover, author,
              self.channel, published_time)
    CrawlerLogger.logger.info("get latest doc succ " + str(doc))
    return doc