Example #1
	def get(self):
		http = decorator.http()
		# Call the service using the authorized Http object.
		request = service.files().list()
		response = request.execute(http=http)
		resultlist = []
		# Keep only PDF files that are not in the trash
		for f in filter(lambda x: x['mimeType'] == 'application/pdf' and not x['labels']['trashed'], response['items']):
			downloadUrl = f.get('downloadUrl')
			if downloadUrl:
				logging.info('Request file %s' % f['title'])	
				resp, content = http.request(downloadUrl)
				if resp.status == 200:
					logging.info('Request successful')
					pdfobj = PdfFileReader(StringIO(content))
					f['numpages'] = pdfobj.getNumPages()
					resultlist.append(f)				
				else:
					logging.error('An error occurred %s' % resp)
			else:
				logging.info('No download url for file %s' % f['title'])

		logging.info('User email: %s' % users.get_current_user().email())
		curUser = User.get_or_insert(users.get_current_user().email(),email=users.get_current_user().email())
		for f in resultlist:
			Doc.get_or_insert(f['id'],parent=curUser.key(),title=f['id'],totalPages=f['numpages'],user=curUser)

		template_values = {
			'filelist' : resultlist
		}
		template = jinja_environment.get_template('home.html')
		self.response.out.write(template.render(template_values))
Example #2
 def post(self):
     json_data = request.get_json(force=True)
     if not json_data:
         return {'message': 'No input data provided'}, 400
     try:
         doc_data = doc_schema.load(json_data)
     except ValidationError as error:
         return {'message': error.messages}, 422
     duplicate = Doc.query.filter_by(slug=doc_data['slug'],
                                     car_id=doc_data['car_id'],
                                     part_id=doc_data['part_id']).first()
     if duplicate:
         return {
             'message': 'A document with this identifier already exists'
         }, 409
     user_id = get_jwt_identity()
     doc = Doc(name=doc_data['name'],
               slug=doc_data['slug'],
               car_id=doc_data['car_id'],
               part_id=doc_data['part_id'],
               creator_id=user_id)
     db.session.add(doc)
     db.session.commit()
     doc_data = doc_schema.dump(doc)
     return {'status': 'success', 'data': doc_data}, 200
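
The handler above reads like a Flask-RESTful Resource method protected by flask-jwt-extended (it calls get_jwt_identity). A minimal wiring sketch under that assumption; the class name DocListResource, the /docs route, and the app/api/jwt objects are illustrative and not part of the original example:

# Hypothetical wiring for the post() handler above (assumed names, not from the example)
from flask import Flask
from flask_restful import Api, Resource
from flask_jwt_extended import JWTManager, jwt_required

app = Flask(__name__)
app.config['JWT_SECRET_KEY'] = 'change-me'  # placeholder secret for the sketch
api = Api(app)
jwt = JWTManager(app)

class DocListResource(Resource):
    # get_jwt_identity() in post() requires a valid access token on the request
    method_decorators = [jwt_required()]  # flask-jwt-extended 4.x call style
    # the post(self) method shown above would live here

api.add_resource(DocListResource, '/docs')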
Example #3
    def get_docs(self):
        doc_list = list()
        content = self.get_list_content(self.seed_url)
        result = json.loads(content)

        if result.get('entity', None) and len(result.get('entity', [])) > 0:
            for entity in result.get('entity'):
                if entity.get('sub_entity', None) and len(entity.get('sub_entity', [])) > 0:
                    for sub_entity in entity.get('sub_entity', None):
                        title = sub_entity.get("title", "")
                        detail_url = sub_entity.get("action", dict()).get("url", "")
                        snippet = sub_entity.get("snippet", "")
                        covers = sub_entity.get("cover", [])
                        cover = ""
                        if len(covers) > 0:
                            cover = covers[0].get('url', "")

                        author = ""

                        published_time = sub_entity.get("datePublished", int(time.time()) * 1000) / 1000

                        doc = Doc(self.source, title, detail_url, snippet, cover, author, self.channel, published_time)
                        CrawlerLogger.logger.info("get doc" + str(doc))
                        doc_list.append(doc)
        return doc_list
Example #4
 def get_doc(self, docid):
     sql = 'select * from doc where docid=%s'
     sql_str = sql % docid
     CrawlerLogger.logger.info('get: {0}\n'.format(sql_str))
     MysqlLogger.logger.info('get: {0}\n'.format(sql_str))
     doc_row = self.db.get(sql, docid)
     if doc_row:
         return Doc.to_doc(doc_row)
     return None
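
A minimal call sketch for this accessor; it assumes the method sits on a DAO-style class (called DocStore here purely for illustration) constructed with a db handle whose get(sql, *args) binds the %s placeholder:

# Hypothetical call site (DocStore and the docid value are illustrative)
store = DocStore(db)
doc = store.get_doc(12345)
if doc is None:
    CrawlerLogger.logger.info('doc 12345 not found')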
Example #5
    def get_docs(self):
        doc_list = list()
        try:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
            }
            r = requests.get(self.seed_url, verify=False, headers=headers)
            if r.status_code == 200:
                html_text = r.text
                # print html_text
            else:
                CrawlerLogger.logger.info(
                    "request seed_url error {0} status: {1}".format(
                        self.seed_url, r.status_code))
                return doc_list
        except Exception as e:
            CrawlerLogger.logger.info("request seed_url error {0} {1}".format(
                self.seed_url, e))
            return doc_list

        results = json.loads(html_text)
        index = 0
        for item in results:
            CrawlerLogger.logger.info('--------------------' + str(index))
            index += 1

            title = item.get('title', "")
            detail_url = "https://zhuanlan.zhihu.com" + item.get('url', "")
            snippet = item.get('content', "")
            dr = re.compile(r'<[^>]+>', re.S)  # strip HTML tags
            snippet = dr.sub('', snippet)[:200]
            cover = item.get('titleImage', "")
            author = item.get('author', dict()).get("name", "")
            published_str = item.get('publishedTime', "")
            try:  # 2016-05-22T21:09:03+08:00
                published_str = published_str.split('+')[0]
                published_time = int(
                    time.mktime(
                        time.strptime(published_str, "%Y-%m-%dT%H:%M:%S")))
            except Exception as e:
                published_time = int(time.time())
                CrawlerLogger.logger.info(
                    "get publish time error {0}, {1}".format(published_str, e))

            doc = Doc(self.source, title, detail_url, snippet, cover, author,
                      self.channel, published_time)
            CrawlerLogger.logger.info("get doc" + str(doc))
            doc_list.append(doc)
        return doc_list
Example #6
File: crud.py  Project: li-lauren/JOT
def create_doc(url, title, publish_date, body, owner):
    """Create a document."""

    tz = pytz.timezone('America/Los_Angeles')
    created_at = datetime.now(tz)

    doc = Doc(url=url,
              title=title,
              publish_date=publish_date,
              body=body,
              owner=owner,
              created_at=created_at)

    db.session.add(doc)
    db.session.commit()

    return doc
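
A minimal usage sketch for create_doc; the argument values are placeholders, and owner is assumed to be whatever the Doc model's owner field accepts (a user object or id):

# Hypothetical usage (argument values are placeholders)
from datetime import datetime

new_doc = create_doc(
    url='https://example.com/article',
    title='Example article',
    publish_date=datetime(2021, 1, 1),
    body='Article text ...',
    owner=1,  # assumed: user id / relationship accepted by the Doc model
)
print(new_doc.url)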
Example #7
    def get_docs(self, source=None, channel=None, start=0, page_size=10):
        sql = 'SELECT * FROM doc'

        # Build optional WHERE filters; note the values are interpolated directly
        # into the SQL string (a parameterized sketch follows this example)
        wheres = list()
        if source:
            wheres.append("source='" + source + "'")
        if channel:
            wheres.append("channel='" + channel + "'")

        if len(wheres) > 0:
            sql += " WHERE " + " AND ".join(wheres)

        sql += " ORDER BY publishTime DESC LIMIT {0},{1}".format(start, page_size)
        CrawlerLogger.logger.info('get ' + sql)
        MysqlLogger.logger.info('get ' + sql)
        doc_rows = self.db.query(sql)
        docs = Doc.to_docs(doc_rows)
        return docs
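
Because source and channel are spliced directly into the SQL string above, the query is open to SQL injection if either value comes from user input. A parameterized sketch, assuming self.db.query(sql, *args) binds %s placeholders the way the MySQLdb/torndb-style wrapper in Example #4 appears to:

    # Hypothetical parameterized variant (assumes self.db.query(sql, *args) binds %s placeholders)
    def get_docs_param(self, source=None, channel=None, start=0, page_size=10):
        sql = 'SELECT * FROM doc'
        wheres, args = [], []
        if source:
            wheres.append('source=%s')
            args.append(source)
        if channel:
            wheres.append('channel=%s')
            args.append(channel)
        if wheres:
            sql += ' WHERE ' + ' AND '.join(wheres)
        sql += ' ORDER BY publishTime DESC LIMIT %s,%s'
        args.extend([int(start), int(page_size)])
        return Doc.to_docs(self.db.query(sql, *args))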
Example #8
    def get_doc_from_info(self, item, published_time):
        title = item.get('title', "")
        detail_url_tmp = "http://mp.weixin.qq.com" + item.get(
            'content_url', "")
        detail_content = self.get_detail_content(detail_url_tmp)
        if detail_content == "":
            return None
        detail_url = self.get_detail_url(detail_content)
        if detail_url == "":
            CrawlerLogger.logger.warn(
                "get detail url error {0}".format(detail_url_tmp))
            return None
        snippet = item.get('digest', "")
        cover = item.get('cover', "")
        author = item.get('author', "")

        doc = Doc(self.source, title, detail_url, snippet, cover, author,
                  self.channel, published_time)
        CrawlerLogger.logger.info("get doc" + str(doc))
        return doc
Example #9
    def get_doc_from_content(self, content, detail_url):
        html = etree.HTML(content)
        title = util.get_text(html, '//title', True)  # returns an element object (not text) when the flag is False
        snippet = ''
        # cover = re.findall(r"var msgList = (.*)]};", html_text)
        cover_list = re.findall(r'data-src="(http.+?)"', content, re.I)
        # Pick the middle image as the cover; guard against pages with no images
        cover = cover_list[len(cover_list) // 2] if cover_list else ''
        tmp_meta = html.xpath('//em[@class="rich_media_meta rich_media_meta_text"]')
        detail_url = detail_url + '#rd'
        print detail_url
        if len(tmp_meta) > 1:
            author = tmp_meta[1].text
        else:
            author = util.get_text(html, '//em[@id="post-user"]', True)
        publish_time = tmp_meta[0].text
        time_struct = time.strptime(publish_time, "%Y-%m-%d")
        published_time = int(time.mktime(time_struct))
        # channel = tmp_meta[2]

        doc = Doc(self.source, title, detail_url, snippet, cover, author, self.channel, published_time)
        CrawlerLogger.logger.info("get doc" + str(doc))
        return doc
Example #10
    def get_docs(self):
        doc_list = list()
        try:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
            }
            r = requests.get(self.seed_url, headers=headers)
            if r.status_code == 200:
                html_text = r.text
            else:
                CrawlerLogger.logger.info(
                    "request seed_url error {0} status: {1}".format(
                        self.seed_url, r.status_code))
                return doc_list
        except Exception as e:
            CrawlerLogger.logger.info("request seed_url error {0} {1}".format(
                self.seed_url, e))
            return doc_list

        html = etree.HTML(html_text)
        channel = util.get_text(html, '//div[@class="block archive"]/h3',
                                True).strip()
        if channel and channel != "":
            CrawlerLogger.logger.info("get channel {0}".format(
                channel.encode('utf-8')))
            self.channel = channel
        result = html.xpath('//div[@class="block archive"]/div')
        # print type(result)
        # print len(result)
        CrawlerLogger.logger.info("result size: {0}".format(len(result)))
        index = 0
        for item in result[0:-1]:
            CrawlerLogger.logger.info('--------------------' + str(index))
            index += 1
            # print type(item)
            # print etree.tostring(item)

            title = util.get_text(item, './/h2/a/@title', False)
            detail_url = util.get_text(item, './/h2/a/@href', False)
            snippet = util.get_text(item, './/p', True)
            cover = util.get_text(item,
                                  './/div[@class="block-image"]/a/img/@src',
                                  False)
            author = util.get_text(item, './/span[@class="heading-author"]',
                                   True)
            published_str_raw = util.get_text(
                item, './/span[@class="heading-date"]', True)

            # Map Chinese month names to two-digit numbers; the two-character
            # months come first so that e.g. u'十一月' is not matched by u'一月'
            month_map = [
                (u'十一月', '11'), (u'十二月', '12'), (u'一月', '01'),
                (u'二月', '02'), (u'三月', '03'), (u'四月', '04'),
                (u'五月', '05'), (u'六月', '06'), (u'七月', '07'),
                (u'八月', '08'), (u'九月', '09'), (u'十月', '10'),
            ]
            published_str = published_str_raw
            for cn_month, num_month in month_map:
                published_str = published_str.replace(cn_month, num_month)

            try:
                published_time = int(
                    time.mktime(time.strptime(published_str, "%m %d, %Y")))
            except Exception as e:
                published_time = int(time.time())
                CrawlerLogger.logger.info(
                    "get publish time error {0}, {1}".format(
                        published_str_raw.encode('utf-8'), e))

            doc = Doc(self.source, title, detail_url, snippet, cover, author,
                      self.channel, published_time)
            CrawlerLogger.logger.info("get doc" + str(doc))
            doc_list.append(doc)
        return doc_list
Example #11
    def get_latest_doc(self, query_html_text):
        """
        直接从query页看到新最新的一篇文章
        :param query_html_text:
        :return:
        """
        html = etree.HTML(query_html_text)
        # print query_html_text
        tmp_detail_url = util.get_text(
            html, '//div[@class="news-box"]/ul/li/dl/dd/a/@href', False)
        CrawlerLogger.logger.info("tmp_detail_url: " + tmp_detail_url)

        detail_content = self.get_detail_content(tmp_detail_url, pause=False)
        if detail_content == "":
            return None
        detail_url = MediaPressAdapter.get_detail_url(detail_content)
        CrawlerLogger.logger.info("detail_url: " + detail_url)
        if len(detail_url) == 0:
            CrawlerLogger.logger.warning("error detail_url: " + detail_url)
            CrawlerLogger.logger.warning("detail_content: " + detail_content)
            return None

        finds = re.findall(r'var msg_title = "(.*)"', detail_content)
        if len(finds) == 0:
            CrawlerLogger.logger.warn("ger title str error" + detail_content)
            return None
        title = finds[0]
        CrawlerLogger.logger.info("title: " + title.encode('utf-8'))

        finds = re.findall(r'var msg_desc = "(.*)"', detail_content)
        if len(finds) == 0:
            CrawlerLogger.logger.warn("ger snippet str error" + detail_content)
            return None
        snippet = finds[0]
        CrawlerLogger.logger.info("snippet: " + snippet.encode('utf-8'))

        finds = re.findall(r'var msg_cdn_url = "(.*)"', detail_content)
        if len(finds) == 0:
            CrawlerLogger.logger.warn("ger snippet str error" + detail_content)
            return None
        cover = finds[0]
        CrawlerLogger.logger.info("cover: " + cover)

        finds = re.findall(r'var ct = "(.*)"', detail_content)
        if len(finds) == 0:
            CrawlerLogger.logger.warn("ger snippet str error" + detail_content)
            return None
        published_time = int(finds[0])
        CrawlerLogger.logger.info("published_time: " + str(published_time))

        content_html = etree.HTML(detail_content)
        author = util.get_text(
            content_html,
            '//em[@class="rich_media_meta rich_media_meta_text"][last()]',
            True)
        CrawlerLogger.logger.info("author: " + author)

        doc = Doc(self.source, title, detail_url, snippet, cover, author,
                  self.channel, published_time)

        CrawlerLogger.logger.info("ger latest doc succ" + str(doc))
        return doc