def scrapy(keywords, filetype, endpage):
    """Scrape Bing document-search results and save the new links.

    Arguments:
        endpage {int} -- total number of pages to scrape; 50 is recommended
    """
    page = 1
    bing = BingBrowser()
    driver = bing.initDriver()
    # one shared connection instead of reconnecting on every page
    mongodb = Mongodb(configure.read('mongodb'))
    while page < endpage:
        print(page)
        url = bing.generateDocQueryString(keywords, filetype, page)
        logging.info('fetching %s' % url)
        driver.get(url)
        try:
            items = bing.fetchItmes(url)
        except Exception as ex:
            logging.error('error while fetching url: %s' % ex)
            page = page + 1  # skip the failing page rather than retry it forever
            continue
        repeat = 0
        for item in items:
            title = item['title']
            link = item['link']
            doc = Doc()
            doc.title = title
            doc.source_url = link
            doc.keywords = keywords
            doc.object_key = str(uuid.uuid4())
            doc.url_hash = stringMd5(link)
            saveDocRes = doc.addDoc()
            if saveDocRes is False:  # link has been scraped before
                logging.warning('this page was already processed')
                repeat = repeat + 1
                if repeat == 10:
                    logging.warning('10 duplicates on a single page')
                    break  # the rest of the page is almost certainly stale too
        page = page + 1
        # driver.get_screenshot_as_file('test.png')
    bing.closeBrowser()
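
# For reference, a minimal sketch of the two helpers the scraper leans on.
# This is a guess at what utils.func.stringMd5 and utils.db.Doc look like,
# assuming mongoengine with a unique index on url_hash -- not the project's
# actual implementation.
import hashlib

from mongoengine import Document, IntField, NotUniqueError, StringField


def stringMd5(s):
    # hex MD5 digest of the UTF-8 encoded string, used as the dedup key
    return hashlib.md5(s.encode('utf-8')).hexdigest()


class Doc(Document):
    title = StringField()
    source_url = StringField()
    keywords = StringField()
    object_key = StringField()
    url_hash = StringField(unique=True)
    status = IntField(default=1)  # lifecycle value matched by parseFile; exact semantics assumed

    def addDoc(self):
        # save() trips the unique index for an already-seen url_hash,
        # which is how the scraping loop detects duplicates
        try:
            self.save()
            return True
        except NotUniqueError:
            return False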
import datetime
import logging

import coloredlogs
from peewee import (BooleanField, CharField, DateTimeField, FloatField,
                    IntegerField, Model, MySQLDatabase)

from configs.config import configure

coloredlogs.install()
configure.load('app.yml')
mysqlConfig = configure.read('mysql')
db = MySQLDatabase(
    'sp-blog',
    host=mysqlConfig['host'],
    user=mysqlConfig['user'],
    password=mysqlConfig['pwd'],
    port=mysqlConfig['port'],
)


class BaseModel(Model):
    class Meta:
        database = db


class Movie(BaseModel):
    title = CharField()
    filename = CharField()
    douban_praise = FloatField()
    storage_nums = FloatField()
    storage_unit = CharField()
    bdid = CharField()
    doubanid = CharField()
    down_link = CharField()
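
# A quick usage sketch for the Movie model above, using stock peewee calls;
# the sample values are invented for illustration.
if __name__ == '__main__':
    db.connect()
    db.create_tables([Movie])  # safe by default in peewee 3: skips existing tables
    Movie.create(
        title='Example Movie',
        filename='example.mkv',
        douban_praise=8.5,
        storage_nums=1.2,
        storage_unit='GB',
        bdid='bd-123',
        doubanid='db-456',
        down_link='https://example.com/example.mkv',
    )
    # query back everything rated above 8.0 on douban
    for movie in Movie.select().where(Movie.douban_praise > 8.0):
        print(movie.title, movie.down_link)
    db.close()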
import logging
import os
import uuid

import click
import coloredlogs

from configs.config import configure
from utils.Bing import BingBrowser
from utils.db import Doc, Mongodb
from utils.func import *
from utils.oss import Oss

# set up the logger
coloredlogs.install(milliseconds=True)

# initialize OSS
configure.load('app.yml')
ossConfig = configure.read('oss')
oss = Oss(ossConfig)
oss.auth()


@click.command()
@click.option('--keywords', prompt='Enter search keywords', help='search keywords')
@click.option('--filetype', prompt='File type', help='file type')
@click.option('--endpage', default=50, prompt='Total pages to scrape', help='total pages to scrape')
@print_run_time
def scrapy(keywords, filetype, endpage):
    """Scrape Bing document-search results and save the new links.

    Arguments:
        endpage {int} -- total number of pages to scrape; 50 is recommended
    """
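
# The paging loop that forms the body of scrapy() appears earlier in this
# dump; to run the command the module presumably ends with the standard
# click entry point (the script filename in the example is an assumption):
if __name__ == '__main__':
    # e.g. python scrapy.py --keywords "machine learning" --filetype pdf --endpage 50
    scrapy()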
import logging
import os
import uuid

import coloredlogs

from configs.config import configure
from utils.db import Doc, Mongodb
from utils.func import *
from utils.oss import Oss
from utils.PdfParse import Pypdf

configure.load('app.yml')
downloadConfig = configure.read('download')
downpath = downloadConfig['path']
mongodb = Mongodb(configure.read('mongodb'))

# set up the logger
coloredlogs.install(milliseconds=True)

# initialize OSS (app.yml is already loaded above)
ossConfig = configure.read('oss')
oss = Oss(ossConfig)
oss.auth()


# download links
@print_run_time
def parseFile():
    # take up to 50 docs with status == 1 (i.e. still waiting to be parsed)
    pipeline = [
        {"$match": {"status": 1}},
        {"$limit": 50},
    ]
    unParsedDocsCursor = Doc.objects().aggregate(*pipeline)
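
    # From here, a sketch of how the cursor might be consumed. The Pypdf
    # call, the object_key-based file naming, and the "status 2 = parsed"
    # convention are assumptions for illustration, not the real
    # utils.PdfParse / utils.db contract.
    for raw in unParsedDocsCursor:
        localPath = os.path.join(downpath, raw['object_key'])
        if not os.path.exists(localPath):
            logging.warning('file missing: %s' % localPath)
            continue
        try:
            text = Pypdf(localPath).getText()  # hypothetical Pypdf API
        except Exception as ex:
            logging.error('failed to parse %s: %s' % (localPath, ex))
            continue
        logging.info('parsed %s, %d characters' % (raw['title'], len(text)))
        # mark the doc as parsed (status value is an assumption)
        Doc.objects(url_hash=raw['url_hash']).update_one(set__status=2)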