Example #1
import logging
import uuid

from configs.config import configure
from utils.Bing import BingBrowser
from utils.db import Doc, Mongodb
from utils.func import stringMd5


def scrapy(keywords, filetype, endpage):
    """Scrape Bing document-search results page by page and queue new links.

    Arguments:
        keywords {str} -- search keywords
        filetype {str} -- file type to search for
        endpage {int} -- total number of pages to scrape, 50 recommended
    """
    page = 1
    bing = BingBrowser()
    driver = bing.initDriver()
    while page < endpage:
        print(page)
        url = bing.generateDocQueryString(keywords, filetype, page)
        logging.info('Scraping %s' % url)
        driver.get(url)
        try:
            items = bing.fetchItmes(url)
        except Exception as ex:
            logging.error('Error while scraping %s: %s' % (url, ex))
            page += 1  # move on; retrying the same page would loop forever
            continue
        mongodb = Mongodb(configure.read('mongodb'))
        repeat = 0
        for item in items:
            title = item['title']
            link = item['link']
            doc = Doc()
            doc.title = title
            doc.source_url = link
            doc.keywords = keywords
            doc.object_key = str(uuid.uuid4())
            doc.url_hash = stringMd5(link)
            saveDocRes = doc.addDoc()
            if not saveDocRes:
                # link has already been scraped
                logging.warning('This page has already been processed')
                repeat += 1
                if repeat == 10:
                    logging.warning('10 duplicate entries on a single page')
                continue
        page += 1
    # driver.get_screenshot_as_file('test.png')
    bing.closeBrowser()
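
`stringMd5` comes from `utils.func`, which isn't shown in these examples. A minimal sketch, assuming it simply returns the hex MD5 digest of a string (used above as a deduplication key for links):

import hashlib


def stringMd5(text):
    # Hypothetical helper: stable hex MD5 digest of a UTF-8 string.
    return hashlib.md5(text.encode('utf-8')).hexdigest()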
Example #2
import coloredlogs
from peewee import MySQLDatabase, CharField, FloatField, Model
from configs.config import configure

coloredlogs.install()
configure.load('app.yml')
mysqlConfig = configure.read('mysql')

db = MySQLDatabase('sp-blog',
                   host=mysqlConfig['host'],
                   user=mysqlConfig['user'],
                   password=mysqlConfig['pwd'],
                   port=mysqlConfig['port'])


class BaseModel(Model):
    class Meta:
        database = db


class Movie(BaseModel):
    title = CharField()
    filename = CharField()
    douban_praise = FloatField()
    storage_nums = FloatField()
    storage_unit = CharField()
    bdid = CharField()
    doubanid = CharField()
    down_link = CharField()
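
A minimal usage sketch for the model above, assuming the `sp-blog` database configured in `app.yml` is reachable; the sample row values are placeholders:

db.connect()
db.create_tables([Movie])  # creates the movie table if it doesn't exist

movie = Movie.create(title='Example Movie',
                     filename='example.mkv',
                     douban_praise=8.5,
                     storage_nums=1.4,
                     storage_unit='GB',
                     bdid='bd-0001',
                     doubanid='1234567',
                     down_link='magnet:?xt=example')
print(Movie.select().where(Movie.douban_praise > 8.0).count())
db.close()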
Example #3
import coloredlogs
import logging
from utils.oss import Oss
from configs.config import configure
import uuid
from utils.Bing import BingBrowser
from utils.db import Doc, Mongodb
from utils.func import *
import os
import click
# Create the logger
coloredlogs.install(milliseconds=True)

# Initialize OSS
configure.load('app.yml')
ossConfig = configure.read('oss')
oss = Oss(ossConfig)
oss.auth()


@click.command()
@click.option('--keywords', prompt='Enter search keywords', help='Search keywords')
@click.option('--filetype', prompt='File type', help='File type to search for')
@click.option('--endpage', default=50, prompt='Total pages to scrape', help='Total number of pages to scrape')
@print_run_time
def scrapy(keywords, filetype, endpage):
    """[summary]

    Arguments:
        endpage {[type]} -- 抓取总页数,推荐50
    """
Example #4
from utils.PdfParse import Pypdf
from configs.config import configure
import uuid
import coloredlogs
import logging
from utils.db import Doc, Mongodb
from utils.func import *
from utils.oss import Oss
import os

configure.load('app.yml')
downloadConfig = configure.read('download')
downpath = downloadConfig['path']
mongodb = Mongodb(configure.read('mongodb'))
# Create the logger
coloredlogs.install(milliseconds=True)

# Initialize OSS
ossConfig = configure.read('oss')
oss = Oss(ossConfig)
oss.auth()
# Download links
@print_run_time
def parseFile():
    pipeline = [
        {"$match": {"status": 1}},  # only docs still waiting to be parsed
        {"$limit": 50}              # work through the backlog in batches of 50
    ]
    unparsedDocsCursor = Doc.objects().aggregate(*pipeline)
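
For reference, the aggregation above is equivalent to this raw pymongo query; the connection string and the database/collection names are assumptions (the real ones live in `app.yml` and the `Doc` model):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # assumed connection string
cursor = client['spider']['doc'].aggregate([
    {'$match': {'status': 1}},  # status 1: presumably downloaded, not yet parsed
    {'$limit': 50},
])
for raw_doc in cursor:
    print(raw_doc['_id'])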