Exemplo n.º 1
0
def content_handle(movie):
    """Parse a Douban movie page, create its comment table, and store comments."""
    # Extract the subject id; the page must reference exactly one.
    ids = list(
        set(re.findall('https://movie.douban.com/subject/(\\d*?)/', movie)))
    if len(ids) != 1:
        print(ids)
        raise Exception("!!!more than one!")
    _id = ids[0]
    comment_table_create(_id)
    db = DbHandle(database='comment')
    db.table = _id
    # Restrict parsing to the comment section of the page.
    section = re.findall('id="comments".*?id="paginator"', movie, re.S)[0]
    pattern = 'title="(.*?)".*?"https://www.douban.com/people/(.*?)/".*?src="(.*?)".*?<.*?comment-time.*?title="(.*?)".*?short">(.*?)<'
    matches = re.findall(pattern, section, re.S)
    # Only the first ten comments are persisted.
    for user_name, user_id, avatar, when, short in matches[:10]:
        # Upgrade the avatar URL from thumbnail to the large variant.
        pic = re.sub('/u(.*?)-.*?\\.', '/ul\\1.', avatar)
        when_clean = str(when).replace('\n', '')
        row = [user_id, user_name, when_clean, '''{}'''.format(short)]
        print(_id, user_id, pic, avatar, when_clean, '''{}'''.format(short))
        db.save(row)
Exemplo n.º 2
0
 def get_urls(self):
     """Fetch the detail page of every ranked movie not yet in `spider_movie`.

     Each response body is forwarded to ``self.movie_handle`` via a
     requests response hook.
     """
     db = DbHandle(database='spider')
     # Fix: the original mutated ``self.db.table`` although every query
     # below names its table explicitly on the local handle.
     db.table = 'ranking'
     _id_list = db.get(table='ranking', _range='movie_id')
     id_list = list(set(i[0] for i in _id_list))
     done_ = db.get(table='spider_movie', _range='id')
     print(len(id_list))
     for i in done_:
         # Guard: ``remove`` raises ValueError for ids absent from the ranking.
         if i[0] in id_list:
             id_list.remove(i[0])
     print(len(id_list))
     # Movie detail pages (comment pages are handled elsewhere).
     movies_urls = [
         'https://movie.douban.com/subject/%s/' % _id for _id in id_list
     ]
     for m, i in zip(id_list, movies_urls):
         # Bind ``m`` as a default argument so the hook keeps this
         # iteration's id rather than the loop's final value.
         requests.get(
             i,
             hooks={
                 'response':
                 lambda r, *args, m=m, **kwargs: self.movie_handle(m, r.text)
             })
Exemplo n.º 3
0
def get_urls():
    """Build celebrity page URLs from movie details and crawl them."""
    db = DbHandle(database='ranking')
    db.table = 'movie'
    actor_ids = db.get(_range='details')
    urls = []
    for ids in actor_ids:
        if not ids[0]:
            continue
        id_list = re.findall('celebrity/(\\d*?)/', str(ids[0]))
        urls.extend('https://movie.douban.com/celebrity/%s/' % _id
                    for _id in id_list)
    # Known-bad celebrity page; fix: guard the removal so a missing
    # entry no longer raises ValueError.
    bad = 'https://movie.douban.com/celebrity/1376098/'
    if bad in urls:
        urls.remove(bad)
    request = GUrlHandle(content_handle=content_handle, max_=400)
    request.get_contents(urls)
Exemplo n.º 4
0
def db_save(name, data):
    """Persist a (name, data) pair into the default database's `init` table."""
    handle = DbHandle()
    handle.table = 'init'
    try:
        handle.save([name, data])
    except Exception as error:
        # Best-effort insert: report the failure instead of propagating it.
        print(error)
Exemplo n.º 5
0
def get_urls():
    """Fetch every movie subject page listed in the `movie` table."""
    db = DbHandle()
    rows = db.get(table='movie', _range='id')
    # Each row is a 1-tuple holding the movie id.
    urls = ['https://movie.douban.com/subject/{}/'.format(*row) for row in rows]
    session = GUrlHandle(content_handle=content_handle, max_=200, use_id=True)
    session.get_contents(urls)
Exemplo n.º 6
0
def get_urls():
    """Fetch the comment listing page of every movie in the `movie` table."""
    db = DbHandle()
    db.table = 'movie'
    rows = db.get(_range='id')
    urls = [
        'https://movie.douban.com/subject/%s/comments?status=P' % row[0]
        for row in rows
    ]
    request = GUrlHandle(content_handle=content_handle)
    request.get_contents(urls)
Exemplo n.º 7
0
def create_table():
    """Create the `person` table in the default database.

    Fix: the original SQL had a trailing comma after the last column
    (`introduce` TEXT,) which is a syntax error in MySQL.
    """
    sql = '''CREATE TABLE `person`(
        `id` CHAR(15) PRIMARY KEY NOT NULL ,
        `name` CHAR(50) NOT NULL ,
        `sex` VARCHAR(5),
        `constellation` VARCHAR(10),
        `birthday` VARCHAR(30),
        `birthplace` VARCHAR(50) ,
        `profession` VARCHAR(60),
        `imdb` VARCHAR(15),
        `introduce` TEXT
        )'''
    db = DbHandle()
    db.create_table(sql)
Exemplo n.º 8
0
def comment_table_create(_id, database=None):
    """Create a per-movie comment table named after the movie id.

    Parameters:
        _id: movie id; used verbatim as the table name.
        database: optional database name; defaults to 'comment'.
            (Fix: the original accepted this parameter but ignored it.)
    """
    db = DbHandle(database=database or 'comment')
    sql = '''CREATE TABLE `%s`(
        `user_id` VARCHAR(30),
        `user_name` VARCHAR(30),
        `date` VARCHAR(30),
        `comment` TEXT
        )''' % _id
    try:
        db.execute(query=sql)
    except Exception as e:
        # The table may already exist; log and carry on.
        print(e)
Exemplo n.º 9
0
 def __init__(self):
     """Set up the database handle, its cursor, and a user-agent factory."""
     handle = DbHandle()
     self.db = handle
     self.cursor = handle.cursor
     self.fake = UserAgent()
Exemplo n.º 10
0
class Main(object):
    """Crawl Douban movie detail pages and store parsed rows in `spider_movie`."""

    def __init__(self):
        self.db = DbHandle()
        self.cursor = self.db.cursor
        self.fake = UserAgent()

    def get_urls(self):
        """Fetch the detail page of every ranked movie not yet stored."""
        db = DbHandle(database='spider')
        # Fix: the original mutated ``self.db.table`` although every query
        # below names its table explicitly on the local handle.
        db.table = 'ranking'
        _id_list = db.get(table='ranking', _range='movie_id')
        id_list = list(set(i[0] for i in _id_list))
        done_ = db.get(table='spider_movie', _range='id')
        print(len(id_list))
        for i in done_:
            # Guard: ``remove`` raises ValueError for ids absent from the ranking.
            if i[0] in id_list:
                id_list.remove(i[0])
        print(len(id_list))
        # Movie detail pages (comment pages are handled elsewhere).
        movies_urls = [
            'https://movie.douban.com/subject/%s/' % _id for _id in id_list
        ]
        for m, i in zip(id_list, movies_urls):
            # Bind ``m`` as a default so the hook keeps this iteration's id.
            requests.get(
                i,
                hooks={
                    'response':
                    lambda r, *args, m=m, **kwargs: self.movie_handle(m, r.text)
                })

    def movie_handle(self, _id, t):
        """Parse one movie page body ``t`` and insert a `spider_movie` row.

        Extraction is best-effort: each optional field falls back to a
        default when its pattern does not match. Poster and still images
        are downloaded next to SOURCE under ``movie/``.
        """
        movie_id = _id
        print(movie_id)
        self.cursor.execute('USE `spider`')
        response = etree.HTML(t)
        try:
            name = response.xpath(
                '//*[@id="content"]/h1/span[1]/text()')[0].split()[0]
        except Exception:
            # No title span: not a regular movie page; skip it.
            # (Fix: bare ``except:`` clauses narrowed to ``except Exception``.)
            print(response.xpath('//*[@id="content"]/h1/text()'))
            return
        try:
            rank = re.findall('ratingValue": "(.*?)"', t, re.S)[0]
        except Exception:
            rank = '0.0'
        if rank == '':
            rank = '0.0'
        try:
            star_num = re.findall('ratingCount": "(.*?)"', t, re.S)[0]
        except Exception:
            star_num = None
        try:
            year = re.findall('datePublished": "(.*?)"', t, re.S)[0]
        except Exception:
            year = None
        try:
            # Re-serialize the genre list without escaping non-ASCII text.
            _class = json.dumps(json.loads(
                re.findall(r'genre": (\[.*?\])', t, re.S)[0]),
                                ensure_ascii=False)
        except Exception:
            _class = None
        try:
            countries = re.findall('制片国家/地区:</span> (.*?)<', t)[0]
        except Exception:
            countries = None
        try:
            long = response.xpath(
                '//*[@id="info"]/span[@property="v:runtime"]/text()')[0]
        except Exception:
            long = None
        poster = response.xpath('//*[@id="mainpic"]/a/img/@src')[0]
        pic = movie_id + '.jpg'
        pic_path = os.path.join('movie', pic)
        with open(os.path.join(SOURCE, pic_path), 'wb') as f:
            # Fix: the UA dict was passed positionally, so requests treated
            # it as query ``params``; it must be passed as ``headers=``.
            f.write(
                requests.get(poster, headers={
                    'User-Agent': self.fake.random
                }).content)
        poster = pic_path
        review = None
        # Prefer the full ("all hidden") summary; fall back to the short one.
        contents = response.xpath('//span[@class="all hidden"]/text()')
        if not contents:
            contents = response.xpath('//span[@property="v:summary"]/text()')
        details = ''.join(i.strip() for i in contents)
        # Collect the movie stills.
        images = response.xpath('//*[@id="related-pic"]/ul/li')
        image = []
        for image_ in images:
            image.extend(image_.xpath('./a/img/@src'))
        _image = []
        for n, i in enumerate(image):
            pic = movie_id + '_' + str(n) + '.jpg'
            pic_path = os.path.join('movie', pic)
            with open(os.path.join(SOURCE, pic_path), 'wb') as f:
                f.write(
                    requests.get(i, headers={
                        'User-Agent': self.fake.random
                    }).content)
            _image.append(pic_path.replace('\\', '/'))
        image = json.dumps(_image)
        # Insert only when the movie is not already present.
        if not self.cursor.execute(
                'SELECT `movie_name` FROM `spider_movie` WHERE id=%s', _id):
            row = [
                name, long, rank, star_num, year, _class, countries, _id,
                review, details, poster, image
            ]
            self.cursor.execute(
                '''
            INSERT INTO `spider_movie`(
            `movie_name`,
            `long`,
            `rank`,
            `star_num`,
            `year`,
            `class`,
            `countries`,
            `id`,
            `review`,
            `details`,
            `poster`,
            `image`
            ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ''', row)
            self.db.commit()
            print(row)
Exemplo n.º 11
0
import requests
from scrapy.handle_db.DBApi import DbHandle
from scrapy.requests.g_handle import GUrlHandle
from lxml import etree

# Shared handle into the default database, used by get_url() below.
db = DbHandle()

# Example awards page URL, handy for exercising get_awards() manually.
url = 'https://movie.douban.com/subject/1292052/awards/'


def get_awards(url):
    """Download one awards page (decoded as UTF-8) and pass it to awards_handle."""
    session = requests.session()
    agent = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36')
    session.headers['User-Agent'] = agent
    reply = session.get(url)
    reply.encoding = 'utf-8'
    awards_handle(reply.text)


def get_url():
    """Fetch awards pages for every movie, routing each through awards_handle."""
    rows = db.get(table='movie', _range='(`id`, `awards_url`)')
    for row in rows:
        get_content(row[0], hook=awards_handle)


def awards_handle(text):
    """Locate the award blocks in an awards-page body.

    NOTE(review): this function appears truncated by the extraction —
    ``num`` and ``data_list`` are built but never used; confirm against
    the full original source before relying on it.
    """
    content = etree.HTML(text)
    # Each div.awards wraps the awards for one ceremony/year.
    d = content.xpath("//div[@class='awards']")
    num = len(d)
    data_list = []
Exemplo n.º 12
0
def content_handle(info):
    """Parse a Douban celebrity page and save the person into `movie_person`.

    Extraction is best-effort: each optional field falls back to ``None``
    when its pattern does not match. (Fixes: bare ``except:`` clauses
    narrowed, six duplicated try/except blocks deduplicated into a helper,
    and the two identical dict constructions merged.)
    """
    _id = re.findall(
        r'id="headline".*?rel="nofollow".*?https://movie.douban.com/celebrity/(\d*?)/',
        info, re.S)
    data = [_id[0]]
    name = re.findall(r'<div id="content">.*?<h1>(.+)</h1>', info, re.S)[0]

    def _first(pattern, label):
        # Return the first match of `pattern` in the page, or None
        # (logging the same notice the original printed on a miss).
        try:
            return re.findall(pattern, info)[0]
        except Exception:
            print('Can not find ' + label)
            return None

    sex = _first(r'<span>性别<.+>:\s*(.*)\s*', 'actor sex')
    constellation = _first(r'<span>星座<.+>:\s*(.*)\s*', 'constellation')
    try:
        birthday = re.findall(r'<span>出生日期<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        # Deceased people list a birth/death span instead of a birth date.
        birthday = _first(r'<span>生卒日期<.+>:\s*(.*)\s*', 'birthday')
    birthplace = _first(r'<span>出生地<.+>:\s*(.*)\s*', 'birthplace')
    profession = _first(r'<span>职业<.+>:\s*(.*)\s*', 'profession')
    imdb_number = _first(r'<span>imdb编号<.+>:\s*.+>(.+)</a>', 'IMDB编号')
    # Prefer the full ("all hidden") introduction; fall back to the short one.
    all_introduce = re.findall(r'<span class="all hidden">\s*(.+)<', info)
    if all_introduce:
        introduce = all_introduce[0]
    else:
        introduce = re.findall(
            r'<h2>\s*影人简介\s*.+\s*<.+>\s*</div>\s*<div class="bd">\s*(.+)\s*',
            info)[0]
    _dict = {
        "姓名": name,
        "性别": sex,
        "星座": constellation,
        "出生日期": birthday,
        "出生地": birthplace,
        "职业": profession,
        "imdb编号": imdb_number,
        "简介": introduce,
    }
    # Row order follows insertion order of _dict (id first).
    data.extend(_dict.values())
    # An introduction of '</div>' means the page carried no real text.
    if data[-1] == '</div>':
        data[-1] = None
    db = DbHandle()
    db.table = 'movie_person'
    if not db.get_by_id(_id=int(_id[0])):
        print(data)
        db.save(data)
    else:
        print('Already have this')
Exemplo n.º 13
0
 def __init__(self):
     """Bind a handle to the `spider` database plus its cursor and a UA factory."""
     spider_db = DbHandle(database='spider')
     self.db = spider_db
     self.cursor = spider_db.cursor
     self.fake = UserAgent()
Exemplo n.º 14
0
class Main(object):
    """Crawl Douban comment pages and store them in per-movie comment tables."""

    def __init__(self):
        self.db = DbHandle(database='spider')
        self.cursor = self.db.cursor
        self.fake = UserAgent()

    def get_urls(self):
        """Fetch the comment page of every ranked movie not yet crawled."""
        self.db.table = 'ranking'
        _id_list = self.db.get(_range='movie_id')
        id_list = list(set(i[0] for i in _id_list))
        self.cursor.execute('use `spider_comment`')
        self.cursor.execute('show tables')
        tables = self.cursor.fetchall()
        for i in tables:
            try:
                id_list.remove(i[0])
            except ValueError:
                # NOTE(review): a table with no matching ranking entry is
                # re-queued here — confirm this re-crawl is intentional.
                id_list.append(i[0])
        # Comment listing pages.
        comments_urls = [
            'https://movie.douban.com/subject/%s/comments?status=P' % _id
            for _id in id_list
        ]
        print(comments_urls)
        # Movie detail pages (built but not fetched by this method).
        movies_urls = [
            'https://movie.douban.com/subject/%s/' % _id for _id in id_list
        ]
        for m, i in zip(id_list, comments_urls):
            # Bind ``m`` as a default so the hook keeps this iteration's id.
            requests.get(
                i,
                hooks={
                    'response':
                    lambda r, *args, m=m, **kwargs: self.comments_handle(m, r.text)
                })

    def comments_handle(self, _id, text):
        """Parse comments out of ``text`` and insert them into table ``_id``.

        Creates the per-movie table on first sight and downloads each
        commenter's avatar next to SOURCE under ``user/<user_id>.jpg``.
        """
        self.cursor.execute('USE `spider_comment`')
        movie_id = _id
        self.cursor.execute('SHOW TABLES')
        all_table = self.cursor.fetchall()
        if (movie_id, ) not in all_table:
            try:
                s = 'CREATE TABLE ' + '`' + movie_id + '`'
                self.cursor.execute('''
                        {}(
                        `user_id` VARCHAR(30),
                        `user_name` VARCHAR(30),
                        `comment_time` DATETIME,
                        `comment` TEXT,
                        `image` VARCHAR(50)
                        )
                        '''.format(s))
            except Exception as e:
                # Creation may race with another worker; log and continue.
                print(e)

            comments = re.findall(
                'class="avatar".*?title="(.*?)".*?"https://www.douban.com/people/(.*?)/".*?src="(.*?)".*?<.*?comment-time.*?title="(.*?)".*?short">(.*?)<',
                text, re.S)
            for comment_ in comments:
                user_id = comment_[1]
                user_name = comment_[0]
                comm_time = str(comment_[3]).replace('\n', '')
                comm = comment_[4]
                # Upgrade the avatar URL from thumbnail to the large variant.
                image = re.sub('/u(.*?)-.*?\\.', '/ul\\1.', comment_[2])
                pic = user_id + '.jpg'
                pic_path = os.path.join('user', pic)
                with open(os.path.join(SOURCE, pic_path), 'wb') as f:
                    # Fix: the UA dict was passed positionally, so requests
                    # treated it as query ``params``; it must be ``headers=``.
                    f.write(
                        requests.get(image, headers={
                            'User-Agent': self.fake.random
                        }).content)
                image = pic_path
                # Table names cannot be parameterized; movie_id comes from
                # the ranking table's regex-extracted numeric ids.
                sql = 'INSERT INTO ' + '`' + movie_id + '`'
                self.cursor.execute(
                    sql +
                    '(`user_id`, `user_name`, `comment_time`, `comment`,`image`) VALUES(%s, %s, %s, %s, %s)',
                    [user_id, user_name, comm_time, comm, image])
                self.db.commit()
                print(user_id, user_name, comm_time, comm, image)
Exemplo n.º 15
0
def content_handle(movie_id, content):
    """Link every credited celebrity on a movie page into `movie_cast`.

    Parameters:
        movie_id: Douban movie id the page belongs to.
        content: raw HTML of the movie's subject page.
    """
    # Restrict the celebrity search to the ld+json metadata block.
    block = re.findall('type="application/ld\\+json".*?datePublished', content, re.S)
    # print(block)
    actors = re.findall('celebrity/(\\d*?)/', block[0])
    db = DbHandle()
    db.table = 'movie_cast'
    for actor in actors:
        try:
            # Insert only pairs that are not already linked.
            # NOTE(review): the filter is string-built; ids are regex-extracted
            # digits, but confirm before exposing to other inputs.
            if not db.get(_filter='where person_id={} and movie_id={}'.format(actor, movie_id)):
                print(actor)
                db.save(data=[movie_id, actor], _range='movie_id, person_id')
            else:
                print(movie_id, actor, 'Already have this connect')
        except Exception as e:
            # Hard-coded workaround mapping celebrity 1376098 to 1250852.
            # (Original comment, translated: "Kirimoto Takuya, you're ruthless")
            if actor == '1376098':
                db.save(data=[movie_id, '1250852'], _range='movie_id, person_id')
                continue
            # Recovery path: drop the stale handle, fetch the missing person
            # via ``extra``, then retry the insert on a fresh handle.
            db.close()
            print(e)


            extra(actor)
            print(actor)
            db = DbHandle()
            db.table = 'movie_cast'
            db.save(data=[movie_id, actor], _range='movie_id, person_id')