예제 #1
0
파일: manager.py 프로젝트: ildap/parser
 def __init__(self):
     """Open the database connection and, on success, prepare the scrapers and call start().

     When ``self.db.connect`` returns a falsy value nothing else is set up:
     the instance is left with only its ``db`` attribute.
     """
     self.db = dbconnecter()
     settings = self.data
     if not self.db.connect(settings['postgres']):
         return

     # One HTTP helper per remote site.
     self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
     self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
     self.reCompile()
     self.posterurl = settings['poster']
     self.staticpath = settings["static"]
     # The rutracker session is restored from the cookies stored in the
     # settings; no login form is posted.
     self.rutracker.setcookiejar_from_dict(settings['cookies'])
     self.start()
예제 #2
0
파일: main.py 프로젝트: ildap/gks
import datetime

from scrapy.selector import Selector
import matplotlib.pyplot as plt
import aprox
from spider import Jumper

# HTTP helper bound to the gks.ru query endpoint (Jumper comes from the local spider module).
gks_ru = Jumper(url='http://www.gks.ru/dbscripts/cbsd/dbinet.cgi', proxies=None)
# Pre-encoded form body for the request; its Qry/YearsList fields name the years 2011-2015.
data = 'rdLayoutType=Au&_Pokazateli=on&_okato=on&_grtov=on&_god=on&_period=on&a_Pokazateli=1&a_okato=2&a_grtov=3&Qry=Pokazateli%3A1921002%3Bokato%3A80000000%3Bgrtov%3A1501%3Bgod%3A2011%2C2012%2C2013%2C2014%2C2015%3BperiodryGm=Pokazateli_z%3A1%3Bokato_z%3A2%3Bgrtov_z%3A3%3Bgod_s%3A1%3Bperiod_b%3A1%3B&QryFootNotes=%3B&YearsList=2011%3B2012%3B2013%3B2014%3B2015%3B&tbl=%CF%EE%EA%E0%E7%E0%F2%FC+%F2%E0%E1%EB%E8%F6%F3'
# NOTE(review): the meaning of the None and True arguments is defined by Jumper.post, which is not visible here - confirm.
content = gks_ru.post(None,data,True)
# Offline alternative: load a previously saved response instead of querying the site.
#file = open('content.html')
#content = file.read()
# The response is decoded as cp1251 into a unicode string.
content = content.decode('cp1251')
# Commas become dots so that cell texts can be passed to float() further down.
content = content.replace(u",", u'.')

# The cells of the first row of the "OutTbl" table are read as integer years.
xpath = '//table[@class="OutTbl"]/tr[1]/td/text()'
years = [int(td.extract()) for td in Selector(text=content).xpath(xpath)]
values = []
weeks = []
print years
# Row selector and counters consumed by the row loop that follows.
xpath = '//table[@class="OutTbl"]/tr'
count = 0
# 1-based index of the <td> read from each row.
param = 2
column = 0
for tr in Selector(text=content).xpath(xpath)[1:]:
    count += 1
    xpath = u'//td[{}]/text()'.format(param)
    tdn = Selector(text=tr.extract()).xpath(xpath).extract()[0]
    if tdn != u'\xa0':
        values.append(float(tdn))
    else:
예제 #3
0
파일: manager.py 프로젝트: ildap/parser
class Manager:
    """Scraper driver that ties together a database connection and two Jumper HTTP helpers.

    Settings are loaded once, when the class body is executed, and shared
    by all instances through the ``data`` class attribute.
    """
    # Directory that contains this source file.
    path = os.path.dirname(os.path.abspath(__file__))
    # Settings parsed from data.txt (next to this file) as a Python literal.
    # The methods below read the keys 'postgres', 'poster', 'static' and 'cookies'.
    # NOTE(review): the path is joined with a backslash and the file object is never
    # closed explicitly - confirm this is only ever run on Windows.
    data = ast.literal_eval(open(path + r'\data.txt').read())

    def __init__(self):
        """Open the database connection and, on success, prepare the scrapers and start crawling.

        When ``self.db.connect`` returns a falsy value nothing else is set
        up: the instance is left with only its ``db`` attribute and
        ``start()`` is not called.
        """
        self.db = dbconnecter()
        settings = self.data
        if not self.db.connect(settings['postgres']):
            return

        # One HTTP helper per remote site.
        self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
        self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
        self.reCompile()
        self.posterurl = settings['poster']
        self.staticpath = settings["static"]
        # The rutracker session is restored from the cookies stored in the
        # settings; no login form is posted.
        self.rutracker.setcookiejar_from_dict(settings['cookies'])
        self.start()

    def start(self):
        for i in range(0, 300, 50):

            args = {'f': 2093, 'start': i}
            print 'get pos', i,
            content = self.rutracker.jump(args)

            if content is not None:
                print 'ok'
                self.parseTopics(content)
                tm = datetime.now()
                tm = tm.strftime("%d.%m.%y_%H-%M")
                self._save(self.path + r'\html\topics{}_{}.html'.format(args['start'], str(tm)), content)

    def reCompile(self):
        """Compile the regular expressions used by the parsing methods.

        Every pattern is stored as an instance attribute.  All literals are
        raw strings so that backslash sequences such as ``\\S`` reach the
        regex engine unchanged instead of relying on Python passing through
        an unrecognised string escape.
        """
        # Literal 'GB'; not used by the methods visible in this part of the file.
        self.resc = re.compile(r'GB', re.UNICODE)
        # A run of digits and dots; kinopars() converts the first match to a float.
        self.rescfl = re.compile(r'[0.0-9]+', re.UNICODE)
        # A run of non-space characters followed by characters other than
        # '/', '^' and '('; kinosearch() takes the first match as the title.
        self.rename = re.compile(r'[\S]+[^/^(]+', re.UNICODE)
        # Exactly four digits directly after '[' and not followed by a fifth digit.
        self.reyear = re.compile(r'(?<=\[)\d{4}(?!\d)', re.UNICODE)
        # Any run of digits.
        self.renum = re.compile(r'\d+', re.UNICODE)
        # digits ':' digits, e.g. an h:mm duration.
        self.retime = re.compile(r'\d+:\d+')
        # '<wbr>' tags, which kinopars() strips from the page before parsing.
        self.resub = re.compile(r'<wbr>', re.UNICODE)
        # Apostrophes, which kinopars() removes from the description text.
        self.req = re.compile(r'\'', re.UNICODE)

    def getPoster(self, id):
        """Download the poster image for a film id and write it below the static directory.

        The image is requested from ``self.posterurl + <id> + '.jpg'`` via
        ``self.kinopoisk.raw`` and written to
        ``<staticpath><id[0:2]>/<id[2:4]>/<id[4]>/<id[5]>.jpg``; missing
        directories are created.

        :param id: film identifier; it is converted with ``str()`` first.
        :raises IndexError: if the identifier is shorter than six characters.
        """
        id = str(id)
        raw = self.kinopoisk.raw(self.posterurl + id + '.jpg')
        target_dir = self.staticpath + id[0:2] + '/' + id[2:4] + '/' + id[4] + '/'
        try:
            os.stat(target_dir)
        except OSError:
            # os.stat fails with OSError when the path does not exist yet.
            os.makedirs(target_dir)
        # The context manager closes the file even if writing fails.
        with open(target_dir + id[5] + '.jpg', 'wb') as poster_file:
            poster_file.writelines(raw)

    def dbfind(self, name, year):
        where = " name like '" + name.encode('utf8') + "%' and year='" + str(year) + "'"
        fid = self.db.getID('ruparser_film', where)
        if fid:
            print 'was found in dbase id=', fid
            return fid
        else:
            args = {u'level': u'7',
                    u'first': u'yes',
                    u'from': u'forma',
                    u'result': u'adv',
                    u'm_act[from]': u'forma',
                    u'm_act[what]': u'content',
                    u'm_act[find]': name.encode('utf8'),
                    u'm_act[year]': year.encode('utf8')}
            content = self.kinopoisk.jump(args)
            # file = open('force.html')
            # content = file.read()
            if content is not None:
                return self.kinopars(name, year, content)

    def kinopars(self, name, year, content):
        content = content.decode('cp1251')
        content = content.encode('utf8')
        content = self.resub.sub('', content)

        def check(obj):
            if obj:
                return obj[0]
            else:
                return ''

        xpath = '//link[@rel="canonical"]/@href'
        id = Selector(text=content).xpath(xpath).extract()
        if id:
            id = self.renum.findall(id[0])[0]
        xpath = '//div[@class="brand_words"][@itemprop="description"]/text()'
        text = check(Selector(text=content).xpath(xpath).extract())
        text = self.req.sub('', text)

        xpath = '//span[@class="rating_ball"]/text()'
        rating = check(Selector(text=content).xpath(xpath).extract())
        if text and rating and id:
            print 'was found on kinopoisk.ru/film', id
            xpath = '//a[@class="popupBigImage"]/img/@src'
            poster = check(Selector(text=content).xpath(xpath).extract())
            if poster == 'http://st.kp.yandex.net/images/movies/poster_none.png':
                poster = u'false'
            else:
                poster = u'true'

            xpath = '//span[@class="ratingCount"]/text()'
            count = Selector(text=content).xpath(xpath).extract()

            if count:
                count = count[0]
                count = count.replace(u'\xa0', u'')
            else:
                count = 0;
            print 'rating', rating, count,
            xpath = '//td[@class="time"]/text()'
            time = Selector(text=content).xpath(xpath).extract()
            nulltime = '0:0'
            if len(time) > 1:
                time = self.retime.findall(time[1])
                if len(time) >= 1:
                    time = time[0]
                else:
                    time = nulltime

            elif len(time) == 1:
                time = self.renum.findall(time[0])
                if len(time) >= 1:
                    time = int(time[0])
                    th = time / 60
                    tm = time - (th * 60)
                    time = str(th) + ':' + str(tm)
                else:
                    time = nulltime
            else:
                time = nulltime

            print 'time', time,

            xpath = '//div[@id="block_rating"]/div[1]/div[2]/text()'
            imdb = check(Selector(text=content).xpath(xpath).extract())

            if imdb:
                imdb = float(self.rescfl.findall(imdb)[0])
            else:
                imdb = 0;
            print 'imdb:', imdb
            head = '(name,year,text,rating,count,imdb,time,kinopoiskid,poster)'
            values = (name.encode('utf8'), year, text.encode('utf8'), rating, count, imdb, time, id, poster)
            fid = self.db.insert('ruparser_film', head, values)
            return fid

    def kinosearch(self, fullname):
        try:
            name = self.rename.findall(fullname)[0]
            year = self.reyear.findall(fullname)[0]
            print 'search', name, year,
            return self.dbfind(name, year)
        except IndexError, er:
            print 'kinosearch', er
            return None